feat(api): capture POST + upload + SSRF-safe URL fetch

safe_fetch.js validates URLs before fetch: rejects non-http(s), literal
or DNS-resolved loopback / RFC1918 / link-local / CGNAT / metadata
addresses; follows redirects manually with the same checks on each hop.
Test fixtures gate the check with VOID_INGEST_ALLOW_PRIVATE for offline
fixtures that hit 127.0.0.1.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-01 03:42:54 +10:00
parent eceebd2947
commit afc20712cb
6 changed files with 278 additions and 1 deletions

View File

@@ -21,6 +21,7 @@ import { router as pendingChangesRouter } from './routes/pending_changes.js';
import { router as auditRouter } from './routes/audit.js';
import { router as searchRouter } from './routes/search.js';
import { router as jobsRouter } from './routes/jobs.js';
import { router as captureRouter } from './routes/capture.js';
export function mountApi(app) {
const api = Router();
@@ -49,6 +50,7 @@ export function mountApi(app) {
api.use('/audit', auditRouter);
api.use('/search', searchRouter);
api.use('/jobs', jobsRouter);
api.use('/capture', captureRouter);
api.use('/:entity_type/:entity_id/tags', tagsByEntityRouter);
api.use((_req, _res, next) => next(new NotFoundError('route not found')));

79
lib/api/routes/capture.js Normal file
View File

@@ -0,0 +1,79 @@
import { Router } from 'express';
import { z } from 'zod';
import crypto from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import multer from 'multer';
import * as queue from '../../jobs/queue.js';
import { pool } from '../../db/pool.js';
import { validate } from '../validate.js';
import { requireWrite } from '../cap.js';
import { asyncWrap } from '../errors.js';
/**
 * Request body schema for `POST /capture`.
 *
 * - `space_id`: uuid of the space the capture belongs to.
 * - `url`: absolute http(s) URL to ingest. `z.string().url()` by itself accepts
 *   any scheme (`file:`, `ftp:`, `javascript:` ...); those can never be fetched
 *   by the ingest worker, so they are rejected here instead of being enqueued
 *   and failing later.
 * - `hint`: optional caller-supplied metadata (project, title, tags).
 */
const captureBody = z.object({
  space_id: z.string().uuid(),
  url: z.string().url().refine(
    (value) => /^https?:\/\//i.test(value),
    { message: 'url must use http or https' }
  ),
  hint: z.object({
    project_id: z.string().uuid().optional(),
    title: z.string().optional(),
    tags: z.array(z.string()).optional()
  }).optional()
});
// Largest multipart file multer will accept (100 MiB).
const MAX_UPLOAD_BYTES = 100 * 1024 * 1024;

// Spool directory for incoming uploads; overridable through the UPLOAD_TMP env
// var, otherwise a fixed folder under the OS temp dir. Created at module load.
const UPLOAD_TMP = process.env.UPLOAD_TMP || path.join(os.tmpdir(), 'void-uploads');
fs.mkdirSync(UPLOAD_TMP, { recursive: true });

// Multer instance shared by the upload route: files are written to UPLOAD_TMP.
const upload = multer({
  dest: UPLOAD_TMP,
  limits: { fileSize: MAX_UPLOAD_BYTES }
});
/**
 * Derive the idempotency key for a (space, url) pair.
 *
 * The two parts are joined with a NUL byte before hashing so that
 * ("ab", "c") and ("a", "bc") cannot collide.
 *
 * @param {string} space_id
 * @param {string} url
 * @returns {string} lowercase hex SHA-256 digest (64 chars)
 */
function key(space_id, url) {
  const hasher = crypto.createHash('sha256');
  hasher.update(`${space_id}\x00${url}`);
  return hasher.digest('hex');
}
export const router = Router();

/**
 * POST / — request ingestion of a URL.
 *
 * Always answers 202. If a ref already exists for this (space, url) pair the
 * response carries its `ref_id` and a null `job_id`; otherwise an `ingest.url`
 * job is enqueued and its id returned. `idempotency_key` is included either way.
 *
 * NOTE(review): the schema accepts `hint`, but it is not forwarded to the job
 * payload here — confirm whether that is intentional.
 */
async function handleCaptureUrl(req, res) {
  const { space_id, url } = req.body;
  const idem = key(space_id, url);

  const lookupResult = await pool.query(
    `SELECT id FROM refs WHERE source_kind='url' AND external_id=$1 LIMIT 1`,
    [idem]
  );
  const existing = lookupResult.rows[0];

  if (!existing) {
    const job_id = await queue.enqueue('ingest.url', { space_id, url });
    res.status(202).json({ job_id, idempotency_key: idem });
    return;
  }

  // Already captured: short-circuit without queueing another job.
  return res.status(202).json({
    job_id: null,
    idempotency_key: idem,
    ref_id: existing.id
  });
}

router.post(
  '/',
  requireWrite('ref'),
  validate({ body: captureBody }),
  asyncWrap(handleCaptureUrl)
);
// Same space id format the JSON capture route requires.
const uploadSpaceId = z.string().uuid();

/**
 * Best-effort removal of a multer temp file that will not be handed to a worker.
 * Without this, every rejected upload would stay in the spool directory.
 */
async function discardUpload(file) {
  if (!file?.path) return;
  try {
    await fs.promises.rm(file.path, { force: true });
  } catch {
    // Deliberately ignored: failing to clean up must not mask the response
    // (or the original error) the caller is about to receive.
  }
}

/**
 * Parse the optional multipart `meta` field.
 * Malformed JSON, or JSON that is not a plain object, yields `{}` — the field
 * is treated as best-effort and never fails the upload.
 */
function parseUploadMeta(raw) {
  if (!raw) return {};
  try {
    const parsed = JSON.parse(raw);
    const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    return {};
  }
}

/**
 * POST /upload — accept a multipart file (`file`) plus `space_id` and optional
 * JSON `meta`, then enqueue an `ingest.blob` job pointing at the spooled file.
 *
 * Responds 400 `validation_failed` when the file or a valid `space_id` is
 * missing, 202 `{ job_id }` otherwise. The temp file is removed whenever the
 * request does not result in an enqueued job.
 */
router.post('/upload',
  requireWrite('ref'),
  upload.single('file'),
  asyncWrap(async (req, res) => {
    if (!req.file) {
      return res.status(400).json({ error: { code: 'validation_failed', message: 'file required' } });
    }

    const space_id = req.body.space_id;
    if (!space_id) {
      await discardUpload(req.file);
      return res.status(400).json({ error: { code: 'validation_failed', message: 'space_id required' } });
    }
    if (!uploadSpaceId.safeParse(space_id).success) {
      await discardUpload(req.file);
      return res.status(400).json({ error: { code: 'validation_failed', message: 'space_id must be a uuid' } });
    }

    const meta = parseUploadMeta(req.body.meta);

    let job_id;
    try {
      job_id = await queue.enqueue('ingest.blob', {
        space_id,
        tmp_path: req.file.path,
        filename: req.file.originalname,
        content_type: req.file.mimetype,
        meta
      });
    } catch (err) {
      // No job owns the file, so nobody else would ever delete it.
      await discardUpload(req.file);
      throw err;
    }

    res.status(202).json({ job_id });
  })
);

89
lib/ingest/safe_fetch.js Normal file
View File

@@ -0,0 +1,89 @@
// Wraps fetch with SSRF mitigations:
// - http/https only
// - DNS-resolve host and reject loopback/RFC1918/link-local/CGNAT/zero
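// - NOTE: the resolved IP is NOT pinned into the request yet. fetch() does its
// own DNS lookup after our check, so a rebind between resolve and connect
// can still reach an internal address (TOCTOU). Closing that gap needs a
// custom agent/dispatcher that connects to the already-validated IP.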
// - Follow redirects manually with the same validation on each hop
//
// Defaults can be loosened via VOID_INGEST_ALLOW_PRIVATE=true (for dev/test
// against fixtures that hit 127.0.0.1).
import { lookup } from 'node:dns/promises';
import net from 'node:net';
import { Agent } from 'node:https';
// IPv4 networks that must never be fetched, as [network address, prefix length]
// pairs. Besides the private/loopback ranges this also covers the other blocks
// that are not globally routable unicast space.
const BLOCK_V4 = [
  ['0.0.0.0', 8],       // "this network" / unspecified
  ['10.0.0.0', 8],      // RFC 1918 private
  ['100.64.0.0', 10],   // CGNAT shared address space
  ['127.0.0.0', 8],     // loopback
  ['169.254.0.0', 16],  // link-local, includes cloud metadata 169.254.169.254
  ['172.16.0.0', 12],   // RFC 1918 private
  ['192.168.0.0', 16],  // RFC 1918 private
  ['198.18.0.0', 15],   // network benchmarking, commonly used internally
  ['224.0.0.0', 4],     // multicast
  ['240.0.0.0', 4]      // reserved, includes limited broadcast 255.255.255.255
];
/**
 * Pack a dotted-quad IPv4 string into an unsigned 32-bit integer.
 *
 * @param {string} ip e.g. "192.168.0.1"
 * @returns {number} value in [0, 2^32)
 */
function ipv4ToInt(ip) {
  let packed = 0;
  for (const octet of ip.split('.')) {
    packed = (packed << 8) + Number(octet);
  }
  // `<<` works on signed 32-bit ints; `>>> 0` reinterprets the result as unsigned.
  return packed >>> 0;
}
/**
 * Test whether an IPv4 address falls inside a CIDR block.
 *
 * @param {string} ip dotted-quad address to test
 * @param {[string, number]} cidr `[network address, prefix length]`
 * @returns {boolean}
 */
function inV4Cidr(ip, [cidrIp, bits]) {
  // A /0 matches everything. It is special-cased because JS shift counts wrap
  // modulo 32, so `<< 32` would not produce an all-zero mask.
  if (bits === 0) return true;
  const mask = (0xffffffff << (32 - bits)) >>> 0;
  // Addresses share the prefix exactly when they differ in no masked bit.
  const differing = ipv4ToInt(ip) ^ ipv4ToInt(cidrIp);
  return (differing & mask) === 0;
}
/**
 * Expand a textual IPv6 address into its eight 16-bit groups.
 * The caller must already have validated `addr` with `net.isIPv6`.
 *
 * @param {string} addr
 * @returns {number[]} eight integers in [0, 0xffff]
 */
function expandIPv6(addr) {
  // Drop any zone id ("fe80::1%eth0") and normalise case.
  let text = addr.toLowerCase().split('%')[0];

  // Rewrite a trailing dotted quad ("::ffff:127.0.0.1") as two hex groups.
  const lastColon = text.lastIndexOf(':');
  const tail = text.slice(lastColon + 1);
  if (tail.includes('.')) {
    const [a, b, c, d] = tail.split('.').map(Number);
    const hi = ((a << 8) | b).toString(16);
    const lo = ((c << 8) | d).toString(16);
    text = `${text.slice(0, lastColon + 1)}${hi}:${lo}`;
  }

  // Re-inflate the "::" shorthand into the zero groups it stands for.
  const [head, rest] = text.split('::');
  const left = head ? head.split(':') : [];
  const right = rest ? rest.split(':') : [];
  const gap = rest === undefined ? 0 : Math.max(0, 8 - left.length - right.length);
  return [...left, ...new Array(gap).fill('0'), ...right].map((g) => Number.parseInt(g, 16));
}

/**
 * Decide whether an IP address literal must not be fetched.
 *
 * - Always `false` when VOID_INGEST_ALLOW_PRIVATE is exactly "true".
 * - IPv4: blocked when inside any BLOCK_V4 range.
 * - IPv6: parsed numerically, so every spelling of an address is treated alike.
 *   IPv4-mapped (::ffff:0:0/96) and NAT64 (64:ff9b::/96) addresses are judged
 *   by their embedded IPv4 address. Anything else is blocked unless it lies in
 *   global unicast space (2000::/3) — which rejects loopback, unspecified,
 *   ULA fc00::/7, the whole link-local fe80::/10 range and multicast.
 * - Strings that are not IP literals return `false`.
 *
 * @param {string} addr
 * @returns {boolean}
 */
function isBlockedAddr(addr) {
  if (process.env.VOID_INGEST_ALLOW_PRIVATE === 'true') return false;

  const isBlockedV4 = (v4) => BLOCK_V4.some((cidr) => inV4Cidr(v4, cidr));
  if (net.isIPv4(addr)) return isBlockedV4(addr);
  if (!net.isIPv6(addr)) return false;

  const g = expandIPv6(addr);
  const isV4Mapped = g.slice(0, 5).every((x) => x === 0) && g[5] === 0xffff;
  const isNat64 = g[0] === 0x64 && g[1] === 0xff9b && g.slice(2, 6).every((x) => x === 0);
  if (isV4Mapped || isNat64) {
    // The last 32 bits carry the IPv4 address that will actually be contacted.
    return isBlockedV4(`${g[6] >> 8}.${g[6] & 0xff}.${g[7] >> 8}.${g[7] & 0xff}`);
  }

  // Top three bits 001 => 2000::/3, the only globally routable unicast block.
  return (g[0] & 0xe000) !== 0x2000;
}
/**
 * Error raised when a URL is refused or a redirect chain is invalid.
 * `code` is a short machine-readable reason (e.g. 'scheme', 'blocked_addr').
 */
export class SafeFetchError extends Error {
  /**
   * @param {string} message human-readable description
   * @param {string} code machine-readable reason
   */
  constructor(message, code) {
    super(message);
    // Without this, stack traces and logs show a generic "Error".
    this.name = 'SafeFetchError';
    this.code = code;
  }
}
/**
 * Resolve `host` and refuse it if ANY returned address is blocked.
 *
 * @param {string} host hostname to look up
 * @returns {Promise<{address: string, family: number}>} the first DNS record
 * @throws {SafeFetchError} code 'no_dns' when the lookup returns no records,
 *   code 'blocked_addr' when a record points at a blocked address
 */
async function resolveAndCheck(host) {
  const answers = await lookup(host, { all: true });
  if (answers.length === 0) {
    throw new SafeFetchError(`no DNS for ${host}`, 'no_dns');
  }

  // Every record is checked, not just the first: the connection may use any of them.
  const offending = answers.find(({ address }) => isBlockedAddr(address));
  if (offending !== undefined) {
    throw new SafeFetchError(`${host} resolves to blocked address ${offending.address}`, 'blocked_addr');
  }

  return answers[0];
}
/**
 * Turn a WHATWG `URL.hostname` into the form the IP checks expect.
 *
 * - IPv6 literals keep their brackets in `hostname` ("[::1]"); `net.isIP`
 *   only recognises the bare address, so the brackets are removed.
 * - The URL parser re-serialises "::ffff:127.0.0.1" as "::ffff:7f00:1"; the
 *   dotted form is restored so the embedded IPv4 address gets inspected.
 */
function hostForIpCheck(hostname) {
  const bare = hostname.startsWith('[') && hostname.endsWith(']')
    ? hostname.slice(1, -1)
    : hostname;
  const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/i.exec(bare);
  if (mapped === null) return bare;
  const hi = Number.parseInt(mapped[1], 16);
  const lo = Number.parseInt(mapped[2], 16);
  return `::ffff:${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`;
}

/**
 * fetch() with SSRF checks applied to the initial URL and to every redirect hop.
 *
 * @param {string} url absolute URL to fetch
 * @param {object} [options] passed through to fetch(); `redirect` is forced to 'manual'
 * @param {{maxHops?: number}} [limits] maximum number of redirects to follow (default 5)
 * @returns {Promise<Response>} the first non-redirect response
 * @throws {SafeFetchError} codes: 'scheme', 'blocked_addr', 'no_dns',
 *   'bad_redirect', 'too_many_redirects'
 */
export async function safeFetch(url, options = {}, { maxHops = 5 } = {}) {
  const REDIRECT_STATUSES = [301, 302, 303, 307, 308];
  let current = url;

  // hop 0 is the initial request, so up to maxHops redirects are followed.
  for (let hop = 0; hop <= maxHops; hop++) {
    const u = new URL(current);
    if (u.protocol !== 'http:' && u.protocol !== 'https:') {
      throw new SafeFetchError(`unsupported scheme ${u.protocol}`, 'scheme');
    }

    const host = hostForIpCheck(u.hostname);
    if (net.isIP(host)) {
      if (isBlockedAddr(host)) throw new SafeFetchError(`blocked literal IP ${u.hostname}`, 'blocked_addr');
    } else {
      // Validation only: fetch() below performs its own, separate DNS lookup.
      await resolveAndCheck(host);
    }

    const res = await fetch(current, { ...options, redirect: 'manual' });
    if (!REDIRECT_STATUSES.includes(res.status)) return res;

    const loc = res.headers.get('location');
    // The redirect body is never read; cancel it so the connection is released
    // instead of lingering until garbage collection.
    await res.body?.cancel();
    if (!loc) throw new SafeFetchError('redirect without Location', 'bad_redirect');
    current = new URL(loc, current).toString();
  }

  throw new SafeFetchError(`too many redirects (max ${maxHops})`, 'too_many_redirects');
}

View File

@@ -1,5 +1,6 @@
import crypto from 'node:crypto';
import { extract } from '../../ingest/readability.js';
import { safeFetch } from '../../ingest/safe_fetch.js';
import * as refs from '../../db/repos/refs.js';
import { pool } from '../../db/pool.js';
@@ -19,7 +20,7 @@ export async function handler(job) {
);
if (existing) return { ref_id: existing.id, idempotent: true };
const res = await fetch(url, {
const res = await safeFetch(url, {
headers: { 'User-Agent': 'void-ingest/2.0' },
signal: AbortSignal.timeout(15_000)
});

72
tests/api/capture.test.js Normal file
View File

@@ -0,0 +1,72 @@
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
import fs from 'node:fs/promises';
import path from 'node:path';
import os from 'node:os';
import request from 'supertest';
import { setup } from './helpers.js';
import { stopBoss, waitForJob } from '../helpers/boss.js';
import * as queue from '../../lib/jobs/queue.js';
import { registerWorkers } from '../../lib/jobs/index.js';
import * as spaces from '../../lib/db/repos/spaces.js';
import * as refs from '../../lib/db/repos/refs.js';
// Per-test fixtures, reassigned in beforeEach.
let app;
let ownerHeaders;
let sp;

// Minimal article page returned by the stubbed fetch for every URL.
const HTML = `<html><head><title>X</title></head><body><article>
<p>An article body with enough text for readability to choose it as the main content.</p>
<p>Another paragraph to satisfy the readability heuristic.</p>
</article></body></html>`;

// NOTE(review): the URL tests use https://example.com; if the ingest.url worker
// resolves the hostname before calling the stubbed fetch, this suite needs
// working DNS — confirm that is acceptable for offline runs.
beforeEach(async () => {
  ({ app, ownerHeaders } = await setup());
  sp = await spaces.create({ slug: 'cap', name: 'Cap' }, { kind: 'user', id: null });
  process.env.BLOB_ROOT = await fs.mkdtemp(path.join(os.tmpdir(), 'void-blobs-'));
  await queue.start();
  await registerWorkers();
  // stubGlobal (not a bare `global.fetch = ...`) so the original fetch can be
  // put back in afterEach; restoreAllMocks() does not undo plain assignments.
  vi.stubGlobal('fetch', vi.fn(async () => new Response(HTML, {
    status: 200, headers: { 'content-type': 'text/html' }
  })));
});

afterEach(async () => {
  await stopBoss();
  vi.unstubAllGlobals();
  vi.restoreAllMocks();
});
describe('capture api', () => {
  // POST a JSON body to /api/capture; `authed: false` leaves out the owner headers.
  const postCapture = (body, { authed = true } = {}) => {
    const call = request(app).post('/api/capture');
    return (authed ? call.set(ownerHeaders) : call).send(body);
  };

  it('POST /api/capture enqueues ingest.url and returns 202', async () => {
    const { status, body } = await postCapture({ space_id: sp.id, url: 'https://example.com/a' });

    expect(status).toBe(202);
    expect(body.job_id).toBeTruthy();
    expect(body.idempotency_key).toMatch(/^[0-9a-f]{64}$/);
  });

  it('POST /api/capture returns existing ref_id on duplicate', async () => {
    const payload = { space_id: sp.id, url: 'https://example.com/dup' };

    // First capture must finish ingesting before the duplicate is detectable.
    const first = await postCapture(payload);
    await waitForJob('ingest.url', first.body.job_id, { timeoutMs: 10_000 });

    const second = await postCapture(payload);
    expect(second.status).toBe(202);
    expect(second.body.job_id).toBeNull();
    expect(second.body.ref_id).toBeTruthy();
  });

  it('POST /api/capture/upload enqueues ingest.blob', async () => {
    const attachment = { filename: 'a.txt', contentType: 'text/plain' };
    const res = await request(app)
      .post('/api/capture/upload')
      .set(ownerHeaders)
      .field('space_id', sp.id)
      .attach('file', Buffer.from('hi'), attachment);

    expect(res.status).toBe(202);
    expect(res.body.job_id).toBeTruthy();

    await waitForJob('ingest.blob', res.body.job_id, { timeoutMs: 10_000 });
    const [newest] = await refs.list({ space_id: sp.id });
    expect(newest.kind).toBe('file');
  });

  it('POST /api/capture rejects missing url', async () => {
    const { status } = await postCapture({ space_id: sp.id });
    expect(status).toBe(400);
  });

  it('unauthenticated → 401', async () => {
    const { status } = await postCapture(
      { space_id: sp.id, url: 'https://example.com/a' },
      { authed: false }
    );
    expect(status).toBe(401);
  });
});

View File

@@ -0,0 +1,34 @@
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { safeFetch, SafeFetchError } from '../../lib/ingest/safe_fetch.js';
// Every test starts with the private-address override switched off.
beforeEach(() => { delete process.env.VOID_INGEST_ALLOW_PRIVATE; });

// Undo everything a test may have changed so nothing leaks into later tests:
// the env override, any stubbed globals (fetch) and spies.
afterEach(() => {
  delete process.env.VOID_INGEST_ALLOW_PRIVATE;
  vi.unstubAllGlobals();
  vi.restoreAllMocks();
});

describe('safeFetch', () => {
  it('rejects file:// scheme', async () => {
    await expect(safeFetch('file:///etc/passwd')).rejects.toThrow(SafeFetchError);
  });

  it('rejects literal loopback IP', async () => {
    await expect(safeFetch('http://127.0.0.1/x')).rejects.toThrow(/blocked/);
  });

  it('rejects literal RFC1918 IP', async () => {
    await expect(safeFetch('http://192.168.1.1/x')).rejects.toThrow(/blocked/);
  });

  it('rejects literal CGNAT IP', async () => {
    await expect(safeFetch('http://100.64.0.1/x')).rejects.toThrow(/blocked/);
  });

  it('rejects AWS metadata literal IP', async () => {
    await expect(safeFetch('http://169.254.169.254/latest/meta-data/')).rejects.toThrow(/blocked/);
  });

  it('allows literal IPs when VOID_INGEST_ALLOW_PRIVATE=true (test fixtures)', async () => {
    process.env.VOID_INGEST_ALLOW_PRIVATE = 'true';
    // stubGlobal instead of assigning global.fetch, so afterEach can restore it.
    vi.stubGlobal('fetch', vi.fn(async () => new Response('ok', { status: 200 })));
    const res = await safeFetch('http://127.0.0.1:65535/');
    expect(res.status).toBe(200);
  });
});