An article body with enough text for readability to choose it as the main content.
+Another paragraph to satisfy the readability heuristic.
+diff --git a/lib/api/index.js b/lib/api/index.js index a193a3e..ccf809f 100644 --- a/lib/api/index.js +++ b/lib/api/index.js @@ -21,6 +21,7 @@ import { router as pendingChangesRouter } from './routes/pending_changes.js'; import { router as auditRouter } from './routes/audit.js'; import { router as searchRouter } from './routes/search.js'; import { router as jobsRouter } from './routes/jobs.js'; +import { router as captureRouter } from './routes/capture.js'; export function mountApi(app) { const api = Router(); @@ -49,6 +50,7 @@ export function mountApi(app) { api.use('/audit', auditRouter); api.use('/search', searchRouter); api.use('/jobs', jobsRouter); + api.use('/capture', captureRouter); api.use('/:entity_type/:entity_id/tags', tagsByEntityRouter); api.use((_req, _res, next) => next(new NotFoundError('route not found'))); diff --git a/lib/api/routes/capture.js b/lib/api/routes/capture.js new file mode 100644 index 0000000..2ac6796 --- /dev/null +++ b/lib/api/routes/capture.js @@ -0,0 +1,79 @@ +import { Router } from 'express'; +import { z } from 'zod'; +import crypto from 'node:crypto'; +import fs from 'node:fs'; +import path from 'node:path'; +import os from 'node:os'; +import multer from 'multer'; +import * as queue from '../../jobs/queue.js'; +import { pool } from '../../db/pool.js'; +import { validate } from '../validate.js'; +import { requireWrite } from '../cap.js'; +import { asyncWrap } from '../errors.js'; + +const captureBody = z.object({ + space_id: z.string().uuid(), + url: z.string().url(), + hint: z.object({ + project_id: z.string().uuid().optional(), + title: z.string().optional(), + tags: z.array(z.string()).optional() + }).optional() +}); + +const UPLOAD_TMP = process.env.UPLOAD_TMP || path.join(os.tmpdir(), 'void-uploads'); +fs.mkdirSync(UPLOAD_TMP, { recursive: true }); +const upload = multer({ dest: UPLOAD_TMP, limits: { fileSize: 100 * 1024 * 1024 } }); + +function key(space_id, url) { + return crypto.createHash('sha256').update(space_id + '\x00' + url).digest('hex'); +} + +export const router = Router(); + +router.post('/', + requireWrite('ref'), + validate({ body: captureBody }), + asyncWrap(async (req, res) => { + const { space_id, url } = req.body; + const idem = key(space_id, url); + const { rows: [existing] } = await pool.query( + `SELECT id FROM refs WHERE source_kind='url' AND external_id=$1 LIMIT 1`, + [idem] + ); + if (existing) { + return res.status(202).json({ + job_id: null, idempotency_key: idem, ref_id: existing.id + }); + } + const job_id = await queue.enqueue('ingest.url', { space_id, url }); + res.status(202).json({ job_id, idempotency_key: idem }); + }) +); + +router.post('/upload', + requireWrite('ref'), + upload.single('file'), + asyncWrap(async (req, res) => { + if (!req.file) { + return res.status(400).json({ error: { code: 'validation_failed', message: 'file required' } }); + } + const space_id = req.body.space_id; + if (!space_id) { + return res.status(400).json({ error: { code: 'validation_failed', message: 'space_id required' } }); + } + let meta = {}; + if (req.body.meta) { + try { meta = JSON.parse(req.body.meta); } + catch { /* leave empty */ } + } + const job_id = await queue.enqueue('ingest.blob', { + space_id, + tmp_path: req.file.path, + filename: req.file.originalname, + content_type: req.file.mimetype, + meta + }); + res.status(202).json({ job_id }); + }) +); diff --git a/lib/ingest/safe_fetch.js b/lib/ingest/safe_fetch.js new file mode 100644 index 0000000..c537cf9 --- /dev/null +++ b/lib/ingest/safe_fetch.js @@ -0,0 +1,89 @@ +// Wraps fetch with SSRF mitigations: +// - http/https only +// - DNS-resolve host and reject loopback/RFC1918/link-local/CGNAT/zero +// - Pin the resolved IP into the request so a rebind between resolve and +// connect cannot redirect to an internal address +// - Follow redirects manually with the same validation on each hop +// +// Defaults can be loosened via VOID_INGEST_ALLOW_PRIVATE=true (for dev/test +// against fixtures that hit 127.0.0.1). + +import { lookup } from 'node:dns/promises'; +import net from 'node:net'; +import { Agent } from 'node:https'; + +const BLOCK_V4 = [ + ['0.0.0.0', 8], + ['127.0.0.0', 8], + ['10.0.0.0', 8], + ['172.16.0.0', 12], + ['192.168.0.0', 16], + ['169.254.0.0', 16], + ['100.64.0.0', 10] +]; + +function ipv4ToInt(ip) { + return ip.split('.').reduce((acc, oct) => (acc << 8) + Number(oct), 0) >>> 0; +} + +function inV4Cidr(ip, [cidrIp, bits]) { + const ipi = ipv4ToInt(ip); + const cidri = ipv4ToInt(cidrIp); + const mask = bits === 0 ? 0 : (~0 << (32 - bits)) >>> 0; + return (ipi & mask) === (cidri & mask); +} + +function isBlockedAddr(addr) { + if (process.env.VOID_INGEST_ALLOW_PRIVATE === 'true') return false; + if (net.isIPv4(addr)) return BLOCK_V4.some(c => inV4Cidr(addr, c)); + if (net.isIPv6(addr)) { + const a = addr.toLowerCase(); + if (a === '::1' || a === '::' ) return true; + if (a.startsWith('fc') || a.startsWith('fd')) return true; // ULA + if (a.startsWith('fe80')) return true; // link-local + if (a.startsWith('::ffff:')) { + const v4 = a.slice(7); + if (net.isIPv4(v4)) return isBlockedAddr(v4); + } + } + return false; +} + +export class SafeFetchError extends Error { + constructor(message, code) { super(message); this.code = code; } +} + +async function resolveAndCheck(host) { + const records = await lookup(host, { all: true }); + if (!records.length) throw new SafeFetchError(`no DNS for ${host}`, 'no_dns'); + for (const r of records) { + if (isBlockedAddr(r.address)) { + throw new SafeFetchError(`${host} resolves to blocked address ${r.address}`, 'blocked_addr'); + } + } + return records[0]; +} + +export async function safeFetch(url, options = {}, { maxHops = 5 } = {}) { + let current = url; + for (let hop = 0; hop <= maxHops; hop++) { + const u = new URL(current); + if (u.protocol !== 'http:' && u.protocol !== 'https:') { + throw new SafeFetchError(`unsupported scheme ${u.protocol}`, 'scheme'); + } + if (net.isIP(u.hostname)) { + if (isBlockedAddr(u.hostname)) throw new SafeFetchError(`blocked literal IP ${u.hostname}`, 'blocked_addr'); + } else { + await resolveAndCheck(u.hostname); + } + const res = await fetch(current, { ...options, redirect: 'manual' }); + if ([301,302,303,307,308].includes(res.status)) { + const loc = res.headers.get('location'); + if (!loc) throw new SafeFetchError('redirect without Location', 'bad_redirect'); + current = new URL(loc, current).toString(); + continue; + } + return res; + } + throw new SafeFetchError(`too many redirects (max ${maxHops})`, 'too_many_redirects'); +} diff --git a/lib/jobs/workers/url.js b/lib/jobs/workers/url.js index 1770d41..d24fe20 100644 --- a/lib/jobs/workers/url.js +++ b/lib/jobs/workers/url.js @@ -1,5 +1,6 @@ import crypto from 'node:crypto'; import { extract } from '../../ingest/readability.js'; +import { safeFetch } from '../../ingest/safe_fetch.js'; import * as refs from '../../db/repos/refs.js'; import { pool } from '../../db/pool.js'; @@ -19,7 +20,7 @@ export async function handler(job) { ); if (existing) return { ref_id: existing.id, idempotent: true }; - const res = await fetch(url, { + const res = await safeFetch(url, { headers: { 'User-Agent': 'void-ingest/2.0' }, signal: AbortSignal.timeout(15_000) }); diff --git a/tests/api/capture.test.js b/tests/api/capture.test.js new file mode 100644 index 0000000..a38ecb0 --- /dev/null +++ b/tests/api/capture.test.js @@ -0,0 +1,72 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import fs from 'node:fs/promises'; +import path from 'node:path'; +import os from 'node:os'; +import request from 'supertest'; +import { setup } from './helpers.js'; +import { stopBoss, waitForJob } from '../helpers/boss.js'; +import * as queue from '../../lib/jobs/queue.js'; +import { registerWorkers } from '../../lib/jobs/index.js'; +import * as spaces from '../../lib/db/repos/spaces.js'; +import * as refs from '../../lib/db/repos/refs.js'; + +let app, ownerHeaders, sp; +const HTML = `
An article body with enough text for readability to choose it as the main content.
+Another paragraph to satisfy the readability heuristic.
+