Files
Void-Homelab/lib/jobs/workers/url.js
root afc20712cb feat(api): capture POST + upload + SSRF-safe URL fetch
safe_fetch.js validates URLs before fetch: rejects non-http(s), literal
or DNS-resolved loopback / RFC1918 / link-local / CGNAT / metadata
addresses; follows redirects manually with the same checks on each hop.
Test fixtures gate the check with VOID_INGEST_ALLOW_PRIVATE for offline
fixtures that hit 127.0.0.1.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-01 03:42:54 +10:00

45 lines
1.3 KiB
JavaScript

import crypto from 'node:crypto';
import { extract } from '../../ingest/readability.js';
import { safeFetch } from '../../ingest/safe_fetch.js';
import * as refs from '../../db/repos/refs.js';
import { pool } from '../../db/pool.js';
// Exported name string for this worker module.
// NOTE(review): presumably the job/queue name used to route jobs to `handler` — confirm against the worker registry.
export const NAME = 'ingest.url';
/**
 * Derive the idempotency key for a (space, URL) pair.
 *
 * Both parts are concatenated with a NUL character between them and the
 * result is hashed with SHA-256, so ('ab', 'c') and ('a', 'bc') produce
 * different keys.
 *
 * @param {*} space_id - Space identifier; coerced to a string by the concatenation.
 * @param {*} url - URL being ingested; coerced to a string by the concatenation.
 * @returns {string} Hex-encoded SHA-256 digest of the joined value.
 */
function key(space_id, url) {
  const material = space_id + '\x00' + url;
  const hasher = crypto.createHash('sha256');
  hasher.update(material);
  return hasher.digest('hex');
}
/**
 * Job handler: fetch a URL, extract its readable content and store it as a ref.
 *
 * Steps:
 *  1. Compute an idempotency key from `space_id` and `url` (see `key`).
 *  2. Look for an existing `refs` row with `source_kind='url'` and that key as
 *     `external_id`; if one exists, return it without fetching.
 *  3. Fetch the URL through `safeFetch` with a 15 second abort timeout.
 *  4. Run the response text through `extract` and create a new ref from the result.
 *
 * NOTE(review): the lookup and the insert are separate statements, so two
 * concurrent jobs for the same (space_id, url) can both pass the check.
 * Whether a unique constraint on `external_id` backs this up is not visible
 * here — confirm against the schema.
 *
 * @param {{ data: { space_id: *, url: string } }} job - Job whose `data` carries the target space and URL.
 * @returns {Promise<{ ref_id: *, idempotent?: true }>} Id of the existing or newly created ref;
 *   `idempotent: true` is present only when an existing ref was reused.
 * @throws {Error} When the response is not `ok`; errors from `pool.query`, `safeFetch`,
 *   `extract` and `refs.create` propagate unchanged.
 */
export async function handler(job) {
  const { space_id, url } = job.data;
  const idem = key(space_id, url);

  // Skip the network round-trip entirely when this (space, url) was already ingested.
  const { rows: [existing] } = await pool.query(
    `SELECT id FROM refs WHERE source_kind='url' AND external_id=$1 LIMIT 1`,
    [idem]
  );
  if (existing) return { ref_id: existing.id, idempotent: true };

  const res = await safeFetch(url, {
    headers: { 'User-Agent': 'void-ingest/2.0' },
    signal: AbortSignal.timeout(15_000)
  });
  // Keep the URL and the status visibly separated so the failure is readable in job logs.
  if (!res.ok) throw new Error(`fetch ${url} failed: HTTP ${res.status}`);

  const html = await res.text();
  const parsed = extract(html, url);

  const row = await refs.create({
    space_id,
    kind: 'url',
    source_url: url,
    // Fall back to the URL itself when no (or an empty) title was extracted.
    title: parsed.title || url,
    summary: parsed.excerpt,
    // Cap the stored text at 200,000 string units.
    body_text: (parsed.textContent || '').slice(0, 200_000),
    source_kind: 'url',
    external_id: idem,
    metadata: { site_name: parsed.siteName, byline: parsed.byline }
  }, { kind: 'system', id: null }); // NOTE(review): second argument looks like the acting principal — confirm in refs.create.

  return { ref_id: row.id };
}