Blackflame
+An essay on the Cradle aesthetic and the blackflame motif. Long enough for readability to consider this main content. Lorem ipsum dolor sit amet.
+Another paragraph that pads out the article for readability detection.
+diff --git a/lib/jobs/index.js b/lib/jobs/index.js index 040ef92..0b28f47 100644 --- a/lib/jobs/index.js +++ b/lib/jobs/index.js @@ -1,7 +1,8 @@ import * as queue from './queue.js'; import * as echo from './workers/echo.js'; +import * as url from './workers/url.js'; -const WORKERS = [echo]; +const WORKERS = [echo, url]; export async function registerWorkers() { for (const w of WORKERS) { diff --git a/lib/jobs/workers/url.js b/lib/jobs/workers/url.js new file mode 100644 index 0000000..1770d41 --- /dev/null +++ b/lib/jobs/workers/url.js @@ -0,0 +1,43 @@ +import crypto from 'node:crypto'; +import { extract } from '../../ingest/readability.js'; +import * as refs from '../../db/repos/refs.js'; +import { pool } from '../../db/pool.js'; + +export const NAME = 'ingest.url'; + +/** Returns the hex SHA-256 digest of space_id, a NUL byte, and url concatenated. handler() looks this value up in, and stores it as, refs.external_id. The NUL separator presumably keeps distinct (space_id, url) pairs from producing the same hash input (verify). */ function key(space_id, url) { + return crypto.createHash('sha256').update(space_id + '\x00' + url).digest('hex'); +} + +/** Job handler exported next to NAME ('ingest.url'). Reads space_id and url from job.data. If a refs row with source_kind='url' and external_id equal to key(space_id, url) already exists, returns { ref_id, idempotent: true } without fetching. Otherwise fetches url, throws an Error carrying the status when res.ok is false, passes the response text to extract(html, url), creates a row through refs.create, and returns { ref_id }. NOTE(review): the SELECT and the create are separate statements, so whether two concurrent jobs for the same URL can both insert depends on a constraint that is not visible here; confirm. NOTE(review): parsed is dereferenced unconditionally; confirm extract() never returns null. NOTE(review): res.text() reads the whole response with no size limit; only the stored body_text is capped. */ export async function handler(job) { + const { space_id, url } = job.data; + const idem = key(space_id, url); /* value matched against and written to external_id below */ + + const { rows: [existing] } = await pool.query( + `SELECT id FROM refs WHERE source_kind='url' AND external_id=$1 LIMIT 1`, + [idem] + ); + if (existing) return { ref_id: existing.id, idempotent: true }; + + const res = await fetch(url, { + headers: { 'User-Agent': 'void-ingest/2.0' }, + signal: AbortSignal.timeout(15_000) /* abort the request after 15 seconds */ + }); + if (!res.ok) throw new Error(`fetch ${url} → ${res.status}`); + const html = await res.text(); + + const parsed = extract(html, url); + const row = await refs.create({ + space_id, + kind: 'url', + source_url: url, + title: parsed.title || url, /* falls back to the URL when the title is empty or missing */ + summary: parsed.excerpt, + body_text: (parsed.textContent || '').slice(0, 200_000), /* keeps at most the first 200_000 UTF-16 code units */ + source_kind: 'url', + external_id: idem, + metadata: { site_name: parsed.siteName, byline: parsed.byline } + }, { kind: 'system', id: null }); /* second argument presumably identifies the acting principal; verify against refs.create */ + + return { ref_id: row.id }; +} diff --git a/tests/jobs/workers/url.test.js b/tests/jobs/workers/url.test.js new file mode 100644 index 0000000..3aebce2 --- /dev/null +++ 
b/tests/jobs/workers/url.test.js @@ -0,0 +1,47 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { resetDb } from '../../helpers/db.js'; +import { migrateUp } from '../../../lib/db/migrate.js'; +import { stopBoss, waitForJob } from '../../helpers/boss.js'; +import * as queue from '../../../lib/jobs/queue.js'; +import { registerWorkers } from '../../../lib/jobs/index.js'; +import * as spaces from '../../../lib/db/repos/spaces.js'; +import * as refs from '../../../lib/db/repos/refs.js'; + +const HTML = `
An essay on the Cradle aesthetic and the blackflame motif. Long enough for readability to consider this main content. Lorem ipsum dolor sit amet.
+Another paragraph that pads out the article for readability detection.
+