diff --git a/lib/ingest/readability.js b/lib/ingest/readability.js
new file mode 100644
index 0000000..4720e31
--- /dev/null
+++ b/lib/ingest/readability.js
@@ -0,0 +1,31 @@
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
+
+/**
+ * Extract the main article from an HTML page with Mozilla Readability.
+ *
+ * @param {string} html - HTML markup to parse.
+ * @param {string} url - Passed to JSDOM as the document's `url` option.
+ * @returns {{title: ?string, textContent: string, excerpt: ?string, byline: ?string, siteName: ?string}}
+ *   Falsy fields become null (textContent becomes '' and is trimmed); if
+ *   Readability finds no article, every field is null and textContent is ''.
+ */
+export function extract(html, url) {
+  const dom = new JSDOM(html, { url });
+  try {
+    const article = new Readability(dom.window.document).parse();
+    if (!article) {
+      return { title: null, textContent: '', excerpt: null, byline: null, siteName: null };
+    }
+    return {
+      title: article.title || null,
+      textContent: (article.textContent || '').trim(),
+      excerpt: article.excerpt || null,
+      byline: article.byline || null,
+      siteName: article.siteName || null,
+    };
+  } finally {
+    // Release the jsdom window; only the extracted field values go back to the caller.
+    dom.window.close();
+  }
+}
diff --git a/tests/ingest/readability.test.js b/tests/ingest/readability.test.js
new file mode 100644
index 0000000..2b8e063
--- /dev/null
+++ b/tests/ingest/readability.test.js
@@ -0,0 +1,25 @@
+import { describe, it, expect } from 'vitest';
+import { extract } from '../../lib/ingest/readability.js';
+
+const HTML = `
+  <html><head><title>Blackflame Notes</title>
+  <meta property="og:site_name" content="Hynesy"></head>
+  <body><article>
+  <h1>Blackflame Notes</h1>
+  <p>An essay on the Cradle aesthetic and the blackflame motif. This is a longer paragraph that gives readability enough text to consider this the main content of the page.</p>
+  <p>A second paragraph also part of the article.</p>
+  </article></body></html>`;
+
+describe('readability.extract', () => {
+  it('pulls title and text', () => {
+    const out = extract(HTML, 'https://example.com/x');
+    expect(out.title).toMatch(/Blackflame/);
+    expect(out.textContent).toMatch(/Cradle/);
+    expect(out.siteName).toBe('Hynesy');
+  });
+
+  it('returns empty struct when nothing parseable', () => {
+    const out = extract('', 'https://example.com');
+    expect(out.textContent).toBe('');
+  });
+});