feat(ingest): readability wrapper
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
16
lib/ingest/readability.js
Normal file
16
lib/ingest/readability.js
Normal file
@@ -0,0 +1,16 @@
|
||||
import { JSDOM } from 'jsdom';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
|
||||
/**
 * Run Mozilla Readability over an HTML string and return a normalized
 * summary of the detected article.
 *
 * @param {string} html - Raw HTML markup of the page.
 * @param {string} url - Passed to jsdom as the document's `url` option.
 * @returns {{
 *   title: (string|null),
 *   textContent: string,
 *   excerpt: (string|null),
 *   byline: (string|null),
 *   siteName: (string|null)
 * }} Article fields. Missing or empty fields are reported as `null`, except
 *   `textContent`, which is always a trimmed string (`''` when Readability
 *   finds no article).
 */
export function extract(html, url) {
  const dom = new JSDOM(html, { url });
  try {
    const article = new Readability(dom.window.document).parse();
    if (!article) {
      return { title: null, textContent: '', excerpt: null, byline: null, siteName: null };
    }
    // `||` (not `??`) is deliberate: empty strings are normalized to null too.
    return {
      title: article.title || null,
      textContent: (article.textContent || '').trim(),
      excerpt: article.excerpt || null,
      byline: article.byline || null,
      siteName: article.siteName || null,
    };
  } finally {
    // Always shut the jsdom window down, even when parsing throws, so that
    // repeated calls do not accumulate live windows. The returned fields are
    // plain strings/null, so they stay valid after the window is closed.
    dom.window.close();
  }
}
|
||||
25
tests/ingest/readability.test.js
Normal file
25
tests/ingest/readability.test.js
Normal file
@@ -0,0 +1,25 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { extract } from '../../lib/ingest/readability.js';
|
||||
|
||||
// Article fixture fed to `extract`: a <title>, an og:site_name meta tag
// ("Hynesy"), and an <article> holding a heading plus two paragraphs.
const HTML = `
|
||||
<html><head><title>Blackflame Notes</title>
|
||||
<meta property="og:site_name" content="Hynesy"/>
|
||||
</head><body><article>
|
||||
<h1>Blackflame Notes</h1>
|
||||
<p>An essay on the Cradle aesthetic and the blackflame motif. This is a longer paragraph that gives readability enough text to consider this the main content of the page.</p>
|
||||
<p>A second paragraph also part of the article.</p>
|
||||
</article></body></html>`;
|
||||
|
||||
// Checks for the `extract` wrapper: one fixture that contains an article
// and one document with nothing to extract.
describe('readability.extract', () => {
  it('pulls title and text', () => {
    const { title, textContent, siteName } = extract(HTML, 'https://example.com/x');

    expect(title).toMatch(/Blackflame/);
    expect(textContent).toMatch(/Cradle/);
    expect(siteName).toBe('Hynesy');
  });

  it('returns empty struct when nothing parseable', () => {
    const result = extract('<html></html>', 'https://example.com');

    expect(result.textContent).toBe('');
  });
});
|
||||
Reference in New Issue
Block a user