The PDF handler tries pdftotext first and falls back to per-page pdftoppm
rasterization + Tesseract OCR when the extracted text is shorter than 200
chars. It updates refs.body_text and metadata.extract.{method,chars} via the
repo shim, and emits an audit entry with actor_kind='worker'.
The born_digital.pdf fixture is padded so that pdftotext yields > 200 chars,
so the test exercises the pdftotext path rather than the OCR fallback.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
import subprocess
|
|
from pathlib import Path
|
|
from void_workers.handlers.pdf import handle as handle_pdf
|
|
|
|
# Directory named "fixtures" that sits next to this test module; the tests
# below load their sample PDFs from it.
FIXTURES = Path(__file__).parent.joinpath("fixtures")
|
|
|
|
|
|
def _run_node_migrations():
    """Run ``node lib/db/migrate.js up`` from the void-v2 project directory.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero
            (a consequence of ``check=True``).
    """
    migrate_cmd = ["node", "lib/db/migrate.js", "up"]
    project_dir = "/project/src/void-v2"
    subprocess.run(migrate_cmd, cwd=project_dir, check=True)
|
|
|
|
|
|
def _reset_void_schema(conn):
|
|
"""Mirror tests/helpers/db.js::resetDb on the Node side."""
|
|
conn.execute("DROP SCHEMA IF EXISTS public CASCADE")
|
|
conn.execute("CREATE SCHEMA public")
|
|
conn.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto")
|
|
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
|
|
|
|
|
|
def _seed_space_and_ref(conn, blob_path, kind="pdf"):
|
|
sp = conn.execute(
|
|
"INSERT INTO spaces(slug, name) VALUES('plan4-tests', 'P4') "
|
|
"ON CONFLICT (slug) DO UPDATE SET name=EXCLUDED.name RETURNING id"
|
|
).fetchone()[0]
|
|
ref = conn.execute(
|
|
"INSERT INTO refs(space_id, kind, source_url, title, blob_path) "
|
|
"VALUES(%s, %s, NULL, 'fixture', %s) RETURNING id",
|
|
(sp, kind, str(blob_path))
|
|
).fetchone()[0]
|
|
return sp, ref
|
|
|
|
|
|
def test_pdf_born_digital_uses_pdftotext(conn):
    """A born-digital PDF is reported as extracted by ``pdftotext``.

    Checks the handler's returned ``ref_id``, ``method`` and ``chars``, then
    reads ``refs.body_text`` back and expects it to mention "void-workers".
    """
    # NOTE(review): migrations run in a separate node process, so this
    # presumably relies on `conn` committing the schema reset before the
    # subprocess starts (e.g. autocommit) — confirm against the fixture.
    _reset_void_schema(conn)
    _run_node_migrations()

    pdf_path = FIXTURES / "born_digital.pdf"
    _space_id, ref_id = _seed_space_and_ref(conn, pdf_path)

    job = {"ref_id": str(ref_id), "blob_path": str(pdf_path)}
    result = handle_pdf(job)

    assert result["ref_id"] == str(ref_id)
    assert result["method"] == "pdftotext"
    assert result["chars"] > 0

    stored = conn.execute(
        "SELECT body_text FROM refs WHERE id=%s", (ref_id,)
    ).fetchone()
    # body_text may be NULL; fall back to "" so the check fails on content
    # rather than on an AttributeError.
    body_text = stored[0] or ""
    assert "void-workers" in body_text.lower()
|
|
|
|
|
|
def test_pdf_scanned_falls_back_to_tesseract(conn):
    """A scanned PDF is reported as extracted by ``tesseract``.

    Checks the handler's returned ``method``, then reads ``refs.body_text``
    back and expects it to mention "blackflame".
    """
    # NOTE(review): migrations run in a separate node process, so this
    # presumably relies on `conn` committing the schema reset before the
    # subprocess starts (e.g. autocommit) — confirm against the fixture.
    _reset_void_schema(conn)
    _run_node_migrations()

    pdf_path = FIXTURES / "scanned.pdf"
    _space_id, ref_id = _seed_space_and_ref(conn, pdf_path)

    job = {"ref_id": str(ref_id), "blob_path": str(pdf_path)}
    result = handle_pdf(job)

    assert result["method"] == "tesseract"

    stored = conn.execute(
        "SELECT body_text FROM refs WHERE id=%s", (ref_id,)
    ).fetchone()
    # body_text may be NULL; fall back to "" so the check fails on content
    # rather than on an AttributeError.
    body_text = stored[0] or ""
    assert "blackflame" in body_text.lower()
|