feat(workers): extract.pdf with Tesseract fallback

pdftotext first; falls back to per-page pdftoppm rasterization + Tesseract OCR when the extracted text is < 200 chars. Updates refs.body_text + metadata.extract.{method,chars} via the repo shim; audit entry emitted with actor_kind='worker'. born_digital.pdf fixture padded so pdftotext yields > 200 chars and the test exercises the pdftotext path, not the OCR fallback. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-01 04:59:53 +10:00
parent bbb08a677e
commit 1f0e9a5f1b
5 changed files with 206 additions and 1 deletions
--- a/workers/void_workers/handlers/pdf.py
+++ b/workers/void_workers/handlers/pdf.py
@@ -0,0 +1,47 @@
+import subprocess
+import tempfile
+from pathlib import Path
+from PIL import Image
+import pytesseract
+from .. import repo
+
+NAME = "extract.pdf"
+FALLBACK_THRESHOLD = 200  # chars below which we OCR
+
+
+def _pdftotext(blob_path):
+    return subprocess.check_output(
+        ["pdftotext", "-layout", blob_path, "-"], timeout=120
+    ).decode("utf-8", errors="replace")
+
+
+def _ocr_pdf(blob_path):
+    """Rasterize each page with pdftoppm, OCR each with Tesseract."""
+    with tempfile.TemporaryDirectory() as tmp:
+        subprocess.run(
+            ["pdftoppm", "-r", "200", "-png", blob_path, f"{tmp}/p"],
+            check=True, timeout=300
+        )
+        pages = sorted(Path(tmp).glob("p-*.png"))
+        parts = []
+        for p in pages:
+            img = Image.open(p)
+            parts.append(pytesseract.image_to_string(img, lang="eng"))
+        return "\n".join(parts)
+
+
+def handle(job_data: dict) -> dict:
+    ref_id = job_data["ref_id"]
+    blob_path = job_data["blob_path"]
+    method = "pdftotext"
+    text = _pdftotext(blob_path).strip()
+    if len(text) < FALLBACK_THRESHOLD:
+        method = "tesseract"
+        text = _ocr_pdf(blob_path).strip()
+    body_text = text[:200_000]
+    repo.update_ref(
+        ref_id,
+        body_text=body_text,
+        metadata_patch={"extract": {"method": method, "chars": len(body_text)}}
+    )
+    return {"ref_id": ref_id, "chars": len(body_text), "method": method}