Runs pdftotext first; falls back to per-page pdftoppm rasterization +
Tesseract OCR when the stripped extracted text is < 200 chars. Updates
refs.body_text (truncated to 200,000 chars) + metadata.extract.{method,chars}
via the repo shim; audit entry emitted with actor_kind='worker'.
born_digital.pdf fixture padded so pdftotext yields > 200 chars and
the test exercises the pdftotext path, not the OCR fallback.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
48 lines
1.4 KiB
Python
48 lines
1.4 KiB
Python
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
import pytesseract
|
|
from .. import repo
|
|
|
|
# NOTE(review): presumably the job/handler name the worker dispatches on;
# nothing in this module reads it — confirm where NAME is consumed.
NAME = "extract.pdf"
|
|
# Minimum length, in characters after stripping surrounding whitespace, that
# the pdftotext output must reach; anything shorter makes handle() discard it
# and run the OCR path instead.
FALLBACK_THRESHOLD = 200 # chars below which we OCR
|
|
|
|
|
|
def _pdftotext(blob_path):
    """Extract the text layer of a PDF with the ``pdftotext`` CLI.

    Runs ``pdftotext -layout <blob_path> -`` so the text is written to
    stdout, waits at most 120 seconds, and decodes the captured bytes as
    UTF-8, replacing undecodable bytes instead of failing.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status and
    ``subprocess.TimeoutExpired`` when the time limit is exceeded.
    """
    command = ["pdftotext", "-layout", blob_path, "-"]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        check=True,
        timeout=120,
    )
    return completed.stdout.decode("utf-8", errors="replace")
|
|
|
|
|
|
def _ocr_pdf(blob_path):
    """Return OCR text for a PDF by rasterizing it and running Tesseract.

    ``pdftoppm`` renders the pages of *blob_path* as 200 dpi PNG files into a
    temporary directory; each PNG is then passed to Tesseract (``eng``
    language data) and the per-page results are joined with newlines.

    Args:
        blob_path: Path of the PDF file to rasterize.

    Returns:
        The recognized text of all page images, joined with newlines. An
        empty string when pdftoppm produced no page images.

    Raises:
        subprocess.CalledProcessError: pdftoppm exited with a non-zero status.
        subprocess.TimeoutExpired: pdftoppm ran longer than 300 seconds.
    """
    with tempfile.TemporaryDirectory() as tmp:
        subprocess.run(
            ["pdftoppm", "-r", "200", "-png", blob_path, f"{tmp}/p"],
            check=True,
            timeout=300,
        )
        # NOTE(review): a lexicographic sort matches page order only if
        # pdftoppm zero-pads page numbers to a common width — confirm for the
        # deployed poppler version.
        page_images = sorted(Path(tmp).glob("p-*.png"))
        parts = []
        for page_image in page_images:
            # Close every image as soon as it has been OCR'd so no file
            # handle outlives the temporary directory (and none accumulate
            # on PDFs with many pages).
            with Image.open(page_image) as img:
                parts.append(pytesseract.image_to_string(img, lang="eng"))
    return "\n".join(parts)
|
|
|
|
|
|
def handle(job_data: dict) -> dict:
    """Extract text for one PDF job and store it on the referenced record.

    Reads ``job_data["ref_id"]`` and ``job_data["blob_path"]``. The PDF is
    first run through ``_pdftotext``; when the stripped result is shorter
    than ``FALLBACK_THRESHOLD`` characters it is replaced by the stripped
    output of ``_ocr_pdf``. The chosen text is cut to at most 200,000
    characters and handed to ``repo.update_ref`` together with a metadata
    patch recording the extraction method and the stored character count.

    Returns:
        A dict with ``ref_id``, ``chars`` (length of the stored text) and
        ``method`` (``"pdftotext"`` or ``"tesseract"``).

    Raises:
        KeyError: ``job_data`` lacks ``ref_id`` or ``blob_path``.
        Whatever ``_pdftotext``, ``_ocr_pdf`` or ``repo.update_ref`` raise is
        propagated unchanged.
    """
    ref_id, blob_path = job_data["ref_id"], job_data["blob_path"]

    extracted = _pdftotext(blob_path).strip()
    if len(extracted) >= FALLBACK_THRESHOLD:
        method = "pdftotext"
    else:
        # Too little embedded text: rasterize and OCR instead.
        method = "tesseract"
        extracted = _ocr_pdf(blob_path).strip()

    # Cap the stored text; "chars" reports the length after truncation.
    body_text = extracted[:200_000]
    char_count = len(body_text)

    extract_info = {"method": method, "chars": char_count}
    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": extract_info},
    )
    return {"ref_id": ref_id, "chars": char_count, "method": method}
|