"""PDF text-extraction job handler.

Extracts text from a PDF with ``pdftotext``; when that yields fewer than
``FALLBACK_THRESHOLD`` characters the pages are rasterized with ``pdftoppm``
and OCR'd with Tesseract instead. The result is stored via
``repo.update_ref``.
"""

import subprocess
import tempfile
from pathlib import Path

from PIL import Image
import pytesseract

from .. import repo

# NOTE(review): presumably the name this handler is registered under by the
# job dispatcher -- confirm against the caller.
NAME = "extract.pdf"

# If pdftotext returns fewer than this many characters (after stripping),
# fall back to OCR.
FALLBACK_THRESHOLD = 200

# Upper bound on the number of characters stored as the ref's body text.
MAX_BODY_CHARS = 200_000


def _pdftotext(blob_path) -> str:
    """Return the text of the PDF at *blob_path* as produced by ``pdftotext``.

    Runs ``pdftotext -layout <blob_path> -`` (output to stdout) with a
    120-second timeout and decodes the output as UTF-8, replacing
    undecodable bytes.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero.
        subprocess.TimeoutExpired: if it runs longer than 120 seconds.
    """
    raw = subprocess.check_output(
        ["pdftotext", "-layout", blob_path, "-"],
        timeout=120,
    )
    return raw.decode("utf-8", errors="replace")


def _ocr_pdf(blob_path) -> str:
    """Rasterize each page with pdftoppm, OCR each with Tesseract.

    Pages are rendered as 200-DPI PNGs into a temporary directory, OCR'd in
    page order with the English language model, and joined with newlines.

    Raises:
        subprocess.CalledProcessError: if ``pdftoppm`` exits non-zero.
        subprocess.TimeoutExpired: if it runs longer than 300 seconds.
    """
    with tempfile.TemporaryDirectory() as tmp:
        subprocess.run(
            ["pdftoppm", "-r", "200", "-png", blob_path, f"{tmp}/p"],
            check=True,
            timeout=300,
        )
        # Sort by (name length, name) so pages come out in numeric order
        # whether or not pdftoppm zero-pads the page number; for equally
        # padded names this is identical to a plain lexicographic sort.
        pages = sorted(
            Path(tmp).glob("p-*.png"),
            key=lambda page: (len(page.name), page.name),
        )
        parts = []
        for page in pages:
            # Close each image explicitly so no file handle is still open
            # when the temporary directory is removed.
            with Image.open(page) as img:
                parts.append(pytesseract.image_to_string(img, lang="eng"))
    return "\n".join(parts)


def handle(job_data: dict) -> dict:
    """Extract text from a PDF blob and store it on the referenced record.

    Args:
        job_data: must contain ``"ref_id"`` (passed through to
            ``repo.update_ref``) and ``"blob_path"`` (path of the PDF to
            extract).

    Returns:
        A dict with ``"ref_id"``, ``"chars"`` (length of the stored text)
        and ``"method"`` (``"pdftotext"`` or ``"tesseract"``).

    Raises:
        KeyError: if ``job_data`` lacks ``"ref_id"`` or ``"blob_path"``.
        subprocess.CalledProcessError, subprocess.TimeoutExpired: propagated
            from the external tools.
    """
    ref_id = job_data["ref_id"]
    blob_path = job_data["blob_path"]

    method = "pdftotext"
    text = _pdftotext(blob_path).strip()
    if len(text) < FALLBACK_THRESHOLD:
        # Too little embedded text (e.g. a scanned document): OCR instead.
        method = "tesseract"
        text = _ocr_pdf(blob_path).strip()

    body_text = text[:MAX_BODY_CHARS]
    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": {"method": method, "chars": len(body_text)}},
    )
    return {"ref_id": ref_id, "chars": len(body_text), "method": method}