# Void-Homelab/workers/void_workers/handlers/pdf.py

import subprocess
import tempfile
from pathlib import Path
from PIL import Image
import pytesseract
from .. import repo

# Identifier for this handler. Presumably the job type the worker dispatcher
# uses to route jobs to handle() -- TODO confirm against the handler registry.
NAME = "extract.pdf"
# Minimum number of characters (after whitespace stripping) that pdftotext
# must produce; when it yields fewer, handle() falls back to OCR.
FALLBACK_THRESHOLD = 200  # chars below which we OCR


def _pdftotext(blob_path):
    """Return the text that ``pdftotext -layout`` extracts from *blob_path*.

    The tool is asked to write to stdout (the trailing ``-`` argument), and
    its raw output is decoded as UTF-8 with undecodable bytes replaced
    rather than raising.

    Raises:
        subprocess.CalledProcessError: if pdftotext exits with a non-zero
            status.
        subprocess.TimeoutExpired: if it runs longer than 120 seconds.
    """
    command = ["pdftotext", "-layout", blob_path, "-"]
    raw_output = subprocess.check_output(command, timeout=120)
    return raw_output.decode("utf-8", errors="replace")


def _ocr_pdf(blob_path, *, dpi: int = 200, lang: str = "eng") -> str:
    """Rasterize each page with pdftoppm, OCR each with Tesseract.

    Pages are rendered as PNG files into a temporary directory that is
    removed when the function returns, then recognized one at a time.

    Args:
        blob_path: Path of the PDF file to process.
        dpi: Rendering resolution passed to ``pdftoppm -r``.
        lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        The recognized text of all pages joined with newlines; an empty
        string when pdftoppm produced no page images.

    Raises:
        subprocess.CalledProcessError: if pdftoppm exits with a non-zero
            status.
        subprocess.TimeoutExpired: if rasterization exceeds 300 seconds.
    """
    with tempfile.TemporaryDirectory() as tmp:
        subprocess.run(
            ["pdftoppm", "-r", str(dpi), "-png", blob_path, f"{tmp}/p"],
            check=True, timeout=300
        )
        # NOTE(review): lexical sorting matches page order only if pdftoppm
        # zero-pads the page numbers in its output names -- confirm.
        pages = sorted(Path(tmp).glob("p-*.png"))
        parts = []
        for page_png in pages:
            # Close each image explicitly so its file handle is released
            # before the temporary directory is cleaned up, instead of
            # relying on garbage collection.
            with Image.open(page_png) as img:
                parts.append(pytesseract.image_to_string(img, lang=lang))
        return "\n".join(parts)


def handle(job_data: dict) -> dict:
    """Extract text from the PDF named in *job_data* and store it on its ref.

    Text is first pulled with pdftotext. If the stripped result is shorter
    than ``FALLBACK_THRESHOLD`` characters, the PDF is OCR-ed instead and the
    OCR output replaces it. The stored text is capped at 200,000 characters.

    Args:
        job_data: Must contain ``"ref_id"`` and ``"blob_path"``.

    Returns:
        A dict with the ``ref_id``, the number of stored characters
        (``chars``) and the extraction ``method`` used (``"pdftotext"`` or
        ``"tesseract"``).

    Raises:
        KeyError: if ``"ref_id"`` or ``"blob_path"`` is missing.
    """
    ref_id, blob_path = job_data["ref_id"], job_data["blob_path"]

    extracted = _pdftotext(blob_path).strip()
    if len(extracted) >= FALLBACK_THRESHOLD:
        method = "pdftotext"
    else:
        # Too little embedded text: treat the PDF as scanned and OCR it.
        method = "tesseract"
        extracted = _ocr_pdf(blob_path).strip()

    body_text = extracted[:200_000]
    char_count = len(body_text)
    extract_info = {"method": method, "chars": char_count}
    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": extract_info},
    )
    return {"ref_id": ref_id, "chars": char_count, "method": method}