from PIL import Image
import pytesseract

from .. import repo

# NOTE(review): presumably the job-type name this handler is registered
# under — confirm against the job dispatcher.
NAME = "extract.image"

# Upper bound on the number of characters of OCR output that get stored.
_MAX_BODY_CHARS = 200_000


def handle(job_data: dict) -> dict:
    """OCR the image at ``job_data["blob_path"]`` and store the text on a ref.

    The image is run through Tesseract (English language data), the result
    is stripped of surrounding whitespace and truncated to
    ``_MAX_BODY_CHARS`` characters, and then passed to ``repo.update_ref``
    together with a small ``extract`` metadata patch.

    Args:
        job_data: Job payload; must contain the keys ``"ref_id"`` and
            ``"blob_path"``.

    Returns:
        A dict with ``"ref_id"`` (echoed from the payload) and ``"chars"``
        (length of the stored, truncated text).

    Raises:
        KeyError: If ``"ref_id"`` or ``"blob_path"`` is missing.
    """
    ref_id = job_data["ref_id"]
    blob_path = job_data["blob_path"]

    # Pillow opens files lazily and keeps the handle open for multi-frame
    # images (and on error paths); the context manager guarantees the file
    # is closed once OCR is done, even if Tesseract raises.
    with Image.open(blob_path) as image:
        text = pytesseract.image_to_string(image, lang="eng").strip()

    body_text = text[:_MAX_BODY_CHARS]
    char_count = len(body_text)

    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": {"method": "tesseract", "chars": char_count}},
    )
    return {"ref_id": ref_id, "chars": char_count}