feat(workers): extract.image via Tesseract

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-01 05:00:21 +10:00
parent 1f0e9a5f1b
commit f2035c1de6
3 changed files with 58 additions and 1 deletions

View File

@@ -0,0 +1,18 @@
from PIL import Image
import pytesseract
from .. import repo
# Identifier string for this handler module.
# NOTE(review): presumably the job name a worker registry uses to route jobs
# to handle() — the consumer is not visible here; confirm.
NAME: str = "extract.image"
def handle(job_data: dict) -> dict:
    """Run Tesseract OCR on an image blob and store the text on its ref.

    Args:
        job_data: Job payload. Must contain ``"ref_id"`` (passed through to
            ``repo.update_ref``) and ``"blob_path"`` (path of the image file
            to open).

    Returns:
        A dict ``{"ref_id": ..., "chars": ...}`` where ``chars`` is the
        length of the text that was stored (after stripping and truncation).

    Raises:
        KeyError: If ``"ref_id"`` or ``"blob_path"`` is missing from
            ``job_data``.
    """
    ref_id = job_data["ref_id"]
    blob_path = job_data["blob_path"]

    # Image.open() holds the underlying file open; use it as a context
    # manager so the handle is released as soon as OCR finishes instead of
    # whenever the image object happens to be garbage-collected.
    with Image.open(blob_path) as image:
        text = pytesseract.image_to_string(image, lang="eng").strip()

    # Cap the stored text at 200,000 characters.
    body_text = text[:200_000]

    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": {"method": "tesseract", "chars": len(body_text)}},
    )
    return {"ref_id": ref_id, "chars": len(body_text)}