19 lines
518 B
Python
19 lines
518 B
Python
from PIL import Image
|
|
import pytesseract
|
|
from .. import repo
|
|
|
|
# Identifier string for this handler module; presumably the job type used to
# route work to `handle` -- TODO confirm against the dispatcher.
NAME = "extract.image"
|
|
|
|
|
|
def handle(job_data: dict) -> dict:
    """Run Tesseract OCR on an image blob and store the text on its ref.

    Args:
        job_data: Job payload. Must contain ``"ref_id"`` (forwarded to
            ``repo.update_ref``) and ``"blob_path"`` (the image file to open).

    Returns:
        A dict with ``"ref_id"`` and ``"chars"``, where ``chars`` is the
        length of the text that was stored.

    Raises:
        KeyError: If ``"ref_id"`` or ``"blob_path"`` is missing from
            ``job_data``.

    Errors raised while opening the image, running OCR, or updating the ref
    are not caught here and propagate to the caller.
    """
    ref_id = job_data["ref_id"]
    blob_path = job_data["blob_path"]

    # Pillow opens files lazily and keeps the handle open; the context manager
    # releases it as soon as OCR is done instead of waiting for GC.
    with Image.open(blob_path) as image:
        text = pytesseract.image_to_string(image, lang="eng").strip()

    # Cap the stored text at 200,000 characters.
    body_text = text[:200_000]
    chars = len(body_text)

    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": {"method": "tesseract", "chars": chars}},
    )
    return {"ref_id": ref_id, "chars": chars}
|