feat(workers): extract.image via Tesseract
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from . import echo, pdf
|
||||
from . import echo, pdf, image
|
||||
|
||||
# Dispatch table: each handler module's NAME mapped to its handle callable.
REGISTRY = {
    module.NAME: module.handle
    for module in (echo, pdf, image)
}
|
||||
|
||||
18
workers/void_workers/handlers/image.py
Normal file
18
workers/void_workers/handlers/image.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
from .. import repo
|
||||
|
||||
NAME = "extract.image"
|
||||
|
||||
|
||||
def handle(job_data: dict) -> dict:
    """Run Tesseract OCR over an image blob and store the text on its ref.

    Args:
        job_data: Job payload. Must contain ``"ref_id"`` (forwarded to
            ``repo.update_ref``) and ``"blob_path"`` (the image location
            handed to ``PIL.Image.open``).

    Returns:
        A dict with the ``ref_id`` and ``chars``, the length of the text
        that was stored.

    Raises:
        KeyError: If ``"ref_id"`` or ``"blob_path"`` is missing from
            ``job_data``.
    """
    ref_id = job_data["ref_id"]
    blob_path = job_data["blob_path"]

    # Pillow loads lazily and may keep the underlying file open (e.g. for
    # multi-frame formats, or if OCR raises before the data is read). The
    # context manager closes it deterministically instead of relying on GC.
    with Image.open(blob_path) as img:
        text = pytesseract.image_to_string(img, lang="eng").strip()

    # Cap the stored text at 200,000 characters.
    body_text = text[:200_000]
    char_count = len(body_text)

    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": {"method": "tesseract", "chars": char_count}},
    )
    return {"ref_id": ref_id, "chars": char_count}
|
||||
Reference in New Issue
Block a user