feat(workers): extract.image via Tesseract

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-01 05:00:21 +10:00
parent 1f0e9a5f1b
commit f2035c1de6
3 changed files with 58 additions and 1 deletions

View File

@@ -0,0 +1,38 @@
import subprocess
from pathlib import Path
from void_workers.handlers.image import handle as handle_image
# Directory holding fixture files, resolved relative to this test module.
FIXTURES = Path(__file__).parent / "fixtures"
def _reset_void_schema(conn):
    """Drop and recreate the ``public`` schema, then re-enable extensions.

    Statements are issued one at a time, in order, through ``conn.execute``;
    the first failure propagates and stops the remaining statements.
    """
    reset_statements = (
        "DROP SCHEMA IF EXISTS public CASCADE",
        "CREATE SCHEMA public",
        "CREATE EXTENSION IF NOT EXISTS pgcrypto",
        "CREATE EXTENSION IF NOT EXISTS vector",
    )
    for statement in reset_statements:
        conn.execute(statement)
def _run_node_migrations():
    """Run the Node migration script with the ``up`` argument.

    The command is executed from the void-v2 project directory.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    migrate_cmd = ["node", "lib/db/migrate.js", "up"]
    subprocess.run(migrate_cmd, cwd="/project/src/void-v2", check=True)
def test_image_ocr(conn):
    """OCR the English fixture image and check the text is stored on the ref."""
    # Start from an empty schema with the migrations freshly applied.
    _reset_void_schema(conn)
    _run_node_migrations()

    space_row = conn.execute(
        "INSERT INTO spaces(slug, name) VALUES('plan4-img', 'I') RETURNING id"
    ).fetchone()
    space_id = space_row[0]

    image_path = str(FIXTURES / "eng_text.png")
    ref_row = conn.execute(
        "INSERT INTO refs(space_id, kind, blob_path) "
        "VALUES(%s, 'image', %s) RETURNING id",
        (space_id, image_path),
    ).fetchone()
    ref_id = ref_row[0]

    # NOTE(review): presumably the handler's own write is visible to `conn`
    # here (e.g. autocommit) — confirm against the fixture setup.
    result = handle_image({"ref_id": str(ref_id), "blob_path": image_path})
    assert result["chars"] > 0

    stored = conn.execute(
        "SELECT body_text FROM refs WHERE id=%s", (ref_id,)
    ).fetchone()
    body_text = stored[0] or ""
    assert "blackflame" in body_text.lower()

View File

@@ -1,6 +1,7 @@
from . import echo, pdf
from . import echo, pdf, image
# Maps each handler module's NAME to its handle() callable.
REGISTRY = {module.NAME: module.handle for module in (echo, pdf, image)}

View File

@@ -0,0 +1,18 @@
from PIL import Image
import pytesseract
from .. import repo
# Key under which this module's handle() is registered in the handlers REGISTRY.
NAME = "extract.image"
# Upper bound, in characters, on the OCR text stored for a single ref.
_MAX_BODY_CHARS = 200_000


def handle(job_data: dict) -> dict:
    """Run Tesseract OCR (English) over an image and store the text on its ref.

    Args:
        job_data: Job payload; must contain ``ref_id`` and ``blob_path``
            (the path of the image file to OCR).

    Returns:
        A dict with ``ref_id`` and ``chars``, the length of the stored text.

    Raises:
        KeyError: If ``ref_id`` or ``blob_path`` is missing from ``job_data``.
    """
    ref_id = job_data["ref_id"]
    blob_path = job_data["blob_path"]

    # Use the image as a context manager so its file handle is released as
    # soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(blob_path) as img:
        text = pytesseract.image_to_string(img, lang="eng").strip()

    body_text = text[:_MAX_BODY_CHARS]
    char_count = len(body_text)
    repo.update_ref(
        ref_id,
        body_text=body_text,
        metadata_patch={"extract": {"method": "tesseract", "chars": char_count}},
    )
    return {"ref_id": ref_id, "chars": char_count}