diff --git a/workers/tests/test_image.py b/workers/tests/test_image.py
new file mode 100644
index 0000000..52f2b08
--- /dev/null
+++ b/workers/tests/test_image.py
@@ -0,0 +1,42 @@
+import subprocess
+from pathlib import Path
+
+from void_workers.handlers.image import handle as handle_image
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _reset_void_schema(conn):
+    """Drop and recreate the ``public`` schema, then re-create extensions."""
+    conn.execute("DROP SCHEMA IF EXISTS public CASCADE")
+    conn.execute("CREATE SCHEMA public")
+    conn.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto")
+    conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
+
+
+def _run_node_migrations():
+    """Run ``node lib/db/migrate.js up``; a non-zero exit raises."""
+    subprocess.run(
+        ["node", "lib/db/migrate.js", "up"],
+        cwd="/project/src/void-v2",
+        check=True,
+    )
+
+
+def test_image_ocr(conn):
+    """OCR a fixture image and check the text lands in ``refs.body_text``."""
+    _reset_void_schema(conn)
+    _run_node_migrations()
+    sp = conn.execute(
+        "INSERT INTO spaces(slug, name) VALUES('plan4-img', 'I') RETURNING id"
+    ).fetchone()[0]
+    blob = FIXTURES / "eng_text.png"
+    ref = conn.execute(
+        "INSERT INTO refs(space_id, kind, blob_path) "
+        "VALUES(%s, 'image', %s) RETURNING id",
+        (sp, str(blob)),
+    ).fetchone()[0]
+    out = handle_image({"ref_id": str(ref), "blob_path": str(blob)})
+    assert out["chars"] > 0
+    row = conn.execute("SELECT body_text FROM refs WHERE id=%s", (ref,)).fetchone()
+    assert "blackflame" in (row[0] or "").lower()
diff --git a/workers/void_workers/handlers/__init__.py b/workers/void_workers/handlers/__init__.py
index 258277d..d4ff44c 100644
--- a/workers/void_workers/handlers/__init__.py
+++ b/workers/void_workers/handlers/__init__.py
@@ -1,6 +1,7 @@
-from . import echo, pdf
+from . import echo, pdf, image
 
 REGISTRY = {
     echo.NAME: echo.handle,
     pdf.NAME: pdf.handle,
+    image.NAME: image.handle,
 }
diff --git a/workers/void_workers/handlers/image.py b/workers/void_workers/handlers/image.py
new file mode 100644
index 0000000..fb77a2f
--- /dev/null
+++ b/workers/void_workers/handlers/image.py
@@ -0,0 +1,37 @@
+from PIL import Image
+import pytesseract
+from .. import repo
+
+NAME = "extract.image"
+
+# Upper bound, in characters, on the OCR text stored per ref.
+MAX_BODY_CHARS = 200_000
+
+
+def handle(job_data: dict) -> dict:
+    """Run Tesseract OCR on an image blob and store the text on its ref.
+
+    Args:
+        job_data: Job payload; must contain ``"ref_id"`` and ``"blob_path"``.
+
+    Returns:
+        A dict with the ``ref_id`` and ``chars``, the length of the stripped,
+        truncated text that was stored.
+
+    Raises:
+        KeyError: If ``"ref_id"`` or ``"blob_path"`` is missing.
+    """
+    ref_id = job_data["ref_id"]
+    blob_path = job_data["blob_path"]
+    # Open the image as a context manager so its underlying file is released
+    # deterministically, including when OCR raises.
+    with Image.open(blob_path) as img:
+        text = pytesseract.image_to_string(img, lang="eng").strip()
+    body_text = text[:MAX_BODY_CHARS]
+    char_count = len(body_text)
+    repo.update_ref(
+        ref_id,
+        body_text=body_text,
+        metadata_patch={"extract": {"method": "tesseract", "chars": char_count}},
+    )
+    return {"ref_id": ref_id, "chars": char_count}