diff --git a/workers/tests/test_video.py b/workers/tests/test_video.py
new file mode 100644
index 0000000..312a2ae
--- /dev/null
+++ b/workers/tests/test_video.py
@@ -0,0 +1,66 @@
+"""Integration tests for the ``ingest.video`` worker handler."""
+import subprocess
+from unittest.mock import patch
+from void_workers.handlers.video import handle as handle_video
+
+
+def _reset_void_schema(conn):
+    """Drop and recreate the ``public`` schema, then re-create extensions."""
+    conn.execute("DROP SCHEMA IF EXISTS public CASCADE")
+    conn.execute("CREATE SCHEMA public")
+    conn.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto")
+    conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
+
+
+def _run_node_migrations():
+    """Run the Node migration script (``up``) from the void-v2 checkout."""
+    subprocess.run(
+        ["node", "lib/db/migrate.js", "up"],
+        cwd="/project/src/void-v2",
+        check=True
+    )
+
+
+def test_video_creates_ref_with_transcript_and_metadata(conn):
+    """A successful run stores a ref with title, transcript and source kind."""
+    _reset_void_schema(conn)
+    _run_node_migrations()
+    sp = conn.execute(
+        "INSERT INTO spaces(slug, name) VALUES('plan4-vid', 'V') RETURNING id"
+    ).fetchone()[0]
+
+    info = {
+        "title": "Sample video",
+        "description": "a description",
+        "duration": 90,
+        "uploader": "Channel",
+        "thumbnail": "https://i.ytimg.com/t.jpg"
+    }
+    # Stub out yt-dlp and whisper so the test needs neither network nor model;
+    # os.unlink is patched so the handler's cleanup never touches real files.
+    with patch("void_workers.handlers.video._yt_dlp_info", return_value=info), \
+         patch("void_workers.handlers.video._yt_dlp_audio", return_value="/tmp/fake.opus"), \
+         patch("void_workers.handlers.video.whisper_transcribe", return_value="hello world transcript"), \
+         patch("os.unlink"):
+        out = handle_video({"space_id": str(sp), "url": "https://youtu.be/abc"})
+
+    assert "ref_id" in out
+    row = conn.execute(
+        "SELECT title, body_text, source_kind FROM refs WHERE id=%s",
+        (out["ref_id"],)
+    ).fetchone()
+    assert row[0] == "Sample video"
+    assert "hello world" in row[1]
+    assert row[2] == "youtube"
+
+
+def test_video_skipped_when_yt_dlp_fails(conn):
+    """When metadata extraction returns None the handler reports a skip."""
+    _reset_void_schema(conn)
+    _run_node_migrations()
+    sp = conn.execute(
+        "INSERT INTO spaces(slug, name) VALUES('plan4-vid2', 'V2') RETURNING id"
+    ).fetchone()[0]
+    with patch("void_workers.handlers.video._yt_dlp_info", return_value=None):
+        out = handle_video({"space_id": str(sp), "url": "https://youtu.be/gone"})
+    assert out.get("skipped") == "yt-dlp"
diff --git a/workers/void_workers/handlers/__init__.py b/workers/void_workers/handlers/__init__.py
index d4ff44c..b15bce9 100644
--- a/workers/void_workers/handlers/__init__.py
+++ b/workers/void_workers/handlers/__init__.py
@@ -1,7 +1,8 @@
-from . import echo, pdf, image
+from . import echo, pdf, image, video
 
 REGISTRY = {
     echo.NAME: echo.handle,
     pdf.NAME: pdf.handle,
     image.NAME: image.handle,
+    video.NAME: video.handle,
 }
diff --git a/workers/void_workers/handlers/video.py b/workers/void_workers/handlers/video.py
new file mode 100644
index 0000000..3a57a3b
--- /dev/null
+++ b/workers/void_workers/handlers/video.py
@@ -0,0 +1,151 @@
+"""Worker handler that ingests a video URL as a transcribed ref."""
+import hashlib
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from urllib.parse import urlsplit
+
+from .. import repo
+from ..model import whisper_transcribe
+
+NAME = "ingest.video"
+
+# Prefix of the scratch directories made by _yt_dlp_audio. Cleanup only ever
+# deletes a directory whose name carries this prefix.
+_TMP_PREFIX = "void-yt-"
+
+# Hosts (and their subdomains) whose refs get source_kind "youtube".
+_YOUTUBE_HOSTS = ("youtube.com", "youtu.be")
+
+
+def _yt_dlp_info(url):
+    """Return yt-dlp's metadata dict for *url*, or None if extraction failed.
+
+    None is returned when yt-dlp exits non-zero or prints something that is
+    not a JSON object. A timeout (60 s) still raises
+    ``subprocess.TimeoutExpired`` so the caller sees it as a job failure
+    rather than a skip.
+    """
+    try:
+        out = subprocess.check_output(
+            # "--" ends option parsing, so a URL that starts with "-" cannot
+            # be read as a yt-dlp flag.
+            ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", url],
+            timeout=60,
+        )
+        info = json.loads(out)
+    except (subprocess.CalledProcessError, json.JSONDecodeError):
+        return None
+    # The caller uses ``info.get(...)``, so anything but a dict is unusable.
+    return info if isinstance(info, dict) else None
+
+
+def _yt_dlp_audio(url):
+    """Download *url*'s audio as opus into a fresh temp dir; return the path.
+
+    The scratch directory is removed again if the download fails, so only a
+    successful call leaves files behind for the caller to clean up.
+
+    Raises:
+        subprocess.CalledProcessError: yt-dlp exited non-zero.
+        subprocess.TimeoutExpired: yt-dlp ran longer than 600 s.
+        RuntimeError: yt-dlp succeeded but wrote no ``audio.*`` file.
+    """
+    tmp_dir = tempfile.mkdtemp(prefix=_TMP_PREFIX)
+    out_template = os.path.join(tmp_dir, "audio.%(ext)s")
+    try:
+        subprocess.run(
+            ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
+             "--no-warnings", "--no-playlist", "--", url],
+            check=True, timeout=600,
+        )
+        # Sorted so the pick is deterministic if several files are present.
+        for name in sorted(os.listdir(tmp_dir)):
+            if name.startswith("audio."):
+                return os.path.join(tmp_dir, name)
+        raise RuntimeError("yt-dlp produced no audio file")
+    except Exception:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        raise
+
+
+def _discard_audio(audio_path):
+    """Best-effort removal of *audio_path* and its ``void-yt-`` scratch dir."""
+    try:
+        os.unlink(audio_path)
+    except OSError:
+        # Deliberately ignored: a leftover temp file must not fail the job.
+        pass
+    parent = os.path.dirname(audio_path)
+    # Guard on the prefix so an unexpected path can never delete a directory
+    # this module did not create.
+    if os.path.basename(parent).startswith(_TMP_PREFIX):
+        shutil.rmtree(parent, ignore_errors=True)
+
+
+def _idem(space_id, url):
+    """Return a hex SHA-256 over ``space_id``, a NUL separator, and ``url``."""
+    return hashlib.sha256(f"{space_id}\x00{url}".encode()).hexdigest()
+
+
+def _kind(url):
+    """Return "youtube" when the URL's host is a YouTube domain, else "video".
+
+    The host is parsed out of the URL, so "youtube.com" appearing in a path
+    or query string of another site does not count.
+    """
+    try:
+        parts = urlsplit(url)
+        if not parts.netloc:
+            # Scheme-less input such as "youtu.be/abc": parse it as a netloc.
+            parts = urlsplit("//" + url)
+        host = (parts.hostname or "").rstrip(".")
+    except ValueError:
+        # urlsplit rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
+        host = ""
+    is_youtube = any(
+        host == known or host.endswith("." + known) for known in _YOUTUBE_HOSTS
+    )
+    return "youtube" if is_youtube else "video"
+
+
+def handle(job_data: dict) -> dict:
+    """Ingest the video at ``job_data["url"]`` into ``job_data["space_id"]``.
+
+    Returns ``{"skipped": "yt-dlp"}`` when metadata extraction fails,
+    otherwise ``{"ref_id": ..., "chars": ...}`` for the created ref, where
+    ``chars`` is the full transcript length before truncation.
+    """
+    space_id = job_data["space_id"]
+    url = job_data["url"]
+
+    info = _yt_dlp_info(url)
+    if info is None:
+        return {"skipped": "yt-dlp"}
+
+    audio_path = _yt_dlp_audio(url)
+    try:
+        # Guard against a None result so the slicing/len below cannot fail.
+        transcript = whisper_transcribe(audio_path) or ""
+    finally:
+        _discard_audio(audio_path)
+
+    ref_id = repo.create_ref({
+        "space_id": space_id,
+        "kind": "video",
+        "source_url": url,
+        "title": info.get("title") or url,
+        "summary": (info.get("description") or "")[:5000],
+        "body_text": transcript[:200_000],
+        "source_kind": _kind(url),
+        "external_id": _idem(space_id, url),
+        "metadata": {
+            "duration_s": info.get("duration"),
+            "uploader": info.get("uploader"),
+            "thumbnail": info.get("thumbnail"),
+            "extract": {"method": "whisper", "chars": len(transcript)},
+        }
+    })
+    return {"ref_id": ref_id, "chars": len(transcript)}