feat(workers): ingest.video via yt-dlp + Whisper

yt-dlp pulls metadata (title, description, uploader, thumbnail) and extracts the best audio stream as opus. faster-whisper transcribes it, and the audio file is removed afterwards. Creates a refs row with kind='video' and source_kind='youtube' for YouTube URLs, or the generic source_kind='video' otherwise. Idempotent on sha256(space_id + NUL + url), stored in refs.external_id.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-01 10:07:33 +10:00
parent e64f1345f6
commit 1ba7aae439
3 changed files with 142 additions and 1 deletions
--- a/workers/tests/test_video.py
+++ b/workers/tests/test_video.py
@@ -0,0 +1,59 @@
+import subprocess
+from unittest.mock import patch
+from void_workers.handlers.video import handle as handle_video
+
+
+def _reset_void_schema(conn):
+    """Drop and recreate the ``public`` schema, then ensure the extensions.
+
+    Executes four statements on ``conn`` in order: drop the schema with
+    CASCADE, recreate it, then create ``pgcrypto`` and ``vector`` if absent.
+    """
+    statements = (
+        "DROP SCHEMA IF EXISTS public CASCADE",
+        "CREATE SCHEMA public",
+        "CREATE EXTENSION IF NOT EXISTS pgcrypto",
+        "CREATE EXTENSION IF NOT EXISTS vector",
+    )
+    for sql in statements:
+        conn.execute(sql)
+
+
+def _run_node_migrations():
+    """Run ``node lib/db/migrate.js up`` from the void-v2 checkout.
+
+    ``check=True`` makes a non-zero exit raise ``CalledProcessError``, so a
+    failed migration aborts the calling test.
+    """
+    command = ["node", "lib/db/migrate.js", "up"]
+    subprocess.run(command, cwd="/project/src/void-v2", check=True)
+
+
+def test_video_creates_ref_with_transcript_and_metadata(conn):
+    """Happy path: with yt-dlp and Whisper patched, a ``youtube`` ref is stored."""
+    _reset_void_schema(conn)
+    _run_node_migrations()
+    inserted = conn.execute(
+        "INSERT INTO spaces(slug, name) VALUES('plan4-vid', 'V') RETURNING id"
+    ).fetchone()
+    job = {"space_id": str(inserted[0]), "url": "https://youtu.be/abc"}
+
+    # Canned metadata standing in for what _yt_dlp_info would return.
+    fake_info = dict(
+        title="Sample video",
+        description="a description",
+        duration=90,
+        uploader="Channel",
+        thumbnail="https://i.ytimg.com/t.jpg",
+    )
+    # os.unlink is patched because the fake audio path does not exist on disk.
+    with patch("void_workers.handlers.video._yt_dlp_info", return_value=fake_info), \
+         patch("void_workers.handlers.video._yt_dlp_audio", return_value="/tmp/fake.opus"), \
+         patch("void_workers.handlers.video.whisper_transcribe", return_value="hello world transcript"), \
+         patch("os.unlink"):
+        out = handle_video(job)
+
+    assert "ref_id" in out
+    title, body_text, source_kind = conn.execute(
+        "SELECT title, body_text, source_kind FROM refs WHERE id=%s",
+        (out["ref_id"],)
+    ).fetchone()
+    assert title == "Sample video"
+    assert "hello world" in body_text
+    assert source_kind == "youtube"
+
+
+def test_video_skipped_when_yt_dlp_fails(conn):
+    """When metadata extraction yields ``None`` the handler reports a skip."""
+    _reset_void_schema(conn)
+    _run_node_migrations()
+    inserted = conn.execute(
+        "INSERT INTO spaces(slug, name) VALUES('plan4-vid2', 'V2') RETURNING id"
+    ).fetchone()
+    job = {"space_id": str(inserted[0]), "url": "https://youtu.be/gone"}
+    with patch("void_workers.handlers.video._yt_dlp_info", return_value=None):
+        result = handle_video(job)
+    assert result.get("skipped") == "yt-dlp"
--- a/workers/void_workers/handlers/__init__.py
+++ b/workers/void_workers/handlers/__init__.py
@@ -1,7 +1,8 @@
-from . import echo, pdf, image
+from . import echo, pdf, image, video

+# Maps each handler module's NAME to that module's handle callable.
 REGISTRY = {
    echo.NAME: echo.handle,
    pdf.NAME: pdf.handle,
    image.NAME: image.handle,
+    video.NAME: video.handle,
 }
--- a/workers/void_workers/handlers/video.py
+++ b/workers/void_workers/handlers/video.py
@@ -0,0 +1,81 @@
+import hashlib
+import json
+import os
+import subprocess
+import tempfile
+from .. import repo
+from ..model import whisper_transcribe
+
+NAME = "ingest.video"
+
+
+def _yt_dlp_info(url):
+    """Fetch metadata for ``url`` by running ``yt-dlp -J``.
+
+    Args:
+        url: Media URL taken from the job payload; treated as untrusted.
+
+    Returns:
+        The parsed JSON document printed by yt-dlp, or ``None`` when yt-dlp
+        exits with a non-zero status.
+
+    Raises:
+        subprocess.TimeoutExpired: yt-dlp ran longer than 60 seconds.
+        json.JSONDecodeError: yt-dlp's stdout was not valid JSON.
+    """
+    # "--" ends option parsing, so a URL that begins with "-" is always taken
+    # as the positional argument and can never be read as a yt-dlp flag.
+    cmd = ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", url]
+    try:
+        out = subprocess.check_output(cmd, timeout=60)
+    except subprocess.CalledProcessError:
+        return None
+    return json.loads(out)
+
+
+def _yt_dlp_audio(url):
+    """Download the audio track of ``url`` as opus and return the file path.
+
+    The file is written into a fresh ``void-yt-*`` temporary directory. On
+    success the caller owns the returned file and its directory; on any
+    failure the directory is removed before the exception propagates.
+
+    Args:
+        url: Media URL taken from the job payload; treated as untrusted.
+
+    Returns:
+        Path of the downloaded audio file inside the temporary directory.
+
+    Raises:
+        subprocess.CalledProcessError: yt-dlp exited with a non-zero status.
+        subprocess.TimeoutExpired: the download took longer than 600 seconds.
+        RuntimeError: yt-dlp succeeded but left no ``audio.*`` file behind.
+    """
+    import shutil
+
+    tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
+    out_template = os.path.join(tmp_dir, "audio.%(ext)s")
+    try:
+        # "--" ends option parsing, so a URL that begins with "-" can never
+        # be interpreted as a yt-dlp flag.
+        subprocess.run(
+            ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
+             "--no-warnings", "--no-playlist", "--", url],
+            check=True, timeout=600
+        )
+        candidates = sorted(
+            f for f in os.listdir(tmp_dir) if f.startswith("audio.")
+        )
+        if not candidates:
+            raise RuntimeError("yt-dlp produced no audio file")
+        # Prefer the converted file over any leftover intermediate download;
+        # sorting makes the fallback choice deterministic.
+        chosen = "audio.opus" if "audio.opus" in candidates else candidates[0]
+        return os.path.join(tmp_dir, chosen)
+    except BaseException:
+        # Do not leak the temp directory (or partial downloads) on failure.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        raise
+
+
+def _idem(space_id, url):
+    """Return the hex SHA-256 of ``space_id`` and ``url`` joined by a NUL byte."""
+    key = "\x00".join((space_id, url))
+    digest = hashlib.sha256(key.encode())
+    return digest.hexdigest()
+
+
+def _kind(url):
+    """Classify ``url`` as ``"youtube"`` or generic ``"video"`` by hostname.
+
+    Only the host part is inspected, so a URL that merely mentions
+    ``youtube.com`` in its path or query, or a lookalike host such as
+    ``notyoutube.com``, is not treated as YouTube.
+
+    Args:
+        url: Media URL, with or without a scheme.
+
+    Returns:
+        ``"youtube"`` for ``youtube.com``, ``youtu.be`` and their subdomains;
+        ``"video"`` for everything else, including unparseable URLs.
+    """
+    from urllib.parse import urlsplit
+
+    try:
+        # Scheme-less input ("youtu.be/abc") only yields a hostname when it
+        # is parsed as a network-path reference.
+        host = urlsplit(url if "://" in url else "//" + url).hostname or ""
+    except ValueError:
+        host = ""
+    host = host.rstrip(".")
+    for domain in ("youtube.com", "youtu.be"):
+        if host == domain or host.endswith("." + domain):
+            return "youtube"
+    return "video"
+
+
+def handle(job_data: dict) -> dict:
+    """Ingest one video URL: fetch metadata, transcribe audio, store a ref.
+
+    Args:
+        job_data: Job payload; must contain ``space_id`` and ``url``.
+
+    Returns:
+        ``{"skipped": "yt-dlp"}`` when metadata extraction returns ``None``;
+        otherwise ``{"ref_id": ..., "chars": ...}`` where ``chars`` is the
+        length of the full (untruncated) transcript.
+
+    Raises:
+        KeyError: ``space_id`` or ``url`` is missing from ``job_data``.
+    """
+    import shutil
+
+    space_id = job_data["space_id"]
+    url = job_data["url"]
+    idem = _idem(space_id, url)
+
+    info = _yt_dlp_info(url)
+    if info is None:
+        return {"skipped": "yt-dlp"}
+
+    audio_path = _yt_dlp_audio(url)
+    try:
+        transcript = whisper_transcribe(audio_path)
+    finally:
+        # Best-effort cleanup: a missing file must not mask a transcription
+        # error or fail an otherwise successful job.
+        try:
+            os.unlink(audio_path)
+        except OSError:
+            pass
+        # _yt_dlp_audio downloads into its own mkdtemp "void-yt-*" directory;
+        # remove that as well so one directory is not leaked per job. The
+        # prefix guard ensures an unrelated parent directory is never touched.
+        audio_dir = os.path.dirname(audio_path)
+        if os.path.basename(audio_dir).startswith("void-yt-"):
+            shutil.rmtree(audio_dir, ignore_errors=True)
+
+    ref_id = repo.create_ref({
+        "space_id": space_id,
+        "kind": "video",
+        "source_url": url,
+        "title": info.get("title") or url,
+        # Description and transcript are truncated before storage.
+        "summary": (info.get("description") or "")[:5000],
+        "body_text": transcript[:200_000],
+        "source_kind": _kind(url),
+        "external_id": idem,
+        "metadata": {
+            "duration_s": info.get("duration"),
+            "uploader": info.get("uploader"),
+            "thumbnail": info.get("thumbnail"),
+            "extract": {"method": "whisper", "chars": len(transcript)},
+        }
+    })
+    return {"ref_id": ref_id, "chars": len(transcript)}