feat(workers): ingest.video via yt-dlp + Whisper
yt-dlp pulls metadata (title, description, uploader, thumbnail) and bestaudio (opus). faster-whisper transcribes; the audio file is removed afterwards. Creates a refs row with kind='video' and source_kind='youtube' for YouTube URLs, generic 'video' otherwise. Idempotent on sha256(space_id + NUL + url) via refs.external_id. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
59
workers/tests/test_video.py
Normal file
59
workers/tests/test_video.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
import subprocess
|
||||||
|
from unittest.mock import patch
|
||||||
|
from void_workers.handlers.video import handle as handle_video
|
||||||
|
|
||||||
|
|
||||||
|
def _reset_void_schema(conn):
    """Rebuild an empty ``public`` schema and enable the needed extensions.

    Executes, in order, on ``conn``: drop ``public`` with CASCADE, recreate
    it, then ``CREATE EXTENSION IF NOT EXISTS`` for ``pgcrypto`` and
    ``vector``.
    """
    statements = (
        "DROP SCHEMA IF EXISTS public CASCADE",
        "CREATE SCHEMA public",
        "CREATE EXTENSION IF NOT EXISTS pgcrypto",
        "CREATE EXTENSION IF NOT EXISTS vector",
    )
    for sql in statements:
        conn.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_node_migrations():
    """Apply the schema migrations by running ``node lib/db/migrate.js up``.

    The command is executed from ``/project/src/void-v2``. Because
    ``check=True`` is passed, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    command = ["node", "lib/db/migrate.js", "up"]
    project_dir = "/project/src/void-v2"
    subprocess.run(command, cwd=project_dir, check=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_video_creates_ref_with_transcript_and_metadata(conn):
    """A successful ingest stores title, transcript and source kind on the ref.

    The metadata lookup, the audio download, the Whisper call and
    ``os.unlink`` are all patched, so the handler runs against canned values.
    """
    _reset_void_schema(conn)
    _run_node_migrations()
    space_row = conn.execute(
        "INSERT INTO spaces(slug, name) VALUES('plan4-vid', 'V') RETURNING id"
    ).fetchone()
    sp = space_row[0]

    info = {
        "title": "Sample video",
        "description": "a description",
        "duration": 90,
        "uploader": "Channel",
        "thumbnail": "https://i.ytimg.com/t.jpg",
    }
    job = {"space_id": str(sp), "url": "https://youtu.be/abc"}
    # Nested ``with`` blocks enter and exit the patches in the listed order.
    with patch("void_workers.handlers.video._yt_dlp_info", return_value=info):
        with patch("void_workers.handlers.video._yt_dlp_audio", return_value="/tmp/fake.opus"):
            with patch("void_workers.handlers.video.whisper_transcribe", return_value="hello world transcript"):
                with patch("os.unlink"):
                    out = handle_video(job)

    assert "ref_id" in out
    title, body_text, source_kind = conn.execute(
        "SELECT title, body_text, source_kind FROM refs WHERE id=%s",
        (out["ref_id"],),
    ).fetchone()
    assert title == "Sample video"
    assert "hello world" in body_text
    assert source_kind == "youtube"
|
||||||
|
|
||||||
|
|
||||||
|
def test_video_skipped_when_yt_dlp_fails(conn):
    """When the metadata lookup yields ``None`` the handler reports a skip."""
    _reset_void_schema(conn)
    _run_node_migrations()
    insert_space = "INSERT INTO spaces(slug, name) VALUES('plan4-vid2', 'V2') RETURNING id"
    sp = conn.execute(insert_space).fetchone()[0]
    job = {"space_id": str(sp), "url": "https://youtu.be/gone"}
    with patch("void_workers.handlers.video._yt_dlp_info", return_value=None):
        out = handle_video(job)
    assert out.get("skipped") == "yt-dlp"
|
||||||
@@ -1,7 +1,8 @@
|
|||||||
from . import echo, pdf, image
|
from . import echo, pdf, image, video
|
||||||
|
|
||||||
REGISTRY = {
|
REGISTRY = {
|
||||||
echo.NAME: echo.handle,
|
echo.NAME: echo.handle,
|
||||||
pdf.NAME: pdf.handle,
|
pdf.NAME: pdf.handle,
|
||||||
image.NAME: image.handle,
|
image.NAME: image.handle,
|
||||||
|
video.NAME: video.handle,
|
||||||
}
|
}
|
||||||
|
|||||||
81
workers/void_workers/handlers/video.py
Normal file
81
workers/void_workers/handlers/video.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from .. import repo
|
||||||
|
from ..model import whisper_transcribe
|
||||||
|
|
||||||
|
NAME = "ingest.video"
|
||||||
|
|
||||||
|
|
||||||
|
def _yt_dlp_info(url):
    """Return yt-dlp's metadata for ``url`` as a dict, or ``None`` on failure.

    Runs ``yt-dlp -J --no-warnings --no-playlist`` with a 60 second timeout
    and parses its standard output as JSON.

    ``None`` is returned when yt-dlp exits non-zero, when it exceeds the
    timeout, when its output is not valid JSON, or when the JSON document is
    not an object. Other errors (for example a missing ``yt-dlp`` executable)
    still propagate.
    """
    # "--" ends option parsing so a URL beginning with "-" cannot be read as
    # a yt-dlp flag.
    argv = ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", url]
    try:
        raw = subprocess.check_output(argv, timeout=60)
        info = json.loads(raw)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, ValueError):
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        return None
    # Callers use ``info.get(...)``, so anything but a dict counts as a failure.
    return info if isinstance(info, dict) else None
|
||||||
|
|
||||||
|
|
||||||
|
def _yt_dlp_audio(url):
    """Download the audio track of ``url`` and return the path to the file.

    A fresh ``void-yt-*`` temporary directory is created and yt-dlp is asked
    to extract audio as opus into ``audio.<ext>`` inside it (600 second
    timeout). On success the caller owns the returned file and its directory
    and is responsible for removing them.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited with a non-zero status.
        subprocess.TimeoutExpired: the download exceeded the timeout.
        RuntimeError: yt-dlp succeeded but left no ``audio.*`` file.

    On any of these failures the temporary directory is removed before the
    exception propagates, so failed jobs do not leave directories behind.
    """
    import shutil

    tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
    out_template = os.path.join(tmp_dir, "audio.%(ext)s")
    try:
        subprocess.run(
            # "--" ends option parsing so a URL beginning with "-" cannot be
            # read as a yt-dlp flag.
            ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
             "--no-warnings", "--no-playlist", "--", url],
            check=True, timeout=600
        )
        # Sorted so the choice is deterministic if several files match.
        for name in sorted(os.listdir(tmp_dir)):
            if name.startswith("audio."):
                return os.path.join(tmp_dir, name)
        raise RuntimeError("yt-dlp produced no audio file")
    except Exception:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def _idem(space_id, url):
    """Return the hex SHA-256 of ``space_id``, a NUL byte and ``url`` joined.

    The NUL separator keeps different ``(space_id, url)`` pairs from
    concatenating to the same string.
    """
    key = "\x00".join((space_id, url))
    digest = hashlib.sha256(key.encode())
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _kind(url):
    """Classify ``url`` as ``"youtube"`` or generic ``"video"``.

    The decision is made on the URL's hostname: ``youtube.com``,
    ``youtu.be`` and any of their subdomains count as YouTube. Comparing the
    parsed host (rather than searching the whole string) keeps URLs that only
    mention "youtube.com" in a path or query, or in a look-alike host such as
    ``notyoutube.com``, from being labelled as YouTube.
    """
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(url)
        if not parts.netloc:
            # Scheme-less input such as "youtube.com/watch?v=..." has no
            # netloc; re-parse it as a network-path reference.
            parts = urlsplit("//" + url)
        host = parts.hostname or ""
    except ValueError:
        # urlsplit rejects some malformed hosts (e.g. bad IPv6 brackets).
        host = ""
    host = host.rstrip(".")
    for domain in ("youtube.com", "youtu.be"):
        if host == domain or host.endswith("." + domain):
            return "youtube"
    return "video"
|
||||||
|
|
||||||
|
|
||||||
|
def handle(job_data: dict) -> dict:
    """Ingest one video URL: fetch metadata, transcribe audio, store a ref.

    Steps: look up metadata with ``_yt_dlp_info``; download the audio with
    ``_yt_dlp_audio``; transcribe it with ``whisper_transcribe``; discard the
    downloaded audio; create the row through ``repo.create_ref``.

    Args:
        job_data: must contain ``"space_id"`` and ``"url"``.

    Returns:
        ``{"skipped": "yt-dlp"}`` when no metadata could be obtained,
        otherwise ``{"ref_id": <value from repo.create_ref>, "chars": <full
        transcript length>}``.

    Raises:
        KeyError: ``job_data`` lacks ``"space_id"`` or ``"url"``.
        Exceptions from the download, transcription or ``repo.create_ref``
        are not caught here and propagate to the caller.
    """
    space_id = job_data["space_id"]
    url = job_data["url"]
    idem = _idem(space_id, url)

    info = _yt_dlp_info(url)
    if info is None:
        return {"skipped": "yt-dlp"}

    audio_path = _yt_dlp_audio(url)
    try:
        transcript = whisper_transcribe(audio_path)
    finally:
        # Runs whether or not transcription succeeded.
        _discard_audio(audio_path)

    ref_id = repo.create_ref({
        "space_id": space_id,
        "kind": "video",
        "source_url": url,
        "title": info.get("title") or url,
        # Stored text is capped; "chars" below reports the uncapped length.
        "summary": (info.get("description") or "")[:5000],
        "body_text": transcript[:200_000],
        "source_kind": _kind(url),
        "external_id": idem,
        "metadata": {
            "duration_s": info.get("duration"),
            "uploader": info.get("uploader"),
            "thumbnail": info.get("thumbnail"),
            "extract": {"method": "whisper", "chars": len(transcript)},
        }
    })
    return {"ref_id": ref_id, "chars": len(transcript)}


def _discard_audio(audio_path):
    """Best-effort removal of a downloaded audio file and its scratch dir."""
    import shutil

    try:
        os.unlink(audio_path)
    except OSError:
        pass
    # _yt_dlp_audio places the file in a mkdtemp(prefix="void-yt-") directory;
    # remove that directory too so it does not accumulate per job. The prefix
    # check keeps any other parent directory untouched.
    parent = os.path.dirname(audio_path)
    if os.path.basename(parent).startswith("void-yt-"):
        shutil.rmtree(parent, ignore_errors=True)
|
||||||
Reference in New Issue
Block a user