Files
Void-Homelab/workers/void_workers/handlers/video.py
root 1ba7aae439 feat(workers): ingest.video via yt-dlp + Whisper
yt-dlp pulls metadata (title, description, uploader, thumbnail) and
bestaudio (opus). faster-whisper transcribes; audio file removed after.
Creates a refs row with kind='video' and source_kind='youtube' for
YouTube URLs, generic 'video' otherwise. Idempotency key is
sha256(space_id + "\x00" + url), stored in refs.external_id.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-01 10:07:33 +10:00

82 lines
2.2 KiB
Python

import hashlib
import json
import os
import subprocess
import tempfile
from .. import repo
from ..model import whisper_transcribe
# Identifier of this handler ("ingest.video").
# NOTE(review): presumably the key the worker dispatcher uses to route jobs to
# handle() — confirm against the handler registry.
NAME = "ingest.video"
def _yt_dlp_info(url):
"""Returns dict of metadata, or None if yt-dlp could not extract."""
try:
out = subprocess.check_output(
["yt-dlp", "-J", "--no-warnings", "--no-playlist", url],
timeout=60
)
return json.loads(out)
except subprocess.CalledProcessError:
return None
def _yt_dlp_audio(url):
    """Download the audio track of *url* as Opus and return the file path.

    yt-dlp writes into a freshly created temporary directory (prefix
    ``void-yt-``). On success the caller owns the returned file and its parent
    directory and is responsible for deleting them. On any failure the
    directory is removed here before the exception propagates.

    Args:
        url: Address of the video to download.

    Returns:
        Path of the ``audio.*`` file yt-dlp produced.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited with a non-zero status.
        subprocess.TimeoutExpired: the download took longer than 600 seconds.
        RuntimeError: yt-dlp succeeded but left no ``audio.*`` file behind.
    """
    import shutil  # local import: only needed for failure cleanup

    tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
    out_template = os.path.join(tmp_dir, "audio.%(ext)s")
    # "--" ends option parsing so a URL starting with "-" cannot be
    # interpreted as a yt-dlp flag.
    command = [
        "yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
        "--no-warnings", "--no-playlist", "--", url,
    ]
    try:
        subprocess.run(command, check=True, timeout=600)
        # Sorted so the choice is deterministic if several audio.* files
        # exist; "audio.opus" sorts ahead of longer names such as
        # "audio.opus.part".
        for name in sorted(os.listdir(tmp_dir)):
            if name.startswith("audio."):
                return os.path.join(tmp_dir, name)
        raise RuntimeError("yt-dlp produced no audio file")
    except BaseException:
        # Nothing is handed back to the caller on failure, so the temp
        # directory (and any partial download) would otherwise leak.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
def _idem(space_id, url):
return hashlib.sha256((space_id + "\x00" + url).encode()).hexdigest()
def _kind(url):
return "youtube" if ("youtube.com" in url or "youtu.be" in url) else "video"
def handle(job_data: dict) -> dict:
    """Ingest one video: fetch metadata, transcribe its audio, store a ref.

    Steps: read ``space_id`` and ``url`` from the job, fetch metadata with
    ``_yt_dlp_info``, download the audio with ``_yt_dlp_audio``, transcribe it
    with ``whisper_transcribe``, then create a ref through ``repo.create_ref``.

    NOTE(review): the idempotency key is only passed along as ``external_id``;
    nothing here checks for an existing ref before the download and
    transcription run. De-duplication presumably happens inside
    ``repo.create_ref`` — confirm.

    Args:
        job_data: Job payload; must contain ``"space_id"`` and ``"url"``.

    Returns:
        ``{"skipped": "yt-dlp"}`` when metadata extraction fails, otherwise
        ``{"ref_id": ..., "chars": ...}`` where ``chars`` is the full
        transcript length (before truncation for storage).

    Raises:
        KeyError: ``job_data`` lacks ``"space_id"`` or ``"url"``.
        Exception: anything raised by the download, transcription or repo
            call propagates unchanged.
    """
    space_id = job_data["space_id"]
    url = job_data["url"]
    idem = _idem(space_id, url)

    info = _yt_dlp_info(url)
    if info is None:
        return {"skipped": "yt-dlp"}

    audio_path = _yt_dlp_audio(url)
    try:
        transcript = whisper_transcribe(audio_path)
    finally:
        # Best-effort cleanup; a failure here must not mask the transcript
        # result or the original exception.
        try:
            os.unlink(audio_path)
        except OSError:
            pass
        # The audio lives in its own mkdtemp directory; remove that too so
        # temp directories do not accumulate per job. rmdir only succeeds on
        # an empty directory, so it cannot delete unrelated files.
        try:
            os.rmdir(os.path.dirname(audio_path))
        except OSError:
            pass

    ref_id = repo.create_ref({
        "space_id": space_id,
        "kind": "video",
        "source_url": url,
        "title": info.get("title") or url,
        # Stored text fields are capped; the uncapped transcript length is
        # kept in metadata.extract.chars.
        "summary": (info.get("description") or "")[:5000],
        "body_text": transcript[:200_000],
        "source_kind": _kind(url),
        "external_id": idem,
        "metadata": {
            "duration_s": info.get("duration"),
            "uploader": info.get("uploader"),
            "thumbnail": info.get("thumbnail"),
            "extract": {"method": "whisper", "chars": len(transcript)},
        },
    })
    return {"ref_id": ref_id, "chars": len(transcript)}