"""Job handler that ingests a video URL: yt-dlp metadata + audio, then a transcript ref."""

import hashlib
import json
import os
import shutil
import subprocess
import tempfile

from .. import repo
from ..model import whisper_transcribe

NAME = "ingest.video"

# Prefix of the scratch directories created by _yt_dlp_audio. Cleanup only ever
# removes directories whose name carries this prefix.
_TMP_PREFIX = "void-yt-"


def _yt_dlp_info(url):
    """Return yt-dlp's JSON metadata for *url*, or None if yt-dlp exits non-zero.

    A timeout (60 s) is not swallowed: subprocess.TimeoutExpired propagates
    to the caller, as does any error from parsing the output as JSON.
    """
    try:
        out = subprocess.check_output(
            # "--" ends option parsing so a URL beginning with "-" cannot be
            # interpreted as a yt-dlp command-line option.
            ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", url],
            timeout=60,
        )
    except subprocess.CalledProcessError:
        return None
    return json.loads(out)


def _yt_dlp_audio(url):
    """Download the audio track of *url* into a fresh temp directory.

    yt-dlp is asked to extract audio in opus format. Returns the path of the
    first file in the directory whose name starts with "audio."; the caller
    owns that file and its parent directory and must remove them.

    Raises:
        subprocess.CalledProcessError: yt-dlp exited non-zero.
        subprocess.TimeoutExpired: yt-dlp ran longer than 600 s.
        RuntimeError: yt-dlp succeeded but left no "audio.*" file.

    On any of these failures the temp directory is removed before the
    exception propagates.
    """
    tmp_dir = tempfile.mkdtemp(prefix=_TMP_PREFIX)
    try:
        out_template = os.path.join(tmp_dir, "audio.%(ext)s")
        subprocess.run(
            [
                "yt-dlp", "-x", "--audio-format", "opus",
                "-o", out_template,
                "--no-warnings", "--no-playlist",
                "--",  # end of options: the URL is never parsed as a flag
                url,
            ],
            check=True,
            timeout=600,
        )
        for entry in os.listdir(tmp_dir):
            if entry.startswith("audio."):
                return os.path.join(tmp_dir, entry)
        raise RuntimeError("yt-dlp produced no audio file")
    except BaseException:
        # Nothing is handed back to the caller on failure, so nobody else
        # could clean this directory up; remove it, then re-raise unchanged.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise


def _discard_audio(audio_path):
    """Best-effort removal of a downloaded audio file and its scratch directory."""
    try:
        os.unlink(audio_path)
    except OSError:
        pass  # deliberate best-effort: a leftover temp file must not fail the job
    parent = os.path.dirname(audio_path)
    # Only delete directories this module created (recognised by prefix), so a
    # path located anywhere else never causes an unrelated directory removal.
    if os.path.basename(parent).startswith(_TMP_PREFIX):
        shutil.rmtree(parent, ignore_errors=True)


def _idem(space_id, url):
    """Return the SHA-256 hex digest of *space_id* and *url* joined by a NUL byte."""
    return hashlib.sha256((space_id + "\x00" + url).encode()).hexdigest()


def _kind(url):
    """Return "youtube" if *url* contains "youtube.com" or "youtu.be", else "video"."""
    return "youtube" if ("youtube.com" in url or "youtu.be" in url) else "video"


def handle(job_data: dict) -> dict:
    """Ingest one video URL: fetch metadata, transcribe the audio, create a ref.

    Args:
        job_data: must contain the keys "space_id" and "url".

    Returns:
        {"skipped": "yt-dlp"} when yt-dlp could not extract metadata,
        otherwise {"ref_id": <value returned by repo.create_ref>,
        "chars": <length of the full transcript>}.

    The downloaded audio and its temp directory are removed whether or not
    transcription succeeds. Errors from the audio download or from
    transcription propagate to the caller.
    """
    space_id = job_data["space_id"]
    url = job_data["url"]
    idem = _idem(space_id, url)

    info = _yt_dlp_info(url)
    if info is None:
        return {"skipped": "yt-dlp"}

    audio_path = _yt_dlp_audio(url)
    try:
        transcript = whisper_transcribe(audio_path)
    finally:
        _discard_audio(audio_path)

    ref_id = repo.create_ref({
        "space_id": space_id,
        "kind": "video",
        "source_url": url,
        "title": info.get("title") or url,
        # Stored text is capped; the untruncated length is kept in metadata.
        "summary": (info.get("description") or "")[:5000],
        "body_text": transcript[:200_000],
        "source_kind": _kind(url),
        "external_id": idem,
        "metadata": {
            "duration_s": info.get("duration"),
            "uploader": info.get("uploader"),
            "thumbnail": info.get("thumbnail"),
            "extract": {"method": "whisper", "chars": len(transcript)},
        },
    })
    return {"ref_id": ref_id, "chars": len(transcript)}