The url passed to yt-dlp is user-controllable (via /api/capture). Any string starting with '-' would be parsed as a flag (e.g. --config-location=/etc/passwd). Mitigations: 1. Validate scheme is http(s) and hostname is present before subprocess. 2. Pass `--` to yt-dlp so it stops flag parsing before the positional URL. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
100 lines
2.8 KiB
Python
100 lines
2.8 KiB
Python
import hashlib
import json
import os
import shutil
import subprocess
import tempfile
from urllib.parse import urlparse

from .. import repo
from ..model import whisper_transcribe
|
|
|
|
# Identifier for this handler. Presumably the job type it is registered or
# dispatched under -- TODO confirm against the job registry.
NAME = "ingest.video"
|
|
|
|
|
|
def _validate_url(url):
    """Return *url* unchanged if it is an http(s) URL with a hostname.

    yt-dlp accepts URLs as positional args; requiring an http(s) scheme means
    a crafted '--config-location=...' string can't smuggle a flag. Callers
    pass `--` to yt-dlp too as belt + suspenders.

    Raises:
        ValueError: if *url* is not a str, cannot be parsed, has a scheme
            other than http/https, or has no hostname.
    """
    # Reject non-str input explicitly so the only failure urlparse can still
    # report is ValueError (e.g. a malformed IPv6 literal in the netloc).
    if not isinstance(url, str):
        raise ValueError(
            f"invalid url: expected str, got {type(url).__name__}"
        )
    try:
        parsed = urlparse(url)
    except ValueError as e:
        # Keep the original parse error attached for debugging.
        raise ValueError(f"invalid url: {e}") from e
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"unsupported scheme: {parsed.scheme}")
    if not parsed.hostname:
        raise ValueError("missing hostname")
    return url
|
|
|
|
|
|
def _yt_dlp_info(url):
    """Fetch yt-dlp's JSON metadata (`-J`) for *url*.

    Returns the parsed JSON output (expected to be a dict of metadata), or
    None when yt-dlp exits with a non-zero status.

    The ValueError from URL validation, subprocess.TimeoutExpired (60 s
    limit) and JSON decoding errors are not handled here and propagate.
    """
    target = _validate_url(url)
    # `--` ends option parsing, so the URL is always taken as positional.
    command = ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", target]
    try:
        raw = subprocess.check_output(command, timeout=60)
    except subprocess.CalledProcessError:
        return None
    return json.loads(raw)
|
|
|
|
|
|
def _yt_dlp_audio(url):
    """Download the audio of *url* as Opus and return the file path.

    The file is written into a fresh temporary directory (prefix
    "void-yt-"). On success the caller owns the returned file and its
    directory. On any failure the temporary directory is removed before the
    exception propagates, so failed downloads do not accumulate on disk.

    Raises:
        ValueError: if *url* fails validation.
        subprocess.CalledProcessError: if yt-dlp exits non-zero.
        subprocess.TimeoutExpired: if yt-dlp runs longer than 600 s.
        RuntimeError: if yt-dlp succeeded but left no "audio.*" file.
    """
    safe = _validate_url(url)
    tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
    out_template = os.path.join(tmp_dir, "audio.%(ext)s")
    audio_path = None
    try:
        subprocess.run(
            ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
             "--no-warnings", "--no-playlist", "--", safe],
            check=True, timeout=600,
        )
        # os.listdir order is arbitrary: prefer the converted .opus file and
        # only fall back to another "audio.*" file in a deterministic order.
        candidates = sorted(
            name for name in os.listdir(tmp_dir) if name.startswith("audio.")
        )
        if "audio.opus" in candidates:
            audio_path = os.path.join(tmp_dir, "audio.opus")
        elif candidates:
            audio_path = os.path.join(tmp_dir, candidates[0])
        else:
            raise RuntimeError("yt-dlp produced no audio file")
        return audio_path
    finally:
        # audio_path is still None on every failure path (yt-dlp error,
        # timeout, no output), so this cleans up only when nothing is
        # handed back to the caller.
        if audio_path is None:
            shutil.rmtree(tmp_dir, ignore_errors=True)
|
|
|
|
|
|
def _idem(space_id, url):
    """Return the SHA-256 hex digest identifying a (space_id, url) pair.

    The two strings are joined with a NUL character before hashing, which
    keeps e.g. ("ab", "c") distinct from ("a", "bc").
    """
    material = "\x00".join((space_id, url))
    hasher = hashlib.sha256()
    hasher.update(material.encode())
    return hasher.hexdigest()
|
|
|
|
|
|
def _kind(url):
    """Classify *url* as "youtube" or generic "video" by its hostname.

    Only the host part is inspected, so a URL that merely mentions
    youtube.com in its path or query (or a look-alike domain such as
    notyoutube.com) is not treated as YouTube. Subdomains of youtube.com and
    youtu.be count as YouTube. Anything that cannot be parsed is "video".
    """
    if not isinstance(url, str):
        return "video"
    try:
        parsed = urlparse(url)
        if not parsed.scheme and not parsed.netloc:
            # Scheme-less input like "youtube.com/watch?v=..." has no netloc;
            # re-parse it as a network-path reference to expose the host.
            parsed = urlparse("//" + url)
        host = (parsed.hostname or "").rstrip(".")
    except ValueError:
        return "video"
    is_youtube = any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtube.com", "youtu.be")
    )
    return "youtube" if is_youtube else "video"
|
|
|
|
|
|
def handle(job_data: dict) -> dict:
    """Ingest one video URL: fetch metadata, transcribe audio, store a ref.

    Args:
        job_data: must contain "space_id" and "url".

    Returns:
        {"skipped": "yt-dlp"} when yt-dlp could not extract metadata,
        otherwise {"ref_id": <id from repo.create_ref>, "chars": <length of
        the full transcript>}.

    Raises:
        KeyError: if "space_id" or "url" is missing from *job_data*.
        ValueError: if the URL fails validation.
        Errors from the yt-dlp subprocesses, whisper_transcribe and
        repo.create_ref propagate unchanged.
    """
    space_id = job_data["space_id"]
    url = job_data["url"]
    idem = _idem(space_id, url)

    info = _yt_dlp_info(url)
    if info is None:
        return {"skipped": "yt-dlp"}

    audio_path = _yt_dlp_audio(url)
    try:
        # NOTE(review): assumes whisper_transcribe returns a str (it is
        # sliced and measured with len() below) -- confirm.
        transcript = whisper_transcribe(audio_path)
    finally:
        # Best-effort cleanup; a failure here must not mask the result or
        # the original exception.
        try:
            os.unlink(audio_path)
        except OSError:
            pass
        # _yt_dlp_audio places the file in its own mkdtemp directory; remove
        # it too so temp dirs do not pile up. os.rmdir only deletes an empty
        # directory, so it cannot remove anything else by accident.
        try:
            os.rmdir(os.path.dirname(audio_path))
        except OSError:
            pass

    ref_id = repo.create_ref({
        "space_id": space_id,
        "kind": "video",
        "source_url": url,
        "title": info.get("title") or url,
        # Description and transcript are truncated to bound the stored size.
        "summary": (info.get("description") or "")[:5000],
        "body_text": transcript[:200_000],
        "source_kind": _kind(url),
        "external_id": idem,
        "metadata": {
            "duration_s": info.get("duration"),
            "uploader": info.get("uploader"),
            "thumbnail": info.get("thumbnail"),
            # "chars" records the untruncated transcript length.
            "extract": {"method": "whisper", "chars": len(transcript)},
        },
    })
    return {"ref_id": ref_id, "chars": len(transcript)}
|