diff --git a/workers/tests/test_video.py b/workers/tests/test_video.py
index 312a2ae..9ca2464 100644
--- a/workers/tests/test_video.py
+++ b/workers/tests/test_video.py
@@ -1,6 +1,7 @@
+import pytest
 import subprocess
 from unittest.mock import patch
-from void_workers.handlers.video import handle as handle_video
+from void_workers.handlers.video import handle as handle_video, _validate_url
 
 
 def _reset_void_schema(conn):
@@ -48,6 +49,24 @@ def test_video_creates_ref_with_transcript_and_metadata(conn):
     assert row[2] == "youtube"
 
 
+def test_validate_url_rejects_non_http():
+    with pytest.raises(ValueError):
+        _validate_url("file:///etc/passwd")
+    with pytest.raises(ValueError):
+        _validate_url("javascript:alert(1)")
+
+
+def test_validate_url_rejects_flag_like_and_hostless():
+    with pytest.raises(ValueError):
+        _validate_url("--config-location=/tmp/evil.conf")
+    with pytest.raises(ValueError):
+        _validate_url("https://")
+
+
+def test_validate_url_accepts_https():
+    assert _validate_url("https://youtu.be/abc") == "https://youtu.be/abc"
+
+
 def test_video_skipped_when_yt_dlp_fails(conn):
     _reset_void_schema(conn)
     _run_node_migrations()
diff --git a/workers/void_workers/handlers/video.py b/workers/void_workers/handlers/video.py
index 3a57a3b..221e427 100644
--- a/workers/void_workers/handlers/video.py
+++ b/workers/void_workers/handlers/video.py
@@ -3,17 +3,42 @@ import json
 import os
 import subprocess
 import tempfile
+from urllib.parse import urlparse
 from .. import repo
 from ..model import whisper_transcribe
 
 NAME = "ingest.video"
 
 
+def _validate_url(url):
+    """Return `url` unchanged if it is an absolute http(s) URL.
+
+    yt-dlp accepts URLs as positional args; require http(s) scheme so a
+    crafted '--config-location=...' string can't smuggle a flag. We pass
+    `--` to yt-dlp too as belt and suspenders.
+
+    Raises ValueError for non-string input, unparseable URLs, schemes
+    other than http/https, or a missing hostname.
+    """
+    if not isinstance(url, str):
+        raise ValueError("url must be a string")
+    try:
+        p = urlparse(url)
+    except ValueError as e:
+        raise ValueError(f"invalid url: {e}") from e
+    if p.scheme not in ("http", "https"):
+        raise ValueError(f"unsupported scheme: {p.scheme}")
+    if not p.hostname:
+        raise ValueError("missing hostname")
+    return url
+
+
 def _yt_dlp_info(url):
     """Returns dict of metadata, or None if yt-dlp could not extract."""
+    safe = _validate_url(url)
     try:
         out = subprocess.check_output(
-            ["yt-dlp", "-J", "--no-warnings", "--no-playlist", url],
+            ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", safe],
             timeout=60
         )
         return json.loads(out)
@@ -23,11 +48,12 @@ def _yt_dlp_info(url):
 
 def _yt_dlp_audio(url):
     """Downloads bestaudio to a temp .opus and returns the path."""
+    safe = _validate_url(url)
     tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
     out_template = os.path.join(tmp_dir, "audio.%(ext)s")
     subprocess.run(
         ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
-         "--no-warnings", "--no-playlist", url],
+         "--no-warnings", "--no-playlist", "--", safe],
         check=True, timeout=600
     )
     for f in os.listdir(tmp_dir):