From 65fd71dc0d846da9a4f1573a8af3a20a541774ad Mon Sep 17 00:00:00 2001 From: root Date: Mon, 1 Jun 2026 10:11:57 +1000 Subject: [PATCH] =?UTF-8?q?fix(workers):=20yt-dlp=20argv=20injection=20?= =?UTF-8?q?=E2=80=94=20scheme=20check=20+=20--=20separator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The url passed to yt-dlp is user-controllable (via /api/capture). Any string starting with '-' would be parsed as a flag (e.g. --config-location=/etc/passwd). Mitigations: 1. Validate scheme is http(s) and hostname is present before subprocess. 2. Pass `--` to yt-dlp so it stops flag parsing before the positional URL. Co-Authored-By: Claude Opus 4.7 --- workers/tests/test_video.py | 14 +++++++++++++- workers/void_workers/handlers/video.py | 22 ++++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/workers/tests/test_video.py b/workers/tests/test_video.py index 312a2ae..9ca2464 100644 --- a/workers/tests/test_video.py +++ b/workers/tests/test_video.py @@ -1,6 +1,7 @@ +import pytest import subprocess from unittest.mock import patch -from void_workers.handlers.video import handle as handle_video +from void_workers.handlers.video import handle as handle_video, _validate_url def _reset_void_schema(conn): @@ -48,6 +49,17 @@ def test_video_creates_ref_with_transcript_and_metadata(conn): assert row[2] == "youtube" +def test_validate_url_rejects_non_http(): + with pytest.raises(ValueError): + _validate_url("file:///etc/passwd") + with pytest.raises(ValueError): + _validate_url("javascript:alert(1)") + + +def test_validate_url_accepts_https(): + assert _validate_url("https://youtu.be/abc") == "https://youtu.be/abc" + + def test_video_skipped_when_yt_dlp_fails(conn): _reset_void_schema(conn) _run_node_migrations() diff --git a/workers/void_workers/handlers/video.py b/workers/void_workers/handlers/video.py index 3a57a3b..221e427 100644 --- a/workers/void_workers/handlers/video.py +++ b/workers/void_workers/handlers/video.py @@ -3,17 +3,34 @@ import json import os import subprocess import tempfile +from urllib.parse import urlparse from .. import repo from ..model import whisper_transcribe NAME = "ingest.video" +def _validate_url(url): + """yt-dlp accepts URLs as positional args; require http(s) scheme so a + crafted '--config-location=...' string can't smuggle a flag. We pass + `--` to yt-dlp too as belt + suspenders.""" + try: + p = urlparse(url) + except Exception as e: + raise ValueError(f"invalid url: {e}") + if p.scheme not in ("http", "https"): + raise ValueError(f"unsupported scheme: {p.scheme}") + if not p.hostname: + raise ValueError("missing hostname") + return url + + def _yt_dlp_info(url): """Returns dict of metadata, or None if yt-dlp could not extract.""" + safe = _validate_url(url) try: out = subprocess.check_output( - ["yt-dlp", "-J", "--no-warnings", "--no-playlist", url], + ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", safe], timeout=60 ) return json.loads(out) @@ -23,11 +40,12 @@ def _yt_dlp_info(url): def _yt_dlp_audio(url): """Downloads bestaudio to a temp .opus and returns the path.""" + safe = _validate_url(url) tmp_dir = tempfile.mkdtemp(prefix="void-yt-") out_template = os.path.join(tmp_dir, "audio.%(ext)s") subprocess.run( ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template, - "--no-warnings", "--no-playlist", url], + "--no-warnings", "--no-playlist", "--", safe], check=True, timeout=600 ) for f in os.listdir(tmp_dir):