fix(workers): yt-dlp argv injection — scheme check + -- separator

The url passed to yt-dlp is user-controllable (via /api/capture). Any
string starting with '-' would be parsed as a flag (e.g.
--config-location=/etc/passwd). Mitigations:
1. Validate scheme is http(s) and hostname is present before subprocess.
2. Pass `--` to yt-dlp so it stops flag parsing before the positional
   URL.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-01 10:11:57 +10:00
parent b10b68582d
commit 65fd71dc0d
2 changed files with 33 additions and 3 deletions

View File

@@ -1,6 +1,7 @@
import pytest
import subprocess import subprocess
from unittest.mock import patch from unittest.mock import patch
from void_workers.handlers.video import handle as handle_video from void_workers.handlers.video import handle as handle_video, _validate_url
def _reset_void_schema(conn): def _reset_void_schema(conn):
@@ -48,6 +49,17 @@ def test_video_creates_ref_with_transcript_and_metadata(conn):
assert row[2] == "youtube" assert row[2] == "youtube"
def test_validate_url_rejects_non_http():
with pytest.raises(ValueError):
_validate_url("file:///etc/passwd")
with pytest.raises(ValueError):
_validate_url("javascript:alert(1)")
def test_validate_url_accepts_https():
assert _validate_url("https://youtu.be/abc") == "https://youtu.be/abc"
def test_video_skipped_when_yt_dlp_fails(conn): def test_video_skipped_when_yt_dlp_fails(conn):
_reset_void_schema(conn) _reset_void_schema(conn)
_run_node_migrations() _run_node_migrations()

View File

@@ -3,17 +3,34 @@ import json
import os import os
import subprocess import subprocess
import tempfile import tempfile
from urllib.parse import urlparse
from .. import repo from .. import repo
from ..model import whisper_transcribe from ..model import whisper_transcribe
NAME = "ingest.video" NAME = "ingest.video"
def _validate_url(url):
"""yt-dlp accepts URLs as positional args; require http(s) scheme so a
crafted '--config-location=...' string can't smuggle a flag. We pass
`--` to yt-dlp too as belt + suspenders."""
try:
p = urlparse(url)
except Exception as e:
raise ValueError(f"invalid url: {e}")
if p.scheme not in ("http", "https"):
raise ValueError(f"unsupported scheme: {p.scheme}")
if not p.hostname:
raise ValueError("missing hostname")
return url
def _yt_dlp_info(url): def _yt_dlp_info(url):
"""Returns dict of metadata, or None if yt-dlp could not extract.""" """Returns dict of metadata, or None if yt-dlp could not extract."""
safe = _validate_url(url)
try: try:
out = subprocess.check_output( out = subprocess.check_output(
["yt-dlp", "-J", "--no-warnings", "--no-playlist", url], ["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", safe],
timeout=60 timeout=60
) )
return json.loads(out) return json.loads(out)
@@ -23,11 +40,12 @@ def _yt_dlp_info(url):
def _yt_dlp_audio(url): def _yt_dlp_audio(url):
"""Downloads bestaudio to a temp .opus and returns the path.""" """Downloads bestaudio to a temp .opus and returns the path."""
safe = _validate_url(url)
tmp_dir = tempfile.mkdtemp(prefix="void-yt-") tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
out_template = os.path.join(tmp_dir, "audio.%(ext)s") out_template = os.path.join(tmp_dir, "audio.%(ext)s")
subprocess.run( subprocess.run(
["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template, ["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
"--no-warnings", "--no-playlist", url], "--no-warnings", "--no-playlist", "--", safe],
check=True, timeout=600 check=True, timeout=600
) )
for f in os.listdir(tmp_dir): for f in os.listdir(tmp_dir):