fix(workers): yt-dlp argv injection — scheme check + -- separator
The url passed to yt-dlp is user-controllable (via /api/capture). Any string starting with '-' would be parsed as a flag (e.g. --config-location=/etc/passwd). Mitigations: 1. Validate scheme is http(s) and hostname is present before subprocess. 2. Pass `--` to yt-dlp so it stops flag parsing before the positional URL. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
|
import pytest
|
||||||
import subprocess
|
import subprocess
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
from void_workers.handlers.video import handle as handle_video
|
from void_workers.handlers.video import handle as handle_video, _validate_url
|
||||||
|
|
||||||
|
|
||||||
def _reset_void_schema(conn):
|
def _reset_void_schema(conn):
|
||||||
@@ -48,6 +49,17 @@ def test_video_creates_ref_with_transcript_and_metadata(conn):
|
|||||||
assert row[2] == "youtube"
|
assert row[2] == "youtube"
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_url_rejects_non_http():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
_validate_url("file:///etc/passwd")
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
_validate_url("javascript:alert(1)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_url_accepts_https():
|
||||||
|
assert _validate_url("https://youtu.be/abc") == "https://youtu.be/abc"
|
||||||
|
|
||||||
|
|
||||||
def test_video_skipped_when_yt_dlp_fails(conn):
|
def test_video_skipped_when_yt_dlp_fails(conn):
|
||||||
_reset_void_schema(conn)
|
_reset_void_schema(conn)
|
||||||
_run_node_migrations()
|
_run_node_migrations()
|
||||||
|
|||||||
@@ -3,17 +3,34 @@ import json
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from urllib.parse import urlparse
|
||||||
from .. import repo
|
from .. import repo
|
||||||
from ..model import whisper_transcribe
|
from ..model import whisper_transcribe
|
||||||
|
|
||||||
NAME = "ingest.video"
|
NAME = "ingest.video"
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_url(url):
|
||||||
|
"""yt-dlp accepts URLs as positional args; require http(s) scheme so a
|
||||||
|
crafted '--config-location=...' string can't smuggle a flag. We pass
|
||||||
|
`--` to yt-dlp too as belt + suspenders."""
|
||||||
|
try:
|
||||||
|
p = urlparse(url)
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(f"invalid url: {e}")
|
||||||
|
if p.scheme not in ("http", "https"):
|
||||||
|
raise ValueError(f"unsupported scheme: {p.scheme}")
|
||||||
|
if not p.hostname:
|
||||||
|
raise ValueError("missing hostname")
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
def _yt_dlp_info(url):
|
def _yt_dlp_info(url):
|
||||||
"""Returns dict of metadata, or None if yt-dlp could not extract."""
|
"""Returns dict of metadata, or None if yt-dlp could not extract."""
|
||||||
|
safe = _validate_url(url)
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output(
|
out = subprocess.check_output(
|
||||||
["yt-dlp", "-J", "--no-warnings", "--no-playlist", url],
|
["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", safe],
|
||||||
timeout=60
|
timeout=60
|
||||||
)
|
)
|
||||||
return json.loads(out)
|
return json.loads(out)
|
||||||
@@ -23,11 +40,12 @@ def _yt_dlp_info(url):
|
|||||||
|
|
||||||
def _yt_dlp_audio(url):
|
def _yt_dlp_audio(url):
|
||||||
"""Downloads bestaudio to a temp .opus and returns the path."""
|
"""Downloads bestaudio to a temp .opus and returns the path."""
|
||||||
|
safe = _validate_url(url)
|
||||||
tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
|
tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
|
||||||
out_template = os.path.join(tmp_dir, "audio.%(ext)s")
|
out_template = os.path.join(tmp_dir, "audio.%(ext)s")
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
|
["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
|
||||||
"--no-warnings", "--no-playlist", url],
|
"--no-warnings", "--no-playlist", "--", safe],
|
||||||
check=True, timeout=600
|
check=True, timeout=600
|
||||||
)
|
)
|
||||||
for f in os.listdir(tmp_dir):
|
for f in os.listdir(tmp_dir):
|
||||||
|
|||||||
Reference in New Issue
Block a user