fix(workers): yt-dlp argv injection — scheme check + -- separator
The URL passed to yt-dlp is user-controllable (via /api/capture). Any string starting with '-' would be parsed as a flag (e.g. --config-location=/etc/passwd). Mitigations: (1) validate that the scheme is http(s) and that a hostname is present before invoking the subprocess; (2) pass `--` to yt-dlp so it stops flag parsing before the positional URL. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -3,17 +3,34 @@ import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from urllib.parse import urlparse
|
||||
from .. import repo
|
||||
from ..model import whisper_transcribe
|
||||
|
||||
NAME = "ingest.video"
|
||||
|
||||
|
||||
def _validate_url(url):
|
||||
"""yt-dlp accepts URLs as positional args; require http(s) scheme so a
|
||||
crafted '--config-location=...' string can't smuggle a flag. We pass
|
||||
`--` to yt-dlp too as belt + suspenders."""
|
||||
try:
|
||||
p = urlparse(url)
|
||||
except Exception as e:
|
||||
raise ValueError(f"invalid url: {e}")
|
||||
if p.scheme not in ("http", "https"):
|
||||
raise ValueError(f"unsupported scheme: {p.scheme}")
|
||||
if not p.hostname:
|
||||
raise ValueError("missing hostname")
|
||||
return url
|
||||
|
||||
|
||||
def _yt_dlp_info(url):
|
||||
"""Returns dict of metadata, or None if yt-dlp could not extract."""
|
||||
safe = _validate_url(url)
|
||||
try:
|
||||
out = subprocess.check_output(
|
||||
["yt-dlp", "-J", "--no-warnings", "--no-playlist", url],
|
||||
["yt-dlp", "-J", "--no-warnings", "--no-playlist", "--", safe],
|
||||
timeout=60
|
||||
)
|
||||
return json.loads(out)
|
||||
@@ -23,11 +40,12 @@ def _yt_dlp_info(url):
|
||||
|
||||
def _yt_dlp_audio(url):
|
||||
"""Downloads bestaudio to a temp .opus and returns the path."""
|
||||
safe = _validate_url(url)
|
||||
tmp_dir = tempfile.mkdtemp(prefix="void-yt-")
|
||||
out_template = os.path.join(tmp_dir, "audio.%(ext)s")
|
||||
subprocess.run(
|
||||
["yt-dlp", "-x", "--audio-format", "opus", "-o", out_template,
|
||||
"--no-warnings", "--no-playlist", url],
|
||||
"--no-warnings", "--no-playlist", "--", safe],
|
||||
check=True, timeout=600
|
||||
)
|
||||
for f in os.listdir(tmp_dir):
|
||||
|
||||
Reference in New Issue
Block a user