fix(workers): safe_fetch pins IP + manual redirect re-validation

Two real findings from the security reviewer:

1. urllib auto-follows 3xx redirects via the default HTTPRedirectHandler.
   The previous code's hop loop never ran — urllib silently followed.
   Replaced with http.client + a manual hop loop. Every hop re-runs
   _validate_url, so an open-redirect to 127.0.0.1 / RFC1918 / metadata
   gets caught on the second hop.

2. DNS TOCTOU — _resolve() validated but urllib.request re-resolved on
   connect. Now the connection is pinned to the validated IP via a
   PinnedHTTPConn / PinnedHTTPSConn subclass that overrides connect() to
   open the socket with socket.create_connection((addr, port)). For HTTPS, TLS
   server_hostname is set to the original host so SNI + cert
   verification still work against the named host while the TCP
   destination is the pinned IP.

Tests added: redirect-to-loopback short-circuits at validation;
too-many-redirects exhausts max_hops; 2xx returns body; non-2xx raises.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-01 10:28:55 +10:00
parent 7707b7eb00
commit a8b2cddcf5
2 changed files with 165 additions and 37 deletions

View File

@@ -1,18 +1,22 @@
"""Python port of lib/ingest/safe_fetch.js.
"""SSRF-safe HTTP client used by sync.source_doc (and any future workers).
Same SSRF mitigations the Node side ships:
Same contract as lib/ingest/safe_fetch.js on the Node side:
- http/https only
- DNS-resolved hostnames checked against loopback / RFC1918 /
link-local / CGNAT / IPv6 ULA + link-local
- Redirects followed manually with the same checks on each hop
- VOID_INGEST_ALLOW_PRIVATE=true gate for offline-fixture tests
- DNS-resolve and reject loopback / RFC1918 / link-local / CGNAT / metadata /
IPv6 ULA + link-local
- Pin the validated IP into the connection so a rebind between our DNS check
and the TCP connect cannot point us at an internal address.
- Follow redirects MANUALLY, re-validating every hop. Requests go through
http.client rather than urllib, so nothing can silently auto-follow.
- `VOID_INGEST_ALLOW_PRIVATE=true` gate for offline-fixture tests.
"""
import socket
import http.client
import ipaddress
import urllib.request
import urllib.error
import os
from urllib.parse import urlparse
import socket
import ssl
import urllib.parse
BLOCK_V4_NETS = [ipaddress.ip_network(c) for c in [
"0.0.0.0/8", "127.0.0.0/8", "10.0.0.0/8",
@@ -42,41 +46,101 @@ def _is_blocked(addr):
return False
def _resolve_validated(host):
    """Resolve ``host`` and return one ``(address, family)`` pair to pin.

    Every address returned by the resolver is checked with ``_is_blocked``;
    a single blocked address rejects the whole hostname, so a name that
    resolves to a mix of public and internal addresses is refused.

    Args:
        host: Hostname to resolve (not an IP literal; the caller handles
            those separately).

    Returns:
        ``(address, family)`` for the first address the resolver returned,
        where ``family`` is the ``socket.AF_*`` value reported by
        ``getaddrinfo``. The caller pins this exact IP into the socket.

    Raises:
        SafeFetchError: if resolution fails, yields no addresses, or yields
            any blocked address.
    """
    try:
        infos = socket.getaddrinfo(host, None)
    except (socket.gaierror, UnicodeError) as e:
        # UnicodeError: getaddrinfo IDNA-encodes the name and rejects
        # malformed labels (empty / over-long) before any lookup happens.
        raise SafeFetchError(f"no DNS for {host}: {e}") from e

    # dict.fromkeys de-dupes (getaddrinfo repeats each address once per
    # socket type) while keeping the resolver's order, so "first record"
    # below is deterministic rather than an arbitrary set element.
    candidates = list(dict.fromkeys((info[4][0], info[0]) for info in infos))
    if not candidates:
        raise SafeFetchError(f"no addresses for {host}")

    for address, _family in candidates:
        if _is_blocked(address):
            raise SafeFetchError(f"{host} resolves to blocked address {address}")

    return candidates[0]
def _validate_url(url):
    """Parse and SSRF-check ``url``.

    Args:
        url: Absolute http/https URL to validate.

    Returns:
        ``(scheme, hostname, port, path_with_query, pinned_addr, family)``.
        ``pinned_addr`` is the literal IP from the URL, or the address chosen
        by ``_resolve_validated`` for a hostname. ``port`` falls back to
        443/80 by scheme when the URL does not carry one.

    Raises:
        SafeFetchError: on a malformed URL, a non-http(s) scheme, a missing
            hostname, an invalid port, or a blocked address.
    """
    # urlsplit (not urlparse): urlparse strips ";params" off the last path
    # segment, which would silently drop them from the request path below.
    try:
        u = urllib.parse.urlsplit(url)
    except ValueError as e:
        raise SafeFetchError(f"invalid URL {url}: {e}") from e

    if u.scheme not in ("http", "https"):
        raise SafeFetchError(f"unsupported scheme {u.scheme}")
    host = u.hostname
    if not host:
        raise SafeFetchError(f"no hostname in {url}")

    # Keep the try body to the one call that signals "not an IP literal",
    # so a SafeFetchError can never be swallowed by the ValueError handler.
    try:
        literal = ipaddress.ip_address(host)
    except ValueError:
        literal = None

    if literal is None:
        addr, family = _resolve_validated(host)
    else:
        if _is_blocked(host):
            raise SafeFetchError(f"blocked literal IP {host}")
        addr = host
        family = socket.AF_INET6 if literal.version == 6 else socket.AF_INET

    # .port raises ValueError for non-numeric or out-of-range ports.
    try:
        explicit_port = u.port
    except ValueError as e:
        raise SafeFetchError(f"invalid port in {url}: {e}") from e
    port = explicit_port or (443 if u.scheme == "https" else 80)

    path = (u.path or "/") + (("?" + u.query) if u.query else "")
    return u.scheme, host, port, path, addr, family
def _request_one(url, *, headers, timeout):
    """Issue one GET for ``url`` over a connection pinned to its validated IP.

    The URL is validated with ``_validate_url`` and the TCP connection is
    opened to the address that validation returned, not to the hostname, so
    no second DNS lookup happens between the check and the connect.
    Redirects are NOT followed; 3xx responses are returned to the caller.

    Args:
        headers: Mapping of extra request headers, or ``None``.
        timeout: Socket timeout in seconds, used for connect and I/O.

    Returns:
        ``(status, headers_obj, body_bytes)``.

    Raises:
        SafeFetchError: if ``_validate_url`` rejects the URL.
        OSError / ssl.SSLError / http.client.HTTPException: connection,
            TLS, or protocol failures propagate unchanged.
    """
    scheme, host, port, path, addr, _family = _validate_url(url)

    def _open_pinned_socket():
        # Connect to the validated IP, never the hostname.
        return socket.create_connection((addr, port), timeout=timeout)

    class PinnedHTTPConn(http.client.HTTPConnection):
        def connect(self):
            self.sock = _open_pinned_socket()

    class PinnedHTTPSConn(http.client.HTTPSConnection):
        def connect(self):
            raw = _open_pinned_socket()
            try:
                ctx = ssl.create_default_context()
                # TLS SNI + cert verification against the original hostname,
                # while the TCP connection is pinned to the validated IP.
                self.sock = ctx.wrap_socket(raw, server_hostname=host)
            except BaseException:
                # A failed handshake would otherwise leak the raw socket:
                # conn.close() only knows about self.sock.
                raw.close()
                raise

    conn_cls = PinnedHTTPSConn if scheme == "https" else PinnedHTTPConn
    conn = conn_cls(host, port, timeout=timeout)
    try:
        # No explicit Host header: http.client derives it from (host, port),
        # adding ":port" for non-default ports and brackets for IPv6
        # literals. A caller-supplied Host in `headers` still wins.
        conn.request("GET", path, headers=dict(headers or {}))
        resp = conn.getresponse()
        body = resp.read()
        return resp.status, resp.headers, body
    finally:
        conn.close()
# Request headers that carry credentials; dropped when a redirect leaves the
# origin they were supplied for.
_CROSS_ORIGIN_STRIPPED = frozenset({"authorization", "cookie", "proxy-authorization"})


def _origin(url):
    """Return ``(scheme, hostname, port)`` for ``url``, or None if unparseable."""
    try:
        parts = urllib.parse.urlsplit(url)
        port = parts.port or {"http": 80, "https": 443}.get(parts.scheme)
    except ValueError:
        return None
    return parts.scheme, parts.hostname, port


def safe_fetch(url, *, headers=None, timeout=15, max_hops=5):
    """GET ``url`` with SSRF mitigations and return the body bytes.

    Redirects (301/302/303/307/308) are followed manually. Each hop goes
    through ``_request_one``, which re-validates the target before
    connecting, so a redirect to a blocked address is rejected on that hop.

    Args:
        url: Absolute http/https URL.
        headers: Optional mapping of request headers. Credential headers
            (Authorization, Cookie, Proxy-Authorization) are sent only while
            the request stays on the origin it started on; they are dropped
            once a redirect changes scheme, host, or port.
        timeout: Per-request socket timeout in seconds.
        max_hops: Maximum number of redirects to follow.

    Returns:
        Response body as bytes for a 2xx response.

    Raises:
        SafeFetchError: on a blocked or invalid URL, a redirect without a
            Location header, more than ``max_hops`` redirects, or a final
            non-2xx status.
    """
    current = url
    hop_headers = dict(headers or {})
    for hop in range(max_hops + 1):
        status, resp_headers, body = _request_one(
            current, headers=hop_headers, timeout=timeout
        )
        if status in (301, 302, 303, 307, 308):
            loc = resp_headers.get("Location")
            if not loc:
                raise SafeFetchError("redirect without Location")
            if hop >= max_hops:
                raise SafeFetchError(f"too many redirects ({max_hops})")
            # Resolve relative redirects; the next loop pass re-validates.
            target = urllib.parse.urljoin(current, loc)
            here, there = _origin(current), _origin(target)
            if here is None or here != there:
                # The redirect target is attacker-influenced: do not hand
                # the caller's credentials to a different origin.
                hop_headers = {
                    name: value
                    for name, value in hop_headers.items()
                    if name.lower() not in _CROSS_ORIGIN_STRIPPED
                }
            current = target
            continue
        if 200 <= status < 300:
            return body
        raise SafeFetchError(f"http {status} from {current}")
    # Only reachable when max_hops is negative and the loop never runs.
    raise SafeFetchError(f"too many redirects ({max_hops})")