"""SSRF-safe HTTP client used by sync.source_doc (and any future workers).

Same contract as lib/ingest/safe_fetch.js on the Node side:
  - http/https only
  - DNS-resolve and reject loopback / RFC1918 / link-local / CGNAT /
    metadata / IPv6 ULA + link-local. IPv4-mapped IPv6 addresses are judged
    by the IPv4 address they embed.
  - Pin the validated IP into the connection so a rebind between our DNS
    check and the TCP connect cannot point us at an internal address.
  - Follow redirects MANUALLY, re-validating every hop. Requests are issued
    through http.client, which never follows redirects on its own.
  - `VOID_INGEST_ALLOW_PRIVATE=true` gate for offline-fixture tests.
"""
import http.client
import ipaddress
import os
import socket
import ssl
import urllib.parse

# IPv4 ranges that must never be fetched: "this network", loopback, RFC1918,
# link-local (includes the 169.254.169.254 metadata endpoint) and CGNAT.
BLOCK_V4_NETS = [ipaddress.ip_network(c) for c in [
    "0.0.0.0/8",
    "127.0.0.0/8",
    "10.0.0.0/8",
    "172.16.0.0/12",
    "192.168.0.0/16",
    "169.254.0.0/16",
    "100.64.0.0/10",
]]

# IPv6 ranges that must never be fetched: ULA + link-local.
_BLOCK_V6_NETS = [ipaddress.ip_network(c) for c in [
    "fc00::/7",
    "fe80::/10",
]]


class SafeFetchError(Exception):
    """Raised when a URL is rejected or a fetch ends in a non-2xx status."""


def _is_blocked(addr):
    """Return True if the textual IP address `addr` must not be contacted.

    Unparseable input is treated as blocked. When the environment variable
    VOID_INGEST_ALLOW_PRIVATE is exactly "true", nothing is blocked.
    """
    if os.environ.get("VOID_INGEST_ALLOW_PRIVATE") == "true":
        return False
    try:
        ip = ipaddress.ip_address(addr)
    except ValueError:
        return True
    # An IPv4-mapped IPv6 address (::ffff:a.b.c.d) reaches the embedded IPv4
    # host on dual-stack systems, so classify it by that IPv4 address.
    # Otherwise "::ffff:10.0.0.1" would slip past the IPv4 block list.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped
    if ip.is_loopback or ip.is_link_local or ip.is_multicast or ip.is_unspecified:
        return True
    if isinstance(ip, ipaddress.IPv4Address):
        return any(ip in net for net in BLOCK_V4_NETS)
    return any(ip in net for net in _BLOCK_V6_NETS)


def _resolve_validated(host):
    """Resolve `host` and return (address, family) for the first record.

    Raises SafeFetchError if resolution fails, yields no addresses, or if
    ANY returned address is in a blocked range.
    """
    try:
        infos = socket.getaddrinfo(host, None)
    except (socket.gaierror, UnicodeError) as e:
        # UnicodeError: the hostname could not be IDNA-encoded.
        raise SafeFetchError(f"no DNS for {host}: {e}") from e
    # De-dupe while keeping the resolver's order, so "first record" really
    # is the resolver's preferred address rather than an arbitrary one.
    addrs = list(dict.fromkeys((info[4][0], info[0]) for info in infos))
    if not addrs:
        raise SafeFetchError(f"no addresses for {host}")
    for candidate, _fam in addrs:
        if _is_blocked(candidate):
            raise SafeFetchError(f"{host} resolves to blocked address {candidate}")
    # Pick the first record. Caller pins this exact IP into the socket.
    address, family = addrs[0]
    return address, family


def _validate_url(url):
    """Returns (scheme, hostname, port, path-with-query, pinned_addr, family).

    Raises SafeFetchError for malformed URLs, non-http(s) schemes, missing
    hostnames, blocked literal IPs and hostnames that fail validation.
    """
    try:
        # urlsplit (unlike urlparse) keeps ";params" inside the path, so the
        # request target is reproduced faithfully.
        u = urllib.parse.urlsplit(url)
        explicit_port = u.port  # raises ValueError for a bad/out-of-range port
    except ValueError as e:
        raise SafeFetchError(f"invalid URL {url}: {e}") from e
    if u.scheme not in ("http", "https"):
        raise SafeFetchError(f"unsupported scheme {u.scheme}")
    host = u.hostname
    if not host:
        raise SafeFetchError(f"no hostname in {url}")
    try:
        ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal: resolve through DNS and validate every record.
        addr, family = _resolve_validated(host)
    else:
        # Literal IP path
        if _is_blocked(host):
            raise SafeFetchError(f"blocked literal IP {host}")
        addr, family = host, (socket.AF_INET6 if ":" in host else socket.AF_INET)
    port = explicit_port or (443 if u.scheme == "https" else 80)
    path = (u.path or "/") + (("?" + u.query) if u.query else "")
    return u.scheme, host, port, path, addr, family


class _PinnedHTTPConnection(http.client.HTTPConnection):
    """HTTPConnection whose TCP connect goes to a pre-validated IP."""

    def __init__(self, host, port, *, pinned_addr, timeout):
        super().__init__(host, port, timeout=timeout)
        self._pinned_addr = pinned_addr

    def connect(self):
        # Connect to the validated IP, never re-resolving the hostname.
        self.sock = socket.create_connection(
            (self._pinned_addr, self.port), timeout=self.timeout
        )


class _PinnedHTTPSConnection(http.client.HTTPSConnection):
    """HTTPSConnection whose TCP connect goes to a pre-validated IP."""

    def __init__(self, host, port, *, pinned_addr, timeout):
        self._pinned_ctx = ssl.create_default_context()
        super().__init__(host, port, timeout=timeout, context=self._pinned_ctx)
        self._pinned_addr = pinned_addr

    def connect(self):
        raw = socket.create_connection(
            (self._pinned_addr, self.port), timeout=self.timeout
        )
        try:
            # TLS SNI + cert verification against the original hostname,
            # while the TCP connection is pinned to the validated IP.
            self.sock = self._pinned_ctx.wrap_socket(raw, server_hostname=self.host)
        except BaseException:
            # Do not leave the raw socket open if TLS setup fails.
            raw.close()
            raise


def _request_one(url, *, headers, timeout):
    """Issue one HTTP GET with the IP pinned.

    Returns (status, headers_obj, body_bytes). Does NOT follow redirects.
    Raises SafeFetchError if `url` fails validation; connection-level errors
    from socket / ssl / http.client propagate unchanged.
    """
    scheme, host, port, path, addr, _family = _validate_url(url)
    conn_cls = _PinnedHTTPSConnection if scheme == "https" else _PinnedHTTPConnection
    conn = conn_cls(host, port, pinned_addr=addr, timeout=timeout)
    try:
        # No explicit Host header: http.client derives it from the connection's
        # host/port, adding the port when it is non-default and bracketing IPv6
        # literals. A Host supplied by the caller still takes precedence.
        conn.request("GET", path, headers=dict(headers or {}))
        resp = conn.getresponse()
        body = resp.read()
        return resp.status, resp.headers, body
    finally:
        conn.close()


def safe_fetch(url, *, headers=None, timeout=15, max_hops=5):
    """GET `url` with SSRF mitigations.

    Returns body bytes on 2xx. Raises SafeFetchError on validation failure,
    on a non-2xx final status, on a redirect without Location, or once more
    than `max_hops` redirects have been seen.
    """
    # NOTE(review): `headers` are replayed on every hop, including redirects
    # to a different host — confirm callers never pass credentials here.
    current = url
    for hop in range(max_hops + 1):
        status, resp_headers, body = _request_one(
            current, headers=headers, timeout=timeout
        )
        if status in (301, 302, 303, 307, 308):
            loc = resp_headers.get("Location")
            if not loc:
                raise SafeFetchError("redirect without Location")
            if hop >= max_hops:
                raise SafeFetchError(f"too many redirects ({max_hops})")
            # Resolve relative redirects + re-validate on the next loop pass.
            current = urllib.parse.urljoin(current, loc)
            continue
        if 200 <= status < 300:
            return body
        raise SafeFetchError(f"http {status} from {current}")
    # Only reachable when max_hops is negative (the loop body never runs).
    raise SafeFetchError(f"too many redirects ({max_hops})")