import hashlib
from datetime import datetime, timezone

from .. import repo
from ..safe_fetch import safe_fetch

# Identifier string for this job handler.
NAME = "sync.source_doc"


def _sha(data):
    """Return the hexadecimal SHA-256 digest of *data*."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()


def handle(job_data: dict) -> dict:
    """Re-fetch one source document and store its body if it has changed.

    The document is looked up via ``job_data["source_doc_id"]``, its
    ``upstream_url`` is fetched, and the SHA-256 of the fetched body is
    compared with the ``body_sha`` stored in the document's metadata.

    Args:
        job_data: Job payload; must contain the key ``"source_doc_id"``.

    Returns:
        ``{"skipped": "gone"}`` when the lookup yields nothing (falsy),
        ``{"unchanged": True}`` when the digest matches the stored one, or
        ``{"updated": True, "chars": <stored text length>}`` otherwise.

    Raises:
        KeyError: If ``job_data`` has no ``"source_doc_id"`` entry.
    """
    source_doc_id = job_data["source_doc_id"]
    record = repo.get_source_doc(source_doc_id)
    if not record:
        return {"skipped": "gone"}

    payload = safe_fetch(
        record["upstream_url"],
        headers={"User-Agent": "void-ingest/2.0"},
    )
    fresh_digest = _sha(payload)

    # Metadata may be missing or None; treat both as "no digest stored".
    stored_meta = record.get("metadata") or {}
    stored_digest = stored_meta.get("body_sha")

    synced_at = datetime.now(timezone.utc)

    if fresh_digest != stored_digest:
        # The digest covers the full payload; only the stored text is
        # capped at one million characters.
        decoded = payload.decode("utf-8", errors="replace")[:1_000_000]
        repo.update_source_doc(
            source_doc_id,
            body_text=decoded,
            last_synced=synced_at,
            metadata_patch={"body_sha": fresh_digest},
        )
        return {"updated": True, "chars": len(decoded)}

    # Body is identical: only record that a sync happened.
    repo.update_source_doc(source_doc_id, last_synced=synced_at)
    return {"unchanged": True}