Fetches the upstream URL via safe_fetch and compares the SHA-256 of the fetched body against the prior body_sha stored in metadata. body_text and body_sha are updated only when the content changed; last_synced is refreshed on every sync, including unchanged ones.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
33 lines
942 B
Python
33 lines
942 B
Python
import hashlib
|
|
from datetime import datetime, timezone
|
|
from .. import repo
|
|
from ..safe_fetch import safe_fetch
|
|
|
|
# Name of this job type. NOTE(review): presumably the key a job dispatcher
# uses to route work to handle() — confirm against the handler registry.
NAME = "sync.source_doc"
|
|
|
|
|
|
def _sha(data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *data*.

    Args:
        data: Raw bytes (any bytes-like object accepted by
            ``hashlib.sha256``) to hash.

    Returns:
        The lowercase hexadecimal digest string (64 characters).
    """
    return hashlib.sha256(data).hexdigest()
|
|
|
|
|
|
def handle(job_data: dict) -> dict:
    """Re-sync one source document from its upstream URL.

    Looks up the document named by ``job_data["source_doc_id"]``, fetches
    ``doc["upstream_url"]`` through ``safe_fetch``, and compares the SHA-256
    of the fetched body with the ``body_sha`` stored in the document's
    metadata. ``last_synced`` is written on every successful fetch;
    ``body_text`` and ``body_sha`` are written only when the digest differs.

    Args:
        job_data: Job payload; must contain the key ``"source_doc_id"``.

    Returns:
        ``{"skipped": "gone"}`` if the repo returns no document,
        ``{"unchanged": True}`` if the digest matches the stored one,
        otherwise ``{"updated": True, "chars": <len of stored text>}``.

    Raises:
        KeyError: If ``job_data`` has no ``"source_doc_id"`` key (or the
            document has no ``"upstream_url"`` key).

    Exceptions raised by ``safe_fetch`` or the repo calls are not caught
    here and propagate to the caller.
    """
    sd_id = job_data["source_doc_id"]
    doc = repo.get_source_doc(sd_id)
    if not doc:
        # Nothing to sync: the repo returned no (or an empty) document.
        return {"skipped": "gone"}

    # The body is hashed and then decoded below, so safe_fetch is expected
    # to return bytes.
    body = safe_fetch(doc["upstream_url"], headers={"User-Agent": "void-ingest/2.0"})
    new_sha = _sha(body)
    # "metadata" may be missing or None; treat both as "no previous digest".
    old_sha = (doc.get("metadata") or {}).get("body_sha")
    now = datetime.now(timezone.utc)

    if new_sha == old_sha:
        # Content is identical: record the sync time only.
        repo.update_source_doc(sd_id, last_synced=now)
        return {"unchanged": True}

    # The digest covers the full fetched body, while the stored text is the
    # decoded body capped at 1,000,000 characters; undecodable bytes are
    # replaced rather than raising.
    text = body.decode("utf-8", errors="replace")[:1_000_000]
    repo.update_source_doc(
        sd_id,
        body_text=text,
        last_synced=now,
        metadata_patch={"body_sha": new_sha},
    )
    return {"updated": True, "chars": len(text)}
|