feat(workers): sync.source_doc with sha256 diff

Fetches the upstream URL via safe_fetch and compares its sha256 against the
prior body_sha stored in metadata. When the content changed, updates body_text,
body_sha and last_synced; unchanged syncs just touch last_synced.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-01 10:13:27 +10:00
parent cd1d69c689
commit 8fa7f71694
3 changed files with 104 additions and 1 deletions

View File

@@ -1,8 +1,9 @@
from . import echo, pdf, image, video
from . import echo, pdf, image, video, sourcedoc
# Dispatch table keyed by each worker module's NAME, mapping to its handle
# callable. Tuple order matches the original literal's insertion order.
REGISTRY = {
    worker.NAME: worker.handle
    for worker in (echo, pdf, image, video, sourcedoc)
}

View File

@@ -0,0 +1,32 @@
import hashlib
from datetime import datetime, timezone
from .. import repo
from ..safe_fetch import safe_fetch
NAME = "sync.source_doc"
def _sha(data):
return hashlib.sha256(data).hexdigest()
def handle(job_data: dict) -> dict:
    """Sync one source document against its upstream URL.

    Reads ``job_data["source_doc_id"]``, loads the record through
    ``repo.get_source_doc`` and fetches ``record["upstream_url"]`` with
    ``safe_fetch``. The fetched payload is hashed with ``_sha`` and compared
    to the ``body_sha`` entry of the record's ``metadata`` (a missing or
    empty ``metadata`` is treated as having no previous hash).

    Returns:
        ``{"skipped": "gone"}`` when the lookup yields a falsy record;
        ``{"unchanged": True}`` when the hashes match (only ``last_synced``
        is written); otherwise ``{"updated": True, "chars": <int>}`` after
        storing the decoded body, the timestamp and the new hash.
    """
    source_doc_id = job_data["source_doc_id"]
    record = repo.get_source_doc(source_doc_id)
    if not record:
        return {"skipped": "gone"}

    request_headers = {"User-Agent": "void-ingest/2.0"}
    # The payload is hashed and then .decode()d below, so safe_fetch is
    # expected to hand back a bytes-like body.
    payload = safe_fetch(record["upstream_url"], headers=request_headers)
    fresh_digest = _sha(payload)

    stored_meta = record.get("metadata") or {}
    previous_digest = stored_meta.get("body_sha")
    synced_at = datetime.now(timezone.utc)

    if fresh_digest != previous_digest:
        # The hash covers the full payload; only the stored text is capped
        # at 1,000,000 characters, with undecodable bytes replaced.
        decoded = payload.decode("utf-8", errors="replace")
        stored_text = decoded[:1_000_000]
        repo.update_source_doc(
            source_doc_id,
            body_text=stored_text,
            last_synced=synced_at,
            metadata_patch={"body_sha": fresh_digest},
        )
        return {"updated": True, "chars": len(stored_text)}

    # Same content as last time: record the sync time and nothing else.
    repo.update_source_doc(source_doc_id, last_synced=synced_at)
    return {"unchanged": True}