// Void-Homelab/lib/jobs/workers/blob.js

import fs from 'node:fs/promises';
import * as refs from '../../db/repos/refs.js';
import { defaultStore } from '../../ingest/blob_store.js';
import * as queue from '../queue.js';

// Job name associated with this worker module.
// NOTE(review): presumably the queue name `handler` is registered under — confirm against the worker registry.
export const NAME = 'ingest.blob';

/**
 * Classify an uploaded blob into the coarse `kind` stored on its ref.
 *
 * MIME types are case-insensitive and may carry parameters
 * (e.g. `application/pdf; charset=binary`), so the content type is reduced to
 * its lower-cased `type/subtype` part before matching. Non-string inputs are
 * treated as absent instead of throwing.
 *
 * @param {string|null|undefined} content_type - MIME type reported for the upload.
 * @param {string|null|undefined} filename - Original file name; a `.pdf`
 *   extension (any case) is accepted as a fallback signal for PDFs.
 * @returns {'image'|'pdf'|'file'} `image` for any `image/*` type, `pdf` for
 *   `application/pdf` or a `.pdf` file name, otherwise `file`.
 */
function kindFor(content_type, filename) {
  // Keep only the part before any `;` parameters, trimmed and lower-cased.
  const mime = typeof content_type === 'string'
    ? content_type.split(';', 1)[0].trim().toLowerCase()
    : '';
  if (mime.startsWith('image/')) return 'image';

  const hasPdfExtension =
    typeof filename === 'string' && filename.toLowerCase().endsWith('.pdf');
  if (mime === 'application/pdf' || hasPdfExtension) return 'pdf';

  return 'file';
}

/**
 * Worker for blob-ingest jobs.
 *
 * Reads the uploaded temp file, writes its bytes to the default blob store,
 * creates a ref row pointing at the stored blob, removes the temp file, and
 * enqueues a follow-up extraction job for PDFs and images.
 *
 * @param {object} job - Queue job; the fields below are read from `job.data`.
 * @param {*} job.data.space_id - Passed through to the created ref.
 * @param {string} job.data.tmp_path - Path of the uploaded temp file.
 * @param {string} [job.data.filename] - Original file name, if known.
 * @param {string} [job.data.content_type] - MIME type reported for the upload.
 * @param {object|null} [job.data.meta] - Optional `title` and extra `metadata`.
 * @returns {Promise<{ref_id: *, sha: string}>} Id of the created ref and the
 *   `sha` returned by the blob store.
 * @throws Propagates errors from reading the temp file, writing the blob,
 *   creating the ref, or enqueueing the follow-up job.
 */
export async function handler(job) {
  const { space_id, tmp_path, filename, content_type } = job.data;
  // A destructuring default only covers `undefined`; `??` also tolerates an
  // explicit `meta: null` in the payload.
  const meta = job.data.meta ?? {};

  const buf = await fs.readFile(tmp_path);
  const { sha, path } = await defaultStore().write(buf);

  const kind = kindFor(content_type, filename);
  const row = await refs.create({
    space_id,
    kind,
    source_url: null,
    title: meta.title || filename || sha.slice(0, 12),
    summary: null,
    body_text: null,
    blob_path: path,
    // Caller-supplied metadata is spread last, so it wins on key collisions.
    metadata: { sha, content_type, filename, size: buf.length, ...(meta.metadata || {}) }
  }, { kind: 'system', id: null });

  // Remove the temp file only once the ref row exists: if the blob write or
  // the ref insert throws, the input is still on disk and a re-run of this
  // job can read it again. Cleanup stays best-effort, but unexpected failures
  // are reported instead of silently dropped.
  try {
    await fs.unlink(tmp_path);
  } catch (err) {
    if (err?.code !== 'ENOENT') {
      console.warn(`ingest.blob: could not remove temp file ${tmp_path}: ${err?.message ?? err}`);
    }
  }

  // Plan 4: hand off to the Python void-workers for OCR / extraction.
  if (kind === 'pdf') {
    await queue.enqueue('extract.pdf', { ref_id: row.id, blob_path: path });
  } else if (kind === 'image') {
    await queue.enqueue('extract.image', { ref_id: row.id, blob_path: path });
  }
  return { ref_id: row.id, sha };
}