Files
Void-Homelab/lib/jobs/workers/embed.js
root 71adc51c00 fix(embed): chunk + mean-pool long text so large pages embed
Split long page text into 1500-char chunks before calling Ollama, then
mean-pool the per-chunk vectors into one page vector. Removes the hard
6000-char slice that still caused 500s on dense markdown/table pages.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-05 23:24:40 +10:00

30 lines
1.3 KiB
JavaScript

import { embedTextPooled, padTo } from '../../ai/ollama.js';
import { pool } from '../../db/pool.js';
import { recordAudit } from '../../db/repos/audit.js';
/**
 * Identifier exported by this worker module.
 * NOTE(review): presumably the job name the queue uses to route jobs to
 * `handler` — confirm against the job registry.
 */
export const NAME = 'embed.text';
/**
 * Per-entity-type functions that turn a database row into the text that is
 * sent for embedding. Keys mirror the entity types accepted by the handler.
 *
 * Every field falls back to '' when it is null/undefined/empty so that a
 * missing column value never leaks the literal words "null" or "undefined"
 * into the embedded text. Separators are kept even when a field is empty, so
 * the output for fully-populated rows is unchanged.
 */
const STRING_BUILDERS = {
  // Title, blank line, then the markdown body.
  page: (row) => `${row.title || ''}\n\n${row.body_md || ''}`,
  // Title, summary and body text on consecutive lines.
  ref: (row) => `${row.title || ''}\n${row.summary || ''}\n${row.body_text || ''}`,
  // Document name followed by its body text.
  source_doc: (row) => `${row.name || ''}\n${row.body_text || ''}`,
  // Title followed by the summary.
  conversation: (row) => `${row.title || ''}\n${row.summary || ''}`,
};
/**
 * Maps each supported entity type to the database table holding its rows.
 * The table name is interpolated directly into SQL text by the handler, so
 * values here must only ever be fixed, trusted identifiers.
 */
const TABLE = {
  page: 'pages',
  ref: 'refs',
  source_doc: 'source_docs',
  conversation: 'conversations',
};
/**
 * Worker handler: computes the embedding for a single entity row and stores
 * it on that row.
 *
 * Steps: resolve the table from `entity_type`, load the row by id, build its
 * text with the matching STRING_BUILDERS entry, embed it with
 * `embedTextPooled`, pad the vector with `padTo(…, 1024)`, write it to the
 * row's `embedding` column, then record an audit entry.
 *
 * @param {{ data: { entity_type: string, entity_id: * } }} job - Job whose
 *   `data` names the entity to embed.
 * @returns {Promise<{ entity_id: * } | { skipped: 'gone' }>} The processed id,
 *   or `{ skipped: 'gone' }` when no row with that id exists.
 * @throws {Error} When `entity_type` is not one of TABLE's own keys. Errors
 *   from the database, embedding call or audit call propagate unchanged.
 */
export async function handler(job) {
  const { entity_type, entity_id } = job.data;

  // Own-key check rather than a truthiness test on TABLE[entity_type]:
  // inherited keys such as "constructor" or "toString" resolve to truthy
  // functions on a plain object and would otherwise be interpolated into the
  // SQL text below instead of being rejected here.
  if (!Object.hasOwn(TABLE, entity_type)) {
    throw new Error(`unknown entity_type: ${entity_type}`);
  }
  const table = TABLE[entity_type];

  const { rows: [row] } = await pool.query(`SELECT * FROM ${table} WHERE id=$1`, [entity_id]);
  // The row may have been deleted between enqueue and execution.
  if (!row) return { skipped: 'gone' };

  const text = STRING_BUILDERS[entity_type](row);
  const v = await embedTextPooled(text);
  // NOTE(review): 1024 presumably matches the dimension of the `embedding`
  // column — confirm against the schema.
  const padded = padTo(v, 1024);

  // Bracketed, comma-separated text form; the query casts it with ::vector.
  const literal = `[${padded.join(',')}]`;
  await pool.query(`UPDATE ${table} SET embedding=$1::vector WHERE id=$2`, [literal, entity_id]);

  await recordAudit({ kind: 'worker', id: null }, 'update', entity_type, entity_id, null, { embedding: 'updated' });
  return { entity_id };
}