fix(embed): chunk + mean-pool long text so large pages embed

Split long page text into 1500-char chunks before calling Ollama, then
mean-pool the per-chunk vectors into one page vector. Removes the hard
6000-char slice that still caused 500s on dense markdown/table pages.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-05 23:24:40 +10:00
parent 946a03883f
commit 71adc51c00
3 changed files with 76 additions and 3 deletions

View File

@@ -22,3 +22,40 @@ export function padTo(vector, dim) {
while (out.length < dim) out.push(0);
return out;
}
/**
 * Split text into chunks of at most `size` UTF-16 code units, preferring to
 * break on line boundaries.
 *
 * The input is trimmed, then split on '\n'. Consecutive lines are joined back
 * with '\n' for as long as the joined chunk stays within `size`. A single line
 * longer than `size` is flushed into its own hard-split pieces. Empty chunks
 * are never emitted, so blank lines that would start a chunk are dropped.
 *
 * Hard splits never cut between the two halves of a surrogate pair, so a piece
 * may be one unit shorter than `size` (or, only when `size` is 1, one unit
 * longer) to keep an astral character such as an emoji intact.
 *
 * @param {string} text - Text to split; any falsy value is treated as empty.
 * @param {number} [size=1500] - Maximum chunk length, >= 1. Fractional values
 *   are floored; Infinity disables splitting.
 * @returns {string[]} The chunks in document order; [] for empty/blank input.
 * @throws {RangeError} If `size` is not a number >= 1.
 */
export function chunkText(text, size = 1500) {
  // A size below 1 (or NaN) would keep the hard-split loop from ever advancing.
  if (typeof size !== 'number' || !(size >= 1)) {
    throw new RangeError(`chunkText: size must be a number >= 1, got ${String(size)}`);
  }
  const limit = Number.isFinite(size) ? Math.floor(size) : size;

  const trimmed = (text || '').trim();
  if (!trimmed) return [];

  const chunks = [];
  let current = '';
  for (const line of trimmed.split('\n')) {
    if (line.length > limit) {
      // Over-long line: flush whatever is pending, then hard-split the line.
      if (current) {
        chunks.push(current);
        current = '';
      }
      let start = 0;
      while (start < line.length) {
        let end = Math.min(start + limit, line.length);
        const before = line.charCodeAt(end - 1);
        const after = line.charCodeAt(end); // NaN at end of line => no pair
        const cutsPair =
          before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        if (cutsPair) {
          // Back off one unit; if that would produce an empty piece (limit of
          // 1), take the whole pair instead so the loop still advances.
          end = end - 1 > start ? end - 1 : end + 1;
        }
        chunks.push(line.slice(start, end));
        start = end;
      }
      continue;
    }
    // The +1 accounts for the '\n' that would join this line to the chunk.
    if (current.length + line.length + 1 > limit) {
      if (current) chunks.push(current);
      current = line;
    } else {
      current = current ? `${current}\n${line}` : line;
    }
  }
  if (current) chunks.push(current);
  return chunks;
}
/**
 * Embed possibly-long text by chunking it, embedding each chunk, and
 * mean-pooling the per-chunk vectors element-wise into a single vector.
 *
 * Text is split with `chunkText(text, chunkSize)`; when that yields no chunks
 * (empty/blank input) a single empty string is embedded instead. Only the
 * first `maxChunks` chunks are embedded, which bounds the number of
 * `embedText` calls. Every chunk carries equal weight in the mean, whatever
 * its length. With a single chunk the result holds the same values as the
 * vector `embedText` returned for that chunk.
 *
 * NOTE(review): `embedText` is defined outside this block; this assumes it
 * resolves to an array of numbers of a consistent length — confirm.
 *
 * @param {string} text - Text to embed.
 * @param {object} [options]
 * @param {string} [options.model='nomic-embed-text'] - Forwarded to embedText.
 * @param {number} [options.timeoutMs=60000] - Forwarded to embedText per call.
 * @param {number} [options.maxChunks=64] - Upper bound on embedded chunks.
 *   Fractional values are floored; anything that is not >= 1 is treated as 1
 *   so at least one chunk is always embedded.
 * @param {number} [options.chunkSize=1500] - Forwarded to chunkText.
 * @returns {Promise<number[]>} The pooled vector, with the length of the
 *   first chunk's vector.
 */
export async function embedTextPooled(text, { model = 'nomic-embed-text', timeoutMs = 60_000, maxChunks = 64, chunkSize = 1500 } = {}) {
  // A cap below 1 would leave nothing to pool (0) or make slice() drop
  // trailing chunks instead of capping (negative), so clamp it to 1.
  const cap = maxChunks >= 1 ? Math.floor(maxChunks) : 1;

  const pieces = chunkText(text, chunkSize);
  const chunks = pieces.length === 0 ? [''] : pieces.slice(0, cap);

  // Chunks are embedded one at a time, in document order.
  const vecs = [];
  for (const chunk of chunks) {
    vecs.push(await embedText(chunk, { model, timeoutMs }));
  }

  // The first vector fixes the output dimension; missing or falsy entries in
  // any vector contribute 0 to the sum.
  const dim = vecs[0].length;
  const pooled = new Array(dim).fill(0);
  for (const vec of vecs) {
    for (let i = 0; i < dim; i++) pooled[i] += vec[i] || 0;
  }
  for (let i = 0; i < dim; i++) pooled[i] /= vecs.length;
  return pooled;
}