fix(embed): chunk + mean-pool long text so large pages embed
Split long page text into 1500-char chunks before calling Ollama, then mean-pool the per-chunk vectors into one page vector. Removes the hard 6000-char slice that still caused 500s on dense markdown/table pages. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,3 +22,40 @@ export function padTo(vector, dim) {
|
||||
while (out.length < dim) out.push(0);
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
 * Split text into chunks of at most `size` UTF-16 code units.
 *
 * The input is trimmed and split on '\n'. Consecutive lines are re-joined with
 * '\n' into one chunk for as long as the joined length stays within the limit.
 * A single line longer than the limit is flushed on its own as fixed-width
 * slices (the pending chunk is emitted first, so order is preserved).
 *
 * @param {string} text - Text to split; any falsy value is treated as empty.
 * @param {number} [size=1500] - Maximum chunk length. Fractional values are
 *   floored so that hard-split slices can never exceed the limit.
 * @returns {string[]} Chunks in document order; [] when the trimmed text is empty.
 * @throws {RangeError} If the text is non-empty and `size` is not at least 1.
 */
export function chunkText(text, size = 1500) {
  const s = (text || '').trim();
  if (!s) return [];

  // Checked after the empty-input return so empty text keeps yielding []
  // regardless of `size`. A limit below 1 would otherwise make the hard-split
  // loop below never advance (endless loop, unbounded memory).
  const limit = Math.floor(size);
  if (!(limit >= 1)) {
    throw new RangeError(`chunkText: size must be a number >= 1, got ${size}`);
  }

  const chunks = [];
  let cur = '';
  for (const line of s.split('\n')) {
    if (line.length > limit) {
      // Over-long line: emit whatever is pending, then hard-split the line.
      if (cur) {
        chunks.push(cur);
        cur = '';
      }
      let start = 0;
      while (start < line.length) {
        let end = Math.min(start + limit, line.length);
        if (end < line.length && end - start > 1) {
          // Do not cut between a high and a low surrogate: a lone surrogate is
          // malformed text. Back off one unit instead (only possible when the
          // slice would still be non-empty, so the loop always makes progress).
          const last = line.charCodeAt(end - 1);
          const next = line.charCodeAt(end);
          const splitsPair =
            last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
          if (splitsPair) end -= 1;
        }
        chunks.push(line.slice(start, end));
        start = end;
      }
      continue;
    }
    // The +1 accounts for the '\n' that would join `cur` and `line`.
    if (cur.length + line.length + 1 > limit) {
      if (cur) chunks.push(cur);
      cur = line;
    } else {
      cur = cur ? `${cur}\n${line}` : line;
    }
  }
  if (cur) chunks.push(cur);
  return chunks;
}
|
||||
|
||||
/**
 * Embed possibly-long text as one vector by chunking it, embedding each chunk
 * with `embedText`, and averaging the chunk vectors element-wise (mean pooling).
 *
 * Each chunk carries equal weight in the average, regardless of its length.
 * Text that yields no chunks is still embedded, as a single empty string.
 *
 * @param {string} text - Text to embed.
 * @param {object} [options]
 * @param {string} [options.model='nomic-embed-text'] - Forwarded to `embedText`.
 * @param {number} [options.timeoutMs=60000] - Forwarded to `embedText` for each chunk.
 * @param {number} [options.maxChunks=64] - Upper bound on embedded chunks; chunks
 *   beyond it are dropped, so very long text is represented by its beginning only.
 * @param {number} [options.chunkSize=1500] - Maximum chunk length passed to `chunkText`.
 * @returns {Promise<number[]>} The pooled vector, with the length of the first chunk's vector.
 * @throws {RangeError} If `maxChunks` is not at least 1.
 */
export async function embedTextPooled(text, { model = 'nomic-embed-text', timeoutMs = 60_000, maxChunks = 64, chunkSize = 1500 } = {}) {
  // A cap below 1 would leave nothing to pool and previously crashed on
  // `vecs[0].length`; reject it up front, before any embedding call is made.
  const cap = Math.floor(maxChunks);
  if (!(cap >= 1)) {
    throw new RangeError(`embedTextPooled: maxChunks must be a number >= 1, got ${maxChunks}`);
  }

  let chunks = chunkText(text, chunkSize);
  if (chunks.length === 0) chunks = [''];
  if (chunks.length > cap) chunks = chunks.slice(0, cap);

  // Chunks are embedded one at a time, in order.
  // NOTE(review): presumably sequential on purpose to avoid flooding the
  // embedding backend - confirm before parallelising.
  const vecs = [];
  for (const chunk of chunks) {
    vecs.push(await embedText(chunk, { model, timeoutMs }));
  }

  // The first vector fixes the output dimension; missing or falsy components
  // in any vector count as 0.
  const dim = vecs[0].length;
  const pooled = new Array(dim).fill(0);
  for (const vec of vecs) {
    for (let i = 0; i < dim; i++) pooled[i] += (vec[i] || 0);
  }
  for (let i = 0; i < dim; i++) pooled[i] /= vecs.length;
  return pooled;
}
|
||||
|
||||
Reference in New Issue
Block a user