From f40d548fda70dd326ee4dd7f5ba10e34302153eb Mon Sep 17 00:00:00 2001 From: oddkit-agent Date: Thu, 9 Apr 2026 13:08:25 +0000 Subject: [PATCH 1/3] fix: BM25 phrase boost + KV index freshness verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug 1 (workers/src/bm25.ts, src/search/bm25.js): BM25 scored every query token independently, letting high-frequency terms like 'pattern' dilute rare-but-precise ones like 'vodka', pushing exact-title matches down the rankings. Fix: store originalText on BM25Doc during buildBM25Index, then after BM25 scoring apply a phrase boost in searchBM25: - +5.0 (PHRASE_BOOST_EXACT) if the full lowercased query appears as a substring of the doc's original text - +2.0 (PHRASE_BOOST_PARTIAL) if any consecutive word bigram from the query appears in the doc text (first hit wins) These boosts supplement BM25; they never replace it. Applied to both the Worker TypeScript version and the Node/stdio JS version for consistency. Bug 2 (workers/src/zip-baseline-fetcher.ts): Cloudflare KV is eventually consistent — two requests seconds apart can hit different edge nodes and return stale cached indexes even when the SHA-keyed cache key looks valid. Fix: after a KV cache hit in getIndex(), cross-check the cached index's embedded commit_sha / canon_commit_sha against the SHAs just resolved from the GitHub API. If they diverge the entry is stale; log a warning, discard it, and rebuild from source. --- src/search/bm25.js | 32 ++++++++++++++++++++++++++- workers/src/bm25.ts | 34 ++++++++++++++++++++++++++++- workers/src/zip-baseline-fetcher.ts | 18 +++++++++++++-- 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/search/bm25.js b/src/search/bm25.js index 5064d25..b8718cd 100644 --- a/src/search/bm25.js +++ b/src/search/bm25.js @@ -48,7 +48,7 @@ export function buildBM25Index(documents) { for (const doc of documents) { const terms = tokenize(doc.text); - docs.push({ id: doc.id, terms, length: terms.length }); + docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text }); totalLength += terms.length; const seen = new Set(); @@ -68,11 +68,21 @@ export function buildBM25Index(documents) { }; } +// Phrase boost constants — supplement BM25, never replace it. +// Exact: full query string found as substring in doc text. +// Partial: any consecutive two-word query bigram found in doc text. +const PHRASE_BOOST_EXACT = 5.0; +const PHRASE_BOOST_PARTIAL = 2.0; + /** Search BM25 index, return sorted {id, score} pairs */ export function searchBM25(index, query, limit = 5) { const queryTerms = tokenize(query); if (queryTerms.length === 0) return []; + // Pre-compute phrase matching inputs once, outside the per-doc loop. + const queryLower = query.toLowerCase(); + const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1); + const scores = []; for (const doc of index.docs) { @@ -96,6 +106,26 @@ export function searchBM25(index, query, limit = 5) { score += idf * tfNorm; } + // Phrase boost: BM25 treats every query token independently, which lets + // high-frequency terms dilute rare-but-important ones (e.g. "pattern" + // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether + // the original query phrase appears verbatim — or as a bigram — in the + // document's original text rescues those precise title/tag matches. + const docLower = doc.originalText.toLowerCase(); + if (docLower.includes(queryLower)) { + // Full query is a substring of the doc text — strong exact match. + score += PHRASE_BOOST_EXACT; + } else if (queryWords.length >= 2) { + // Scan every consecutive word pair in the query; first hit wins. + for (let i = 0; i < queryWords.length - 1; i++) { + const bigram = queryWords[i] + " " + queryWords[i + 1]; + if (docLower.includes(bigram)) { + score += PHRASE_BOOST_PARTIAL; + break; + } + } + } + if (score > 0) scores.push({ id: doc.id, score }); } diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts index 68c2d60..c4cb345 100644 --- a/workers/src/bm25.ts +++ b/workers/src/bm25.ts @@ -44,6 +44,8 @@ export interface BM25Doc { id: string; terms: string[]; length: number; + /** Original (pre-tokenization) text, used for phrase-level scoring. */ + originalText: string; } export interface BM25Index { @@ -63,7 +65,7 @@ export function buildBM25Index( for (const doc of documents) { const terms = tokenize(doc.text); - docs.push({ id: doc.id, terms, length: terms.length }); + docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text }); totalLength += terms.length; const seen = new Set(); @@ -83,6 +85,12 @@ export function buildBM25Index( }; } +// Phrase boost constants — supplement BM25, never replace it. +// Exact: full query string found as substring in doc text. +// Partial: any consecutive two-word query bigram found in doc text. +const PHRASE_BOOST_EXACT = 5.0; +const PHRASE_BOOST_PARTIAL = 2.0; + /** Search BM25 index, return sorted {id, score} pairs */ export function searchBM25( index: BM25Index, @@ -92,6 +100,10 @@ export function searchBM25( const queryTerms = tokenize(query); if (queryTerms.length === 0) return []; + // Pre-compute phrase matching inputs once, outside the per-doc loop. + const queryLower = query.toLowerCase(); + const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1); + const scores: Array<{ id: string; score: number }> = []; for (const doc of index.docs) { @@ -119,6 +131,26 @@ export function searchBM25( score += idf * tfNorm; } + // Phrase boost: BM25 treats every query token independently, which lets + // high-frequency terms dilute rare-but-important ones (e.g. "pattern" + // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether + // the original query phrase appears verbatim — or as a bigram — in the + // document's original text rescues those precise title/tag matches. + const docLower = doc.originalText.toLowerCase(); + if (docLower.includes(queryLower)) { + // Full query is a substring of the doc text — strong exact match. + score += PHRASE_BOOST_EXACT; + } else if (queryWords.length >= 2) { + // Scan every consecutive word pair in the query; first hit wins. + for (let i = 0; i < queryWords.length - 1; i++) { + const bigram = queryWords[i] + " " + queryWords[i + 1]; + if (docLower.includes(bigram)) { + score += PHRASE_BOOST_PARTIAL; + break; + } + } + } + if (score > 0) scores.push({ id: doc.id, score }); } diff --git a/workers/src/zip-baseline-fetcher.ts b/workers/src/zip-baseline-fetcher.ts index 8fe60ea..a79da72 100644 --- a/workers/src/zip-baseline-fetcher.ts +++ b/workers/src/zip-baseline-fetcher.ts @@ -760,8 +760,22 @@ export class ZipBaselineFetcher { if (this.env.BASELINE_CACHE) { const cached = await this.env.BASELINE_CACHE.get(cacheKey, "json") as BaselineIndex | null; if (cached) { - // Content-addressed cache hit: SHA matches, content is truthful - return cached; + // Cloudflare KV is eventually consistent — two requests seconds apart + // can hit different edge nodes and return stale data even when the + // cache key looks correct. Cross-check the cached index's embedded + // commit SHAs against the SHAs we just resolved from the GitHub API. + // If they diverge, the cached entry is stale; discard and rebuild. + const baselineShaMatch = !baselineSha || cached.commit_sha === baselineSha; + const canonShaMatch = !canonSha || cached.canon_commit_sha === canonSha; + if (baselineShaMatch && canonShaMatch) { + // Content-addressed cache hit: SHA verified, content is truthful. + return cached; + } + console.warn( + `KV cache SHA mismatch — discarding stale index. ` + + `cached=${cached.commit_sha}/${cached.canon_commit_sha} ` + + `resolved=${baselineSha}/${canonSha ?? "none"}` + ); } } From 44aa00454bc11e45c23e3815284b1f0cb0baee5e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 9 Apr 2026 13:23:05 +0000 Subject: [PATCH 2/3] Fix phrase boost: guard behind positive BM25 score and filter stop words from bigrams --- src/search/bm25.js | 31 ++++++++++++++----------------- workers/src/bm25.ts | 31 ++++++++++++++----------------- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/src/search/bm25.js b/src/search/bm25.js index b8718cd..11338b8 100644 --- a/src/search/bm25.js +++ b/src/search/bm25.js @@ -81,7 +81,7 @@ export function searchBM25(index, query, limit = 5) { // Pre-compute phrase matching inputs once, outside the per-doc loop. const queryLower = query.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1); + const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); const scores = []; @@ -106,22 +106,19 @@ export function searchBM25(index, query, limit = 5) { score += idf * tfNorm; } - // Phrase boost: BM25 treats every query token independently, which lets - // high-frequency terms dilute rare-but-important ones (e.g. "pattern" - // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether - // the original query phrase appears verbatim — or as a bigram — in the - // document's original text rescues those precise title/tag matches. - const docLower = doc.originalText.toLowerCase(); - if (docLower.includes(queryLower)) { - // Full query is a substring of the doc text — strong exact match. - score += PHRASE_BOOST_EXACT; - } else if (queryWords.length >= 2) { - // Scan every consecutive word pair in the query; first hit wins. - for (let i = 0; i < queryWords.length - 1; i++) { - const bigram = queryWords[i] + " " + queryWords[i + 1]; - if (docLower.includes(bigram)) { - score += PHRASE_BOOST_PARTIAL; - break; + // Phrase boost: supplement BM25 — never replace it. + // Only apply when the document already has genuine BM25 relevance. + if (score > 0) { + const docLower = doc.originalText.toLowerCase(); + if (docLower.includes(queryLower)) { + score += PHRASE_BOOST_EXACT; + } else if (queryWords.length >= 2) { + for (let i = 0; i < queryWords.length - 1; i++) { + const bigram = queryWords[i] + " " + queryWords[i + 1]; + if (docLower.includes(bigram)) { + score += PHRASE_BOOST_PARTIAL; + break; + } } } } diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts index c4cb345..d07defb 100644 --- a/workers/src/bm25.ts +++ b/workers/src/bm25.ts @@ -102,7 +102,7 @@ export function searchBM25( // Pre-compute phrase matching inputs once, outside the per-doc loop. const queryLower = query.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1); + const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); const scores: Array<{ id: string; score: number }> = []; @@ -131,22 +131,19 @@ export function searchBM25( score += idf * tfNorm; } - // Phrase boost: BM25 treats every query token independently, which lets - // high-frequency terms dilute rare-but-important ones (e.g. "pattern" - // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether - // the original query phrase appears verbatim — or as a bigram — in the - // document's original text rescues those precise title/tag matches. - const docLower = doc.originalText.toLowerCase(); - if (docLower.includes(queryLower)) { - // Full query is a substring of the doc text — strong exact match. - score += PHRASE_BOOST_EXACT; - } else if (queryWords.length >= 2) { - // Scan every consecutive word pair in the query; first hit wins. - for (let i = 0; i < queryWords.length - 1; i++) { - const bigram = queryWords[i] + " " + queryWords[i + 1]; - if (docLower.includes(bigram)) { - score += PHRASE_BOOST_PARTIAL; - break; + // Phrase boost: supplement BM25 — never replace it. + // Only apply when the document already has genuine BM25 relevance. + if (score > 0) { + const docLower = doc.originalText.toLowerCase(); + if (docLower.includes(queryLower)) { + score += PHRASE_BOOST_EXACT; + } else if (queryWords.length >= 2) { + for (let i = 0; i < queryWords.length - 1; i++) { + const bigram = queryWords[i] + " " + queryWords[i + 1]; + if (docLower.includes(bigram)) { + score += PHRASE_BOOST_PARTIAL; + break; + } } } } From 3e0dd25e14a417e79a66a4784c040668adcbb734 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 9 Apr 2026 13:35:08 +0000 Subject: [PATCH 3/3] fix: normalize queryWords with same punctuation/split pipeline as tokenize queryWords was built by splitting the raw lowercased query on whitespace only, skipping the punctuation stripping and hyphen/underscore/slash splitting that tokenize() applies. This caused dirty tokens like pattern? or whats to form bigrams that never matched against clean document text, silently disabling partial phrase boost for punctuated queries. Apply the same replace/split pipeline as tokenize (minus stemming) so bigram matching works correctly. --- src/search/bm25.js | 2 +- workers/src/bm25.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search/bm25.js b/src/search/bm25.js index 11338b8..36786ee 100644 --- a/src/search/bm25.js +++ b/src/search/bm25.js @@ -81,7 +81,7 @@ export function searchBM25(index, query, limit = 5) { // Pre-compute phrase matching inputs once, outside the per-doc loop. const queryLower = query.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); + const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); const scores = []; diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts index d07defb..f1aea92 100644 --- a/workers/src/bm25.ts +++ b/workers/src/bm25.ts @@ -102,7 +102,7 @@ export function searchBM25( // Pre-compute phrase matching inputs once, outside the per-doc loop. const queryLower = query.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); + const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w)); const scores: Array<{ id: string; score: number }> = [];