diff --git a/src/search/bm25.js b/src/search/bm25.js
index 5064d25..36786ee 100644
--- a/src/search/bm25.js
+++ b/src/search/bm25.js
@@ -48,7 +48,7 @@ export function buildBM25Index(documents) {
 
   for (const doc of documents) {
     const terms = tokenize(doc.text);
-    docs.push({ id: doc.id, terms, length: terms.length });
+    docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
     totalLength += terms.length;
 
     const seen = new Set();
@@ -68,11 +68,21 @@ export function buildBM25Index(documents) {
   };
 }
 
+// Phrase boost constants — supplement BM25, never replace it.
+// Exact: trimmed, lowercased query string found as a substring of the lowercased doc text.
+// Partial: any two adjacent query words (after stop-word filtering) found as "w1 w2" in the doc text.
+const PHRASE_BOOST_EXACT = 5.0;
+const PHRASE_BOOST_PARTIAL = 2.0;
+
 /** Search BM25 index, return sorted {id, score} pairs */
 export function searchBM25(index, query, limit = 5) {
   const queryTerms = tokenize(query);
   if (queryTerms.length === 0) return [];
 
+  // Pre-compute phrase matching inputs once, outside the per-doc loop (trimmed so outer whitespace cannot defeat the exact match).
+  const queryLower = query.toLowerCase().trim();
+  const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+
   const scores = [];
 
   for (const doc of index.docs) {
@@ -96,6 +106,23 @@ export function searchBM25(index, query, limit = 5) {
       score += idf * tfNorm;
     }
 
+    // Phrase boost: supplement BM25 — never replace it.
+    // Only apply when the doc already has BM25 relevance; docs without originalText (e.g. an index built before this field existed) keep their plain BM25 score.
+    if (score > 0 && typeof doc.originalText === "string") {
+      const docLower = doc.originalText.toLowerCase();
+      if (docLower.includes(queryLower)) {
+        score += PHRASE_BOOST_EXACT;
+      } else if (queryWords.length >= 2) {
+        for (let i = 0; i < queryWords.length - 1; i++) {
+          const bigram = queryWords[i] + " " + queryWords[i + 1];
+          if (docLower.includes(bigram)) {
+            score += PHRASE_BOOST_PARTIAL;
+            break;
+          }
+        }
+      }
+    }
+
     if (score > 0) scores.push({ id: doc.id, score });
   }
 
diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts
index 68c2d60..f1aea92 100644
--- a/workers/src/bm25.ts
+++ b/workers/src/bm25.ts
@@ -44,6 +44,8 @@ export interface BM25Doc {
   id: string;
   terms: string[];
   length: number;
+  /** Original (pre-tokenization) text, used for phrase-level scoring. */
+  originalText: string;
 }
 
 export interface BM25Index {
@@ -63,7 +65,7 @@ export function buildBM25Index(
 
   for (const doc of documents) {
     const terms = tokenize(doc.text);
-    docs.push({ id: doc.id, terms, length: terms.length });
+    docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
     totalLength += terms.length;
 
     const seen = new Set();
@@ -83,6 +85,12 @@ export function buildBM25Index(
   };
 }
 
+// Phrase boost constants — supplement BM25, never replace it.
+// Exact: trimmed, lowercased query string found as a substring of the lowercased doc text.
+// Partial: any two adjacent query words (after stop-word filtering) found as "w1 w2" in the doc text.
+const PHRASE_BOOST_EXACT = 5.0;
+const PHRASE_BOOST_PARTIAL = 2.0;
+
 /** Search BM25 index, return sorted {id, score} pairs */
 export function searchBM25(
   index: BM25Index,
@@ -92,6 +100,10 @@ export function searchBM25(
   const queryTerms = tokenize(query);
   if (queryTerms.length === 0) return [];
 
+  // Pre-compute phrase matching inputs once, outside the per-doc loop (trimmed so outer whitespace cannot defeat the exact match).
+  const queryLower = query.toLowerCase().trim();
+  const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+
   const scores: Array<{ id: string; score: number }> = [];
 
   for (const doc of index.docs) {
@@ -119,6 +131,23 @@ export function searchBM25(
       score += idf * tfNorm;
     }
 
+    // Phrase boost: supplement BM25 — never replace it.
+    // Only apply when the doc already has BM25 relevance; docs without originalText (e.g. an index built before this field existed) keep their plain BM25 score.
+    if (score > 0 && typeof doc.originalText === "string") {
+      const docLower = doc.originalText.toLowerCase();
+      if (docLower.includes(queryLower)) {
+        score += PHRASE_BOOST_EXACT;
+      } else if (queryWords.length >= 2) {
+        for (let i = 0; i < queryWords.length - 1; i++) {
+          const bigram = queryWords[i] + " " + queryWords[i + 1];
+          if (docLower.includes(bigram)) {
+            score += PHRASE_BOOST_PARTIAL;
+            break;
+          }
+        }
+      }
+    }
+
     if (score > 0) scores.push({ id: doc.id, score });
   }
 
diff --git a/workers/src/zip-baseline-fetcher.ts b/workers/src/zip-baseline-fetcher.ts
index 8fe60ea..a79da72 100644
--- a/workers/src/zip-baseline-fetcher.ts
+++ b/workers/src/zip-baseline-fetcher.ts
@@ -760,8 +760,22 @@ export class ZipBaselineFetcher {
     if (this.env.BASELINE_CACHE) {
       const cached = await this.env.BASELINE_CACHE.get(cacheKey, "json") as BaselineIndex | null;
       if (cached) {
-        // Content-addressed cache hit: SHA matches, content is truthful
-        return cached;
+        // Cloudflare KV is eventually consistent — two requests seconds apart
+        // can hit different edge nodes and return stale data even when the
+        // cache key looks correct. Cross-check the cached index's embedded
+        // commit SHAs against the SHAs we just resolved from the GitHub API.
+        // If they diverge, the cached entry is stale; discard and rebuild.
+        const baselineShaMatch = !baselineSha || cached.commit_sha === baselineSha;
+        const canonShaMatch = !canonSha || cached.canon_commit_sha === canonSha;
+        if (baselineShaMatch && canonShaMatch) {
+          // Content-addressed cache hit: SHA verified, content is truthful.
+          return cached;
+        }
+        console.warn(
+          `KV cache SHA mismatch — discarding stale index. ` +
+          `cached=${cached.commit_sha}/${cached.canon_commit_sha} ` +
+          `resolved=${baselineSha ?? "none"}/${canonSha ?? "none"}`
+        );
       }
     }
 