From f40d548fda70dd326ee4dd7f5ba10e34302153eb Mon Sep 17 00:00:00 2001
From: oddkit-agent <agent@oddkit>
Date: Thu, 9 Apr 2026 13:08:25 +0000
Subject: [PATCH 1/3] fix: BM25 phrase boost + KV index freshness verification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug 1 (workers/src/bm25.ts, src/search/bm25.js):
BM25 scored every query token independently, letting high-frequency
terms like 'pattern' dilute rare-but-precise ones like 'vodka',
pushing exact-title matches down the rankings.

Fix: store originalText on BM25Doc during buildBM25Index, then after
BM25 scoring apply at most one phrase boost in searchBM25:
  - +5.0 (PHRASE_BOOST_EXACT)   if the full lowercased query appears
    as a substring of the doc's lowercased original text
  - +2.0 (PHRASE_BOOST_PARTIAL) otherwise, if any consecutive word
    bigram from the query appears in the doc text (first hit wins)

These boosts supplement BM25; they never replace it. Applied to both
the Worker TypeScript version and the Node/stdio JS version for
consistency.

Bug 2 (workers/src/zip-baseline-fetcher.ts):
Cloudflare KV is eventually consistent — two requests seconds apart
can hit different edge nodes and return stale cached indexes even
when the SHA-keyed cache key looks valid.

Fix: after a KV cache hit in getIndex(), cross-check the cached
index's embedded commit_sha / canon_commit_sha against the SHAs just
resolved from the GitHub API (a SHA that was not resolved is skipped).
If they diverge, the entry is stale: log a warning, discard it, and
rebuild from source.
---
 src/search/bm25.js                  | 32 ++++++++++++++++++++++++++-
 workers/src/bm25.ts                 | 34 ++++++++++++++++++++++++++++-
 workers/src/zip-baseline-fetcher.ts | 18 +++++++++++++--
 3 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/src/search/bm25.js b/src/search/bm25.js
index 5064d25..b8718cd 100644
--- a/src/search/bm25.js
+++ b/src/search/bm25.js
@@ -48,7 +48,7 @@ export function buildBM25Index(documents) {
 
   for (const doc of documents) {
     const terms = tokenize(doc.text);
-    docs.push({ id: doc.id, terms, length: terms.length });
+    docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
     totalLength += terms.length;
 
     const seen = new Set();
@@ -68,11 +68,21 @@ export function buildBM25Index(documents) {
   };
 }
 
+// Phrase boost constants — supplement BM25, never replace it.
+// Exact: full query string found as substring in doc text.
+// Partial: any consecutive two-word query bigram found in doc text.
+const PHRASE_BOOST_EXACT = 5.0;
+const PHRASE_BOOST_PARTIAL = 2.0;
+
 /** Search BM25 index, return sorted {id, score} pairs */
 export function searchBM25(index, query, limit = 5) {
   const queryTerms = tokenize(query);
   if (queryTerms.length === 0) return [];
 
+  // Pre-compute phrase matching inputs once, outside the per-doc loop.
+  const queryLower = query.toLowerCase();
+  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1);
+
   const scores = [];
 
   for (const doc of index.docs) {
@@ -96,6 +106,26 @@ export function searchBM25(index, query, limit = 5) {
       score += idf * tfNorm;
     }
 
+    // Phrase boost: BM25 treats every query token independently, which lets
+    // high-frequency terms dilute rare-but-important ones (e.g. "pattern"
+    // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether
+    // the original query phrase appears verbatim — or as a bigram — in the
+    // document's original text rescues those precise title/tag matches.
+    const docLower = doc.originalText.toLowerCase();
+    if (docLower.includes(queryLower)) {
+      // Full query is a substring of the doc text — strong exact match.
+      score += PHRASE_BOOST_EXACT;
+    } else if (queryWords.length >= 2) {
+      // Scan every consecutive word pair in the query; first hit wins.
+      for (let i = 0; i < queryWords.length - 1; i++) {
+        const bigram = queryWords[i] + " " + queryWords[i + 1];
+        if (docLower.includes(bigram)) {
+          score += PHRASE_BOOST_PARTIAL;
+          break;
+        }
+      }
+    }
+
     if (score > 0) scores.push({ id: doc.id, score });
   }
 
diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts
index 68c2d60..c4cb345 100644
--- a/workers/src/bm25.ts
+++ b/workers/src/bm25.ts
@@ -44,6 +44,8 @@ export interface BM25Doc {
   id: string;
   terms: string[];
   length: number;
+  /** Original (pre-tokenization) text, used for phrase-level scoring. */
+  originalText: string;
 }
 
 export interface BM25Index {
@@ -63,7 +65,7 @@ export function buildBM25Index(
 
   for (const doc of documents) {
     const terms = tokenize(doc.text);
-    docs.push({ id: doc.id, terms, length: terms.length });
+    docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
     totalLength += terms.length;
 
     const seen = new Set<string>();
@@ -83,6 +85,12 @@ export function buildBM25Index(
   };
 }
 
+// Phrase boost constants — supplement BM25, never replace it.
+// Exact: full query string found as substring in doc text.
+// Partial: any consecutive two-word query bigram found in doc text.
+const PHRASE_BOOST_EXACT = 5.0;
+const PHRASE_BOOST_PARTIAL = 2.0;
+
 /** Search BM25 index, return sorted {id, score} pairs */
 export function searchBM25(
   index: BM25Index,
@@ -92,6 +100,10 @@ export function searchBM25(
   const queryTerms = tokenize(query);
   if (queryTerms.length === 0) return [];
 
+  // Pre-compute phrase matching inputs once, outside the per-doc loop.
+  const queryLower = query.toLowerCase();
+  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1);
+
   const scores: Array<{ id: string; score: number }> = [];
 
   for (const doc of index.docs) {
@@ -119,6 +131,26 @@ export function searchBM25(
       score += idf * tfNorm;
     }
 
+    // Phrase boost: BM25 treats every query token independently, which lets
+    // high-frequency terms dilute rare-but-important ones (e.g. "pattern"
+    // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether
+    // the original query phrase appears verbatim — or as a bigram — in the
+    // document's original text rescues those precise title/tag matches.
+    const docLower = doc.originalText.toLowerCase();
+    if (docLower.includes(queryLower)) {
+      // Full query is a substring of the doc text — strong exact match.
+      score += PHRASE_BOOST_EXACT;
+    } else if (queryWords.length >= 2) {
+      // Scan every consecutive word pair in the query; first hit wins.
+      for (let i = 0; i < queryWords.length - 1; i++) {
+        const bigram = queryWords[i] + " " + queryWords[i + 1];
+        if (docLower.includes(bigram)) {
+          score += PHRASE_BOOST_PARTIAL;
+          break;
+        }
+      }
+    }
+
     if (score > 0) scores.push({ id: doc.id, score });
   }
 
diff --git a/workers/src/zip-baseline-fetcher.ts b/workers/src/zip-baseline-fetcher.ts
index 8fe60ea..a79da72 100644
--- a/workers/src/zip-baseline-fetcher.ts
+++ b/workers/src/zip-baseline-fetcher.ts
@@ -760,8 +760,22 @@ export class ZipBaselineFetcher {
     if (this.env.BASELINE_CACHE) {
       const cached = await this.env.BASELINE_CACHE.get(cacheKey, "json") as BaselineIndex | null;
       if (cached) {
-        // Content-addressed cache hit: SHA matches, content is truthful
-        return cached;
+        // Cloudflare KV is eventually consistent — two requests seconds apart
+        // can hit different edge nodes and return stale data even when the
+        // cache key looks correct. Cross-check the cached index's embedded
+        // commit SHAs against the SHAs we just resolved from the GitHub API.
+        // If they diverge, the cached entry is stale; discard and rebuild.
+        const baselineShaMatch = !baselineSha || cached.commit_sha === baselineSha;
+        const canonShaMatch = !canonSha || cached.canon_commit_sha === canonSha;
+        if (baselineShaMatch && canonShaMatch) {
+          // Content-addressed cache hit: SHA verified, content is truthful.
+          return cached;
+        }
+        console.warn(
+          `KV cache SHA mismatch — discarding stale index. ` +
+          `cached=${cached.commit_sha}/${cached.canon_commit_sha} ` +
+          `resolved=${baselineSha}/${canonSha ?? "none"}`
+        );
       }
     }
 

From 44aa00454bc11e45c23e3815284b1f0cb0baee5e Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 9 Apr 2026 13:23:05 +0000
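Subject: [PATCH 2/3] Fix phrase boost: guard behind positive BM25 score and
 filter stop words from bigrams

The phrase boost was added before the `score > 0` check, so a document
with no BM25 term match could still enter the results on the substring
boost alone. Apply the boost only when the BM25 score is already
positive. Also drop stop words from queryWords so that bigrams are not
built from them.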
---
 src/search/bm25.js  | 31 ++++++++++++++-----------------
 workers/src/bm25.ts | 31 ++++++++++++++-----------------
 2 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/src/search/bm25.js b/src/search/bm25.js
index b8718cd..11338b8 100644
--- a/src/search/bm25.js
+++ b/src/search/bm25.js
@@ -81,7 +81,7 @@ export function searchBM25(index, query, limit = 5) {
 
   // Pre-compute phrase matching inputs once, outside the per-doc loop.
   const queryLower = query.toLowerCase();
-  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1);
+  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
 
   const scores = [];
 
@@ -106,22 +106,19 @@ export function searchBM25(index, query, limit = 5) {
       score += idf * tfNorm;
     }
 
-    // Phrase boost: BM25 treats every query token independently, which lets
-    // high-frequency terms dilute rare-but-important ones (e.g. "pattern"
-    // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether
-    // the original query phrase appears verbatim — or as a bigram — in the
-    // document's original text rescues those precise title/tag matches.
-    const docLower = doc.originalText.toLowerCase();
-    if (docLower.includes(queryLower)) {
-      // Full query is a substring of the doc text — strong exact match.
-      score += PHRASE_BOOST_EXACT;
-    } else if (queryWords.length >= 2) {
-      // Scan every consecutive word pair in the query; first hit wins.
-      for (let i = 0; i < queryWords.length - 1; i++) {
-        const bigram = queryWords[i] + " " + queryWords[i + 1];
-        if (docLower.includes(bigram)) {
-          score += PHRASE_BOOST_PARTIAL;
-          break;
+    // Phrase boost: supplement BM25 — never replace it.
+    // Only apply when the document already has genuine BM25 relevance.
+    if (score > 0) {
+      const docLower = doc.originalText.toLowerCase();
+      if (docLower.includes(queryLower)) {
+        score += PHRASE_BOOST_EXACT;
+      } else if (queryWords.length >= 2) {
+        for (let i = 0; i < queryWords.length - 1; i++) {
+          const bigram = queryWords[i] + " " + queryWords[i + 1];
+          if (docLower.includes(bigram)) {
+            score += PHRASE_BOOST_PARTIAL;
+            break;
+          }
         }
       }
     }
diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts
index c4cb345..d07defb 100644
--- a/workers/src/bm25.ts
+++ b/workers/src/bm25.ts
@@ -102,7 +102,7 @@ export function searchBM25(
 
   // Pre-compute phrase matching inputs once, outside the per-doc loop.
   const queryLower = query.toLowerCase();
-  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1);
+  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
 
   const scores: Array<{ id: string; score: number }> = [];
 
@@ -131,22 +131,19 @@ export function searchBM25(
       score += idf * tfNorm;
     }
 
-    // Phrase boost: BM25 treats every query token independently, which lets
-    // high-frequency terms dilute rare-but-important ones (e.g. "pattern"
-    // drowning out "vodka" in "Vodka Architecture pattern"). Checking whether
-    // the original query phrase appears verbatim — or as a bigram — in the
-    // document's original text rescues those precise title/tag matches.
-    const docLower = doc.originalText.toLowerCase();
-    if (docLower.includes(queryLower)) {
-      // Full query is a substring of the doc text — strong exact match.
-      score += PHRASE_BOOST_EXACT;
-    } else if (queryWords.length >= 2) {
-      // Scan every consecutive word pair in the query; first hit wins.
-      for (let i = 0; i < queryWords.length - 1; i++) {
-        const bigram = queryWords[i] + " " + queryWords[i + 1];
-        if (docLower.includes(bigram)) {
-          score += PHRASE_BOOST_PARTIAL;
-          break;
+    // Phrase boost: supplement BM25 — never replace it.
+    // Only apply when the document already has genuine BM25 relevance.
+    if (score > 0) {
+      const docLower = doc.originalText.toLowerCase();
+      if (docLower.includes(queryLower)) {
+        score += PHRASE_BOOST_EXACT;
+      } else if (queryWords.length >= 2) {
+        for (let i = 0; i < queryWords.length - 1; i++) {
+          const bigram = queryWords[i] + " " + queryWords[i + 1];
+          if (docLower.includes(bigram)) {
+            score += PHRASE_BOOST_PARTIAL;
+            break;
+          }
         }
       }
     }

From 3e0dd25e14a417e79a66a4784c040668adcbb734 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 9 Apr 2026 13:35:08 +0000
Subject: [PATCH 3/3] fix: normalize queryWords with same punctuation/split
 pipeline as tokenize

queryWords was built by splitting the raw lowercased query on whitespace
only, skipping the punctuation stripping and hyphen/underscore/slash
splitting that tokenize() applies. This left dirty tokens such as
"pattern?" or "what's" in the bigrams, which then never matched against
clean document text, silently disabling the partial phrase boost for
punctuated queries. Apply the same replace/split pipeline as tokenize()
(minus stemming) so bigram matching works correctly.
---
 src/search/bm25.js  | 2 +-
 workers/src/bm25.ts | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search/bm25.js b/src/search/bm25.js
index 11338b8..36786ee 100644
--- a/src/search/bm25.js
+++ b/src/search/bm25.js
@@ -81,7 +81,7 @@ export function searchBM25(index, query, limit = 5) {
 
   // Pre-compute phrase matching inputs once, outside the per-doc loop.
   const queryLower = query.toLowerCase();
-  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+  const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
 
   const scores = [];
 
diff --git a/workers/src/bm25.ts b/workers/src/bm25.ts
index d07defb..f1aea92 100644
--- a/workers/src/bm25.ts
+++ b/workers/src/bm25.ts
@@ -102,7 +102,7 @@ export function searchBM25(
 
   // Pre-compute phrase matching inputs once, outside the per-doc loop.
   const queryLower = query.toLowerCase();
-  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+  const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
 
   const scores: Array<{ id: string; score: number }> = [];