
Commit 3e0dd25
fix: normalize queryWords with same punctuation/split pipeline as tokenize
`queryWords` was built by splitting the raw lowercased query on whitespace only, skipping the punctuation stripping and the hyphen/underscore/slash splitting that `tokenize()` applies. This caused dirty tokens like `pattern?` or `whats` to form bigrams that never matched against clean document text, silently disabling the partial phrase boost for punctuated queries. Apply the same replace/split pipeline as `tokenize()` (minus stemming) so bigram matching works correctly.
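A minimal sketch of the before/after behavior. The query string and the small `STOP_WORDS` set here are illustrative only (the real set lives in the module), but the two pipelines match the old and new `queryWords` expressions:

```javascript
// Illustrative stop-word set; the actual STOP_WORDS in bm25.js is larger.
const STOP_WORDS = new Set(["the", "a", "is"]);

// Old behavior: whitespace split only, punctuation left attached.
function queryWordsOld(query) {
  return query.toLowerCase()
    .split(/\s+/)
    .filter((w) => w.length > 1 && !STOP_WORDS.has(w));
}

// New behavior: same replace/split pipeline as tokenize(), minus stemming.
function queryWordsNew(query) {
  return query.toLowerCase()
    .replace(/[^\w\s-]/g, " ")  // strip punctuation (keep word chars, spaces, hyphens)
    .split(/[\s\-_/]+/)         // split on whitespace, hyphens, underscores, slashes
    .filter((w) => w.length > 1 && !STOP_WORDS.has(w));
}

console.log(queryWordsOld("What's the observer-pattern?"));
// dirty tokens that never match clean doc text: [ "what's", "observer-pattern?" ]
console.log(queryWordsNew("What's the observer-pattern?"));
// clean tokens that can form matching bigrams: [ "what", "observer", "pattern" ]
```

With the old pipeline, bigrams built from `what's`/`observer-pattern?` can never equal bigrams built from tokenized document text, so the phrase boost silently contributes nothing for punctuated queries.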
1 parent 44aa004 commit 3e0dd25

2 files changed: 2 additions & 2 deletions


src/search/bm25.js (1 addition, 1 deletion):

```diff
@@ -81,7 +81,7 @@ export function searchBM25(index, query, limit = 5) {
 
   // Pre-compute phrase matching inputs once, outside the per-doc loop.
   const queryLower = query.toLowerCase();
-  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+  const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
 
   const scores = [];
 
```

workers/src/bm25.ts (1 addition, 1 deletion):

```diff
@@ -102,7 +102,7 @@ export function searchBM25(
 
   // Pre-compute phrase matching inputs once, outside the per-doc loop.
   const queryLower = query.toLowerCase();
-  const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
+  const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
 
   const scores: Array<{ id: string; score: number }> = [];
 
```
