Skip to content

Commit b7826ba

Browse files
authored
Merge pull request #72 from klappy/fix/search-phrase-boost-and-index-freshness
2 parents 519edb6 + 3e0dd25 commit b7826ba

3 files changed

Lines changed: 74 additions & 4 deletions

File tree

src/search/bm25.js

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ export function buildBM25Index(documents) {
4848

4949
for (const doc of documents) {
5050
const terms = tokenize(doc.text);
51-
docs.push({ id: doc.id, terms, length: terms.length });
51+
docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
5252
totalLength += terms.length;
5353

5454
const seen = new Set();
@@ -68,11 +68,21 @@ export function buildBM25Index(documents) {
6868
};
6969
}
7070

71+
// Phrase boost constants — supplement BM25, never replace it.
72+
// Exact: full query string found, case-insensitively, as a substring of the doc text.
73+
// Partial: any two adjacent query words (after punctuation and stop-word filtering), joined by a single space, found case-insensitively in the doc text.
74+
const PHRASE_BOOST_EXACT = 5.0;
75+
const PHRASE_BOOST_PARTIAL = 2.0;
76+
7177
/** Search BM25 index, return sorted {id, score} pairs */
7278
export function searchBM25(index, query, limit = 5) {
7379
const queryTerms = tokenize(query);
7480
if (queryTerms.length === 0) return [];
7581

82+
// Pre-compute phrase matching inputs once, outside the per-doc loop.
83+
const queryLower = query.toLowerCase();
84+
const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
85+
7686
const scores = [];
7787

7888
for (const doc of index.docs) {
@@ -96,6 +106,23 @@ export function searchBM25(index, query, limit = 5) {
96106
score += idf * tfNorm;
97107
}
98108

109+
// Phrase boost: supplement BM25 — never replace it.
110+
// Only apply when the document already has genuine BM25 relevance.
111+
if (score > 0) {
112+
const docLower = doc.originalText.toLowerCase();
113+
if (docLower.includes(queryLower)) {
114+
score += PHRASE_BOOST_EXACT;
115+
} else if (queryWords.length >= 2) {
116+
for (let i = 0; i < queryWords.length - 1; i++) {
117+
const bigram = queryWords[i] + " " + queryWords[i + 1];
118+
if (docLower.includes(bigram)) {
119+
score += PHRASE_BOOST_PARTIAL;
120+
break;
121+
}
122+
}
123+
}
124+
}
125+
99126
if (score > 0) scores.push({ id: doc.id, score });
100127
}
101128

workers/src/bm25.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ export interface BM25Doc {
4444
id: string;
4545
terms: string[];
4646
length: number;
47+
/** Original (pre-tokenization) text, used for phrase-level scoring. */
48+
originalText: string;
4749
}
4850

4951
export interface BM25Index {
@@ -63,7 +65,7 @@ export function buildBM25Index(
6365

6466
for (const doc of documents) {
6567
const terms = tokenize(doc.text);
66-
docs.push({ id: doc.id, terms, length: terms.length });
68+
docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
6769
totalLength += terms.length;
6870

6971
const seen = new Set<string>();
@@ -83,6 +85,12 @@ export function buildBM25Index(
8385
};
8486
}
8587

88+
// Phrase boost constants — supplement BM25, never replace it.
89+
// Exact: full query string found, case-insensitively, as a substring of the doc text.
90+
// Partial: any two adjacent query words (after punctuation and stop-word filtering), joined by a single space, found case-insensitively in the doc text.
91+
const PHRASE_BOOST_EXACT = 5.0;
92+
const PHRASE_BOOST_PARTIAL = 2.0;
93+
8694
/** Search BM25 index, return sorted {id, score} pairs */
8795
export function searchBM25(
8896
index: BM25Index,
@@ -92,6 +100,10 @@ export function searchBM25(
92100
const queryTerms = tokenize(query);
93101
if (queryTerms.length === 0) return [];
94102

103+
// Pre-compute phrase matching inputs once, outside the per-doc loop.
104+
const queryLower = query.toLowerCase();
105+
const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));
106+
95107
const scores: Array<{ id: string; score: number }> = [];
96108

97109
for (const doc of index.docs) {
@@ -119,6 +131,23 @@ export function searchBM25(
119131
score += idf * tfNorm;
120132
}
121133

134+
// Phrase boost: supplement BM25 — never replace it.
135+
// Only apply when the document already has genuine BM25 relevance.
136+
if (score > 0) {
137+
const docLower = doc.originalText.toLowerCase();
138+
if (docLower.includes(queryLower)) {
139+
score += PHRASE_BOOST_EXACT;
140+
} else if (queryWords.length >= 2) {
141+
for (let i = 0; i < queryWords.length - 1; i++) {
142+
const bigram = queryWords[i] + " " + queryWords[i + 1];
143+
if (docLower.includes(bigram)) {
144+
score += PHRASE_BOOST_PARTIAL;
145+
break;
146+
}
147+
}
148+
}
149+
}
150+
122151
if (score > 0) scores.push({ id: doc.id, score });
123152
}
124153

workers/src/zip-baseline-fetcher.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -760,8 +760,22 @@ export class ZipBaselineFetcher {
760760
if (this.env.BASELINE_CACHE) {
761761
const cached = await this.env.BASELINE_CACHE.get(cacheKey, "json") as BaselineIndex | null;
762762
if (cached) {
763-
// Content-addressed cache hit: SHA matches, content is truthful
764-
return cached;
763+
// Cloudflare KV is eventually consistent — two requests seconds apart
764+
// can hit different edge nodes and return stale data even when the
765+
// cache key looks correct. Cross-check the cached index's embedded
766+
// commit SHAs against the SHAs we just resolved from the GitHub API.
767+
// If they diverge, the cached entry is stale; discard and rebuild.
768+
const baselineShaMatch = !baselineSha || cached.commit_sha === baselineSha;
769+
const canonShaMatch = !canonSha || cached.canon_commit_sha === canonSha;
770+
if (baselineShaMatch && canonShaMatch) {
771+
// Content-addressed cache hit: SHA verified, content is truthful.
772+
return cached;
773+
}
774+
console.warn(
775+
`KV cache SHA mismatch — discarding stale index. ` +
776+
`cached=${cached.commit_sha}/${cached.canon_commit_sha} ` +
777+
`resolved=${baselineSha}/${canonSha ?? "none"}`
778+
);
765779
}
766780
}
767781

0 commit comments

Comments
 (0)