Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion src/search/bm25.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ export function buildBM25Index(documents) {

for (const doc of documents) {
const terms = tokenize(doc.text);
docs.push({ id: doc.id, terms, length: terms.length });
docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
totalLength += terms.length;

const seen = new Set();
Expand All @@ -68,11 +68,21 @@ export function buildBM25Index(documents) {
};
}

// Phrase boost constants — supplement BM25, never replace it.
// Each is a flat amount added to a document's BM25 score, and only when that
// score is already > 0. At most one of the two is applied per document.
// Exact: the lowercased query string is found as a substring of the
//   lowercased doc text.
// Partial: otherwise, at least one pair of neighbouring query words is found
//   in the lowercased doc text. Query words are what remains after punctuation
//   is stripped and words of length <= 1 or in STOP_WORDS are dropped; each
//   pair is joined by a single space. The boost is added once, however many
//   pairs match.
// NOTE(review): because filtering happens before pairing, a pair can join two
//   words that were not adjacent in the original query, and it only matches
//   doc text where the two words are separated by exactly one space.
const PHRASE_BOOST_EXACT = 5.0;
const PHRASE_BOOST_PARTIAL = 2.0;

/** Search BM25 index, return sorted {id, score} pairs */
export function searchBM25(index, query, limit = 5) {
const queryTerms = tokenize(query);
if (queryTerms.length === 0) return [];

// Pre-compute phrase matching inputs once, outside the per-doc loop.
const queryLower = query.toLowerCase();
const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));

const scores = [];

for (const doc of index.docs) {
Expand All @@ -96,6 +106,23 @@ export function searchBM25(index, query, limit = 5) {
score += idf * tfNorm;
}

// Phrase boost: supplement BM25 — never replace it.
// Only apply when the document already has genuine BM25 relevance.
if (score > 0) {
const docLower = doc.originalText.toLowerCase();
if (docLower.includes(queryLower)) {
score += PHRASE_BOOST_EXACT;
} else if (queryWords.length >= 2) {
for (let i = 0; i < queryWords.length - 1; i++) {
const bigram = queryWords[i] + " " + queryWords[i + 1];
if (docLower.includes(bigram)) {
score += PHRASE_BOOST_PARTIAL;
break;
}
}
}
}

if (score > 0) scores.push({ id: doc.id, score });
}

Expand Down
31 changes: 30 additions & 1 deletion workers/src/bm25.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ export interface BM25Doc {
id: string;
terms: string[];
length: number;
/** Original (pre-tokenization) text, used for phrase-level scoring. */
originalText: string;
}

export interface BM25Index {
Expand All @@ -63,7 +65,7 @@ export function buildBM25Index(

for (const doc of documents) {
const terms = tokenize(doc.text);
docs.push({ id: doc.id, terms, length: terms.length });
docs.push({ id: doc.id, terms, length: terms.length, originalText: doc.text });
totalLength += terms.length;

const seen = new Set<string>();
Expand All @@ -83,6 +85,12 @@ export function buildBM25Index(
};
}

// Phrase boost constants — supplement BM25, never replace it.
// Each is a flat amount added to a document's BM25 score, and only when that
// score is already > 0. At most one of the two is applied per document.
// Exact: the lowercased query string is found as a substring of the
//   lowercased doc text.
// Partial: otherwise, at least one pair of neighbouring query words is found
//   in the lowercased doc text. Query words are what remains after punctuation
//   is stripped and words of length <= 1 or in STOP_WORDS are dropped; each
//   pair is joined by a single space. The boost is added once, however many
//   pairs match.
// NOTE(review): because filtering happens before pairing, a pair can join two
//   words that were not adjacent in the original query, and it only matches
//   doc text where the two words are separated by exactly one space.
const PHRASE_BOOST_EXACT = 5.0;
const PHRASE_BOOST_PARTIAL = 2.0;

/** Search BM25 index, return sorted {id, score} pairs */
export function searchBM25(
index: BM25Index,
Expand All @@ -92,6 +100,10 @@ export function searchBM25(
const queryTerms = tokenize(query);
if (queryTerms.length === 0) return [];

// Pre-compute phrase matching inputs once, outside the per-doc loop.
const queryLower = query.toLowerCase();
const queryWords = queryLower.replace(/[^\w\s-]/g, " ").split(/[\s\-_/]+/).filter((w) => w.length > 1 && !STOP_WORDS.has(w));

const scores: Array<{ id: string; score: number }> = [];

for (const doc of index.docs) {
Expand Down Expand Up @@ -119,6 +131,23 @@ export function searchBM25(
score += idf * tfNorm;
}

// Phrase boost: supplement BM25 — never replace it.
// Only apply when the document already has genuine BM25 relevance.
if (score > 0) {
const docLower = doc.originalText.toLowerCase();
if (docLower.includes(queryLower)) {
score += PHRASE_BOOST_EXACT;
} else if (queryWords.length >= 2) {
for (let i = 0; i < queryWords.length - 1; i++) {
const bigram = queryWords[i] + " " + queryWords[i + 1];
if (docLower.includes(bigram)) {
score += PHRASE_BOOST_PARTIAL;
break;
}
}
}
}

Comment thread
cursor[bot] marked this conversation as resolved.
if (score > 0) scores.push({ id: doc.id, score });
}

Expand Down
18 changes: 16 additions & 2 deletions workers/src/zip-baseline-fetcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -760,8 +760,22 @@ export class ZipBaselineFetcher {
if (this.env.BASELINE_CACHE) {
const cached = await this.env.BASELINE_CACHE.get(cacheKey, "json") as BaselineIndex | null;
if (cached) {
// Content-addressed cache hit: SHA matches, content is truthful
return cached;
// Cloudflare KV is eventually consistent — two requests seconds apart
// can hit different edge nodes and return stale data even when the
// cache key looks correct. Cross-check the cached index's embedded
// commit SHAs against the SHAs we just resolved from the GitHub API.
// If they diverge, the cached entry is stale; discard and rebuild.
const baselineShaMatch = !baselineSha || cached.commit_sha === baselineSha;
const canonShaMatch = !canonSha || cached.canon_commit_sha === canonSha;
if (baselineShaMatch && canonShaMatch) {
// Content-addressed cache hit: SHA verified, content is truthful.
return cached;
}
console.warn(
`KV cache SHA mismatch — discarding stale index. ` +
`cached=${cached.commit_sha}/${cached.canon_commit_sha} ` +
`resolved=${baselineSha}/${canonSha ?? "none"}`
);
}
}

Expand Down
Loading