@@ -44,6 +44,8 @@ export interface BM25Doc {
4444 id : string ;
4545 terms : string [ ] ;
4646 length : number ;
47+ /** Original (pre-tokenization) text, used for phrase-level scoring. */
48+ originalText : string ;
4749}
4850
4951export interface BM25Index {
@@ -63,7 +65,7 @@ export function buildBM25Index(
6365
6466 for ( const doc of documents ) {
6567 const terms = tokenize ( doc . text ) ;
66- docs . push ( { id : doc . id , terms, length : terms . length } ) ;
68+ docs . push ( { id : doc . id , terms, length : terms . length , originalText : doc . text } ) ;
6769 totalLength += terms . length ;
6870
6971 const seen = new Set < string > ( ) ;
@@ -83,6 +85,12 @@ export function buildBM25Index(
8385 } ;
8486}
8587
88+ // Phrase boost constants — supplement BM25, never replace it.
89+ // Exact: full query string found as a case-insensitive substring of the doc text.
90+ // Partial: any two adjacent query words (after stop-word and single-character filtering), joined by one space, found in the doc text.
91+ const PHRASE_BOOST_EXACT = 5.0 ;
92+ const PHRASE_BOOST_PARTIAL = 2.0 ;
93+
8694/** Search BM25 index, return sorted {id, score} pairs */
8795export function searchBM25 (
8896 index : BM25Index ,
@@ -92,6 +100,10 @@ export function searchBM25(
92100 const queryTerms = tokenize ( query ) ;
93101 if ( queryTerms . length === 0 ) return [ ] ;
94102
103+ // Pre-compute phrase matching inputs once, outside the per-doc loop.
104+ const queryLower = query . toLowerCase ( ) ;
105+ const queryWords = queryLower . replace ( / [ ^ \w \s - ] / g, " " ) . split ( / [ \s \- _ / ] + / ) . filter ( ( w ) => w . length > 1 && ! STOP_WORDS . has ( w ) ) ;
106+
95107 const scores : Array < { id : string ; score : number } > = [ ] ;
96108
97109 for ( const doc of index . docs ) {
@@ -119,6 +131,23 @@ export function searchBM25(
119131 score += idf * tfNorm ;
120132 }
121133
134+ // Phrase boost: supplement BM25 — never replace it.
135+ // Only apply when the document already has genuine BM25 relevance.
136+ if ( score > 0 ) {
137+ const docLower = doc . originalText . toLowerCase ( ) ;
138+ if ( docLower . includes ( queryLower ) ) {
139+ score += PHRASE_BOOST_EXACT ;
140+ } else if ( queryWords . length >= 2 ) {
141+ for ( let i = 0 ; i < queryWords . length - 1 ; i ++ ) {
142+ const bigram = queryWords [ i ] + " " + queryWords [ i + 1 ] ;
143+ if ( docLower . includes ( bigram ) ) {
144+ score += PHRASE_BOOST_PARTIAL ;
145+ break ;
146+ }
147+ }
148+ }
149+ }
150+
122151 if ( score > 0 ) scores . push ( { id : doc . id , score } ) ;
123152 }
124153
0 commit comments