From 0274ffd9e3b42f13eecb3347eb7d3dfc2b88a130 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 5 Feb 2026 11:19:00 -0800 Subject: [PATCH] fix: FTS flat search drops rows when avg_doc_length < 1.0 Two integer arithmetic bugs in BM25 scoring caused scores to be 0 for unindexed rows when indexed data has fractional average document length (e.g. single-word stop words). Rows with score 0 are filtered out, silently dropping results. 1. `MemBM25Scorer::avg_doc_length()` used integer division, truncating values < 1.0 to 0. 2. `flat_bm25_search_stream` reconstructed `total_tokens` by casting the float avg back to u64, losing precision. Fixes #5871 Co-Authored-By: Claude Opus 4.5 --- rust/lance-index/src/scalar/inverted/index.rs | 2 +- .../lance-index/src/scalar/inverted/scorer.rs | 8 ++-- rust/lance/src/dataset/tests/dataset_index.rs | 44 +++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index dab83131f8a..78ca78b50f7 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -2538,7 +2538,7 @@ pub fn flat_bm25_search_stream( token_docs.insert(token.clone(), token_nq); } MemBM25Scorer::new( - index_bm25_scorer.avg_doc_length() as u64 * index_bm25_scorer.num_docs() as u64, + index_bm25_scorer.total_tokens(), index_bm25_scorer.num_docs(), token_docs, ) diff --git a/rust/lance-index/src/scalar/inverted/scorer.rs b/rust/lance-index/src/scalar/inverted/scorer.rs index 4f38f03d712..33359ff003f 100644 --- a/rust/lance-index/src/scalar/inverted/scorer.rs +++ b/rust/lance-index/src/scalar/inverted/scorer.rs @@ -57,7 +57,7 @@ impl MemBM25Scorer { } pub fn avg_doc_length(&self) -> f32 { - (self.total_tokens / self.num_docs as u64) as f32 + self.total_tokens as f32 / self.num_docs as f32 } pub fn num_docs_containing_token(&self, token: &str) -> usize { @@ -71,6 +71,7 @@ impl MemBM25Scorer { pub struct 
IndexBM25Scorer<'a> { partitions: Vec<&'a InvertedPartition>, num_docs: usize, + total_tokens: u64, avg_doc_length: f32, } @@ -86,6 +87,7 @@ impl<'a> IndexBM25Scorer<'a> { Self { partitions, num_docs, + total_tokens, avg_doc_length: avgdl, } } @@ -94,8 +96,8 @@ impl<'a> IndexBM25Scorer<'a> { self.num_docs } - pub fn avg_doc_length(&self) -> f32 { - self.avg_doc_length + pub fn total_tokens(&self) -> u64 { + self.total_tokens } pub fn num_docs_containing_token(&self, token: &str) -> usize { diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index 62a4302701a..9ee0144af67 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -927,6 +927,50 @@ async fn test_fts_unindexed_data() { assert_eq!(results.num_rows(), 1); } +#[tokio::test] +async fn test_fts_unindexed_data_with_stop_words() { + // When indexed data has avg_doc_length < 1.0 (e.g. single-word stop words + // that get filtered), the BM25 scorer must still produce non-zero scores + // for unindexed rows. Regression test for #5871. 
+ let params = InvertedIndexParams::default(); + let text_col = StringArray::from(vec!["a", "is", "the", "bug"]); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(), + vec![Arc::new(text_col) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + let mut dataset = Dataset::write(batches, "memory://stop_words.lance", None) + .await + .unwrap(); + dataset + .create_index(&["text"], IndexType::Inverted, None, &params, true) + .await + .unwrap(); + + // Append unindexed rows with a term not in the index + let unindexed: Vec<String> = (0..10).map(|i| format!("hello_{i}")).collect(); + let text_col = StringArray::from(unindexed); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(), + vec![Arc::new(text_col) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + dataset.append(batches, None).await.unwrap(); + + let results = dataset + .scan() + .full_text_search(FullTextSearchQuery::new("hello".to_owned())) + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(results.num_rows(), 10); +} + #[tokio::test] async fn test_fts_unindexed_data_on_empty_index() { // Empty dataset with fts index