Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rust/lance-index/src/scalar/inverted/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2538,7 +2538,7 @@ pub fn flat_bm25_search_stream(
token_docs.insert(token.clone(), token_nq);
}
MemBM25Scorer::new(
index_bm25_scorer.avg_doc_length() as u64 * index_bm25_scorer.num_docs() as u64,
index_bm25_scorer.total_tokens(),
index_bm25_scorer.num_docs(),
token_docs,
)
Expand Down
8 changes: 5 additions & 3 deletions rust/lance-index/src/scalar/inverted/scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ impl MemBM25Scorer {
}

pub fn avg_doc_length(&self) -> f32 {
(self.total_tokens / self.num_docs as u64) as f32
self.total_tokens as f32 / self.num_docs as f32
}

pub fn num_docs_containing_token(&self, token: &str) -> usize {
Expand All @@ -71,6 +71,7 @@ impl MemBM25Scorer {
/// Scorer state built over a set of borrowed `InvertedPartition`s.
///
/// Alongside the partitions it carries three statistics: a document count,
/// a token count and an average document length.
/// NOTE(review): the statistics are presumably aggregated across all
/// partitions; the constructor body is only partly visible here, so confirm.
pub struct IndexBM25Scorer<'a> {
/// Borrowed partitions; the scorer cannot outlive them (lifetime `'a`).
partitions: Vec<&'a InvertedPartition>,
/// Document count, returned unchanged by `num_docs()`.
num_docs: usize,
/// Token count, returned unchanged by `total_tokens()`. Keeping the exact
/// integer lets callers seed another scorer without reconstructing it from
/// a rounded average.
total_tokens: u64,
/// Average document length, stored from the constructor's `avgdl` value.
/// NOTE(review): presumably `total_tokens / num_docs`; confirm in the constructor.
avg_doc_length: f32,
}

Expand All @@ -86,6 +87,7 @@ impl<'a> IndexBM25Scorer<'a> {
Self {
partitions,
num_docs,
total_tokens,
avg_doc_length: avgdl,
}
}
Expand All @@ -94,8 +96,8 @@ impl<'a> IndexBM25Scorer<'a> {
self.num_docs
}

pub fn avg_doc_length(&self) -> f32 {
self.avg_doc_length
pub fn total_tokens(&self) -> u64 {
self.total_tokens
}

pub fn num_docs_containing_token(&self, token: &str) -> usize {
Expand Down
44 changes: 44 additions & 0 deletions rust/lance/src/dataset/tests/dataset_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,50 @@ async fn test_fts_unindexed_data() {
assert_eq!(results.num_rows(), 1);
}

#[tokio::test]
async fn test_fts_unindexed_data_with_stop_words() {
// When indexed data has avg_doc_length < 1.0 (e.g. single-word stop words
// that get filtered), the BM25 scorer must still produce non-zero scores
// for unindexed rows. Regression test for #5871.
let params = InvertedIndexParams::default();
let text_col = StringArray::from(vec!["a", "is", "the", "bug"]);
let batch = RecordBatch::try_new(
arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(),
vec![Arc::new(text_col) as ArrayRef],
)
.unwrap();
let schema = batch.schema();
let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
let mut dataset = Dataset::write(batches, "memory://stop_words.lance", None)
.await
.unwrap();
dataset
.create_index(&["text"], IndexType::Inverted, None, &params, true)
.await
.unwrap();

// Append unindexed rows with a term not in the index
let unindexed: Vec<String> = (0..10).map(|i| format!("hello_{i}")).collect();
let text_col = StringArray::from(unindexed);
let batch = RecordBatch::try_new(
arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(),
vec![Arc::new(text_col) as ArrayRef],
)
.unwrap();
let schema = batch.schema();
let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
dataset.append(batches, None).await.unwrap();

let results = dataset
.scan()
.full_text_search(FullTextSearchQuery::new("hello".to_owned()))
.unwrap()
.try_into_batch()
.await
.unwrap();
assert_eq!(results.num_rows(), 10);
}

#[tokio::test]
async fn test_fts_unindexed_data_on_empty_index() {
// Empty dataset with fts index
Expand Down
Loading