From e7e188590c6b491bd2b087982825e4714a312ab9 Mon Sep 17 00:00:00 2001 From: stevie9868 Date: Mon, 9 Feb 2026 10:53:14 -0800 Subject: [PATCH 1/4] respect fragment restrictions in vector and FTS searches for unindex cases refactor refactor --- rust/lance/src/dataset/scanner.rs | 168 ++++++++++++++++++++++++++++-- 1 file changed, 158 insertions(+), 10 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index a9265517fe5..4bd951770ec 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -2938,8 +2938,29 @@ impl Scanner { .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts()) .await?; + // Get target fragments + let target_fragments = self + .fragments + .clone() + .unwrap_or_else(|| self.dataset.fragments().to_vec()); + let (match_plan, flat_match_plan) = match &index { Some(index) => { + // Get unindexed fragments and filter to target fragments + let mut unindexed_fragments = self.dataset.unindexed_fragments(&index.name).await?; + let target_bitmap = + RoaringBitmap::from_iter(target_fragments.iter().map(|f| f.id as u32)); + unindexed_fragments.retain(|f| target_bitmap.contains(f.id as u32)); + + // If all target fragments are unindexed, skip index entirely + if unindexed_fragments.len() == target_fragments.len() { + let flat_match_plan = self + .plan_flat_match_query(unindexed_fragments, query, params, filter_plan) + .await?; + return Ok(flat_match_plan); + } + + // Mixed case: use index + flat search for unindexed let match_plan: Arc = Arc::new(MatchQueryExec::new( self.dataset.clone(), query.clone(), @@ -2947,7 +2968,6 @@ impl Scanner { prefilter_source.clone(), )); - let unindexed_fragments = self.dataset.unindexed_fragments(&index.name).await?; if unindexed_fragments.is_empty() { (Some(match_plan), None) } else { @@ -2958,9 +2978,9 @@ impl Scanner { } } None => { - let unindexed_fragments = self.dataset.fragments().iter().cloned().collect(); + // No index: flat search all target fragments let flat_match_plan = self - .plan_flat_match_query(unindexed_fragments, query, params, filter_plan) + .plan_flat_match_query(target_fragments.to_vec(), query, params, filter_plan) .await?; (None, Some(flat_match_plan)) } @@ -3102,7 +3122,20 @@ impl Scanner { None }; - if let Some((index, _idx, index_metric)) = matching_index { + // Only return index and deltas if there is an index on the column and at least one of the target fragments are indexed + let index_and_deltas = if let Some((index, _idx, index_metric)) = matching_index { + let deltas = self.dataset.load_indices_by_name(&index.name).await?; + let index_frags = self.get_indexed_frags(&deltas); + if !index_frags.is_empty() { + Some((index, deltas, index_metric)) + } else { + None + } + } else { + None + }; + + if let Some((index, deltas, index_metric)) = index_and_deltas { log::trace!("index found for vector search"); // Use the index's metric type q.metric_type = Some(index_metric); @@ -3114,9 +3147,6 @@ impl Scanner { location!(), )); } - - // Find all deltas with the same index name. - let deltas = self.dataset.load_indices_by_name(&index.name).await?; let ann_node = match vector_type { DataType::FixedSizeList(_, _) => self.ann(&q, &deltas, filter_plan).await?, DataType::List(_) => self.multivec_ann(&q, &deltas, filter_plan).await?, @@ -3166,7 +3196,7 @@ impl Scanner { filter_plan, vector_scan_projection, /*include_deleted_rows=*/ true, - None, + self.fragments.clone().map(Arc::new), None, /*is_prefilter= */ true, ) @@ -3187,8 +3217,13 @@ impl Scanner { mut knn_node: Arc, filter_plan: &ExprFilterPlan, ) -> Result> { - // Check if we've created new versions since the index was built. - let unindexed_fragments = self.dataset.unindexed_fragments(&index.name).await?; + // Get unindexed fragments and filter to target fragments + let mut unindexed_fragments = self.dataset.unindexed_fragments(&index.name).await?; + if let Some(target_frags) = &self.fragments { + let target_bitmap = RoaringBitmap::from_iter(target_frags.iter().map(|f| f.id as u32)); + unindexed_fragments.retain(|f| target_bitmap.contains(f.id as u32)); + } + if !unindexed_fragments.is_empty() { // need to set the metric type to be the same as the index // to make sure the distance is comparable. @@ -9079,4 +9114,117 @@ mod test { runtime.handle().metrics().num_alive_tasks() ); } + + #[tokio::test] + async fn test_vector_search_respects_fragment_list() { + // Create dataset with 2 initial fragments (400 rows, max 200 per file) + let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) + .await + .unwrap(); + + // Create index on first 2 fragments + test_ds.make_vector_index().await.unwrap(); + + // Append one more fragment after indexing (this will be unindexed) + test_ds.append_new_data().await.unwrap(); + + // Now we have 3 fragments: + // Fragment 0: i=0..200 (indexed) + // Fragment 1: i=200..400 (indexed) + // Fragment 2: i=400..410 (unindexed) + + let fragments = test_ds.dataset.fragments(); + assert_eq!(fragments.len(), 3); + + // Test 1: Query only unindexed fragment (fragment 2) + let query: Float32Array = (0..32).map(|v| v as f32).collect(); + let fragment_2 = vec![fragments[2].clone()]; + + let mut scanner = test_ds.dataset.scan(); + scanner + .nearest("vec", &query, 5) + .unwrap() + .with_fragments(fragment_2); + + let batches: Vec<_> = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Should only get results from fragment 2 (i=400..410) + let mut has_results = false; + for batch in &batches { + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (400..410).contains(&val), + "Expected only values from fragment 2 (i=400..410), but got i={}", + val + ); + } + } + assert!(has_results, "Expected some results from fragment 2"); + } + + #[tokio::test] + async fn test_fts_respects_fragment_list() { + // Create dataset with 2 initial fragments (400 rows, max 200 per file) + let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) + .await + .unwrap(); + + // Create FTS index on first 2 fragments + test_ds.make_fts_index().await.unwrap(); + + // Append one more fragment after indexing (this will be unindexed) + test_ds.append_new_data().await.unwrap(); + + // Now we have 3 fragments: + // Fragment 0: i=0..200 (indexed) + // Fragment 1: i=200..400 (indexed) + // Fragment 2: i=400..410 (unindexed) + + let fragments = test_ds.dataset.fragments(); + assert_eq!(fragments.len(), 3); + + // Test 1: Query only unindexed fragment (fragment 2) + let fragment_2 = vec![fragments[2].clone()]; + + let mut scanner = test_ds.dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("s-405".into())) + .unwrap() + .with_fragments(fragment_2); + + let batches: Vec<_> = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Should only get results from fragment 2 (i=400..410) + let mut has_results = false; + for batch in &batches { + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (400..410).contains(&val), + "Expected only values from fragment 2 (i=400..410), but got i={}", + val + ); + } + } + assert!(has_results, "Expected some results from fragment 2"); + } } From f0c3e817114d414dff173b0b6c81f0411e8b8c85 Mon Sep 17 00:00:00 2001 From: stevie9868 Date: Thu, 12 Feb 2026 08:29:57 -0800 Subject: [PATCH 2/4] address comments --- rust/lance/src/dataset/scanner.rs | 266 ++++++++++++++++++++++++------ 1 file changed, 214 insertions(+), 52 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 4bd951770ec..9f26aa44ba2 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4277,16 +4277,21 @@ pub mod test_dataset { } pub async fn append_new_data(&mut self) -> Result<()> { - let vector_values: Float32Array = (0..10) + self.append_data_with_range(400, 410).await + } + + pub async fn append_data_with_range(&mut self, start: i32, end: i32) -> Result<()> { + let count = (end - start) as usize; + let vector_values: Float32Array = (0..count) .flat_map(|i| vec![i as f32; self.dimension as usize].into_iter()) .collect(); let new_vectors = FixedSizeListArray::try_new_from_values(vector_values, self.dimension as i32) .unwrap(); let new_data: Vec = vec![ - Arc::new(Int32Array::from_iter_values(400..410)), // 5 * 80 + Arc::new(Int32Array::from_iter_values(start..end)), Arc::new(StringArray::from_iter_values( - (400..410).map(|v| format!("s-{}", v)), + (start..end).map(|v| format!("s-{}", v)), )), Arc::new(new_vectors), ]; @@ -9125,51 +9130,128 @@ mod test { // Create index on first 2 fragments test_ds.make_vector_index().await.unwrap(); - // Append one more fragment after indexing (this will be unindexed) - test_ds.append_new_data().await.unwrap(); + // Append two more fragments after indexing (these will be unindexed) + test_ds.append_data_with_range(400, 410).await.unwrap(); + test_ds.append_data_with_range(410, 420).await.unwrap(); - // Now we have 3 fragments: + // Now we have 4 fragments: // Fragment 0: i=0..200 (indexed) // Fragment 1: i=200..400 (indexed) // Fragment 2: i=400..410 (unindexed) + // Fragment 3: i=410..420 (unindexed) let fragments = test_ds.dataset.fragments(); - assert_eq!(fragments.len(), 3); + assert_eq!(fragments.len(), 4); - // Test 1: Query only unindexed fragment (fragment 2) let query: Float32Array = (0..32).map(|v| v as f32).collect(); + + // Test 1: Query without fragment filter - should get results from all fragments + let mut scanner = test_ds.dataset.scan(); + scanner.nearest("vec", &query, 420).unwrap(); + + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + + // Verify we get results from all fragments by checking all value ranges are present + let has_fragment_0 = i_array + .iter() + .any(|v| v.map_or(false, |val| (0..200).contains(&val))); + let has_fragment_1 = i_array + .iter() + .any(|v| v.map_or(false, |val| (200..400).contains(&val))); + let has_fragment_2 = i_array + .iter() + .any(|v| v.map_or(false, |val| (400..410).contains(&val))); + let has_fragment_3 = i_array + .iter() + .any(|v| v.map_or(false, |val| (410..420).contains(&val))); + assert!( + has_fragment_0 && has_fragment_1 && has_fragment_2 && has_fragment_3, + "Expected results from all fragments" + ); + + // Test 2: Query only one unindexed fragment (fragment 2), excluding fragment 3 let fragment_2 = vec![fragments[2].clone()]; let mut scanner = test_ds.dataset.scan(); scanner - .nearest("vec", &query, 5) + .nearest("vec", &query, 10) .unwrap() .with_fragments(fragment_2); - let batches: Vec<_> = scanner - .try_into_stream() - .await - .unwrap() - .try_collect::>() - .await - .unwrap(); + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); // Should only get results from fragment 2 (i=400..410) let mut has_results = false; - for batch in &batches { - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (400..410).contains(&val), - "Expected only values from fragment 2 (i=400..410), but got i={}", - val - ); - } + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (400..410).contains(&val), + "Expected only values from fragment 2 (i=400..410), but got i={}", + val + ); } assert!(has_results, "Expected some results from fragment 2"); + + // Test 3: Query only indexed fragments (fragments 0 and 1) + let indexed_fragments = vec![fragments[0].clone(), fragments[1].clone()]; + + let mut scanner = test_ds.dataset.scan(); + scanner + .nearest("vec", &query, 400) + .unwrap() + .with_fragments(indexed_fragments); + + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + + // Should only get results from indexed fragments (i=0..400) + let mut has_results = false; + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (0..400).contains(&val), + "Expected only values from indexed fragments (i=0..400), but got i={}", + val + ); + } + assert!(has_results, "Expected some results from indexed fragments"); + + // Test 4: Query all indexed fragments (0, 1) plus one unindexed fragment (2), excluding fragment 3 + let mixed_fragments = vec![ + fragments[0].clone(), + fragments[1].clone(), + fragments[2].clone(), + ]; + + let mut scanner = test_ds.dataset.scan(); + scanner + .nearest("vec", &query, 410) + .unwrap() + .with_fragments(mixed_fragments); + + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + + // Should get results from fragments 0, 1, and 2 (i=0..410) only, excluding fragment 3 + let mut has_results = false; + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (0..410).contains(&val), + "Expected only values from fragments 0, 1, and 2 (i=0..410), but got i={}", + val + ); + } + assert!(has_results, "Expected some results from mixed fragments"); } #[tokio::test] @@ -9182,18 +9264,49 @@ mod test { // Create FTS index on first 2 fragments test_ds.make_fts_index().await.unwrap(); - // Append one more fragment after indexing (this will be unindexed) - test_ds.append_new_data().await.unwrap(); + // Append two more fragments after indexing (these will be unindexed) + test_ds.append_data_with_range(400, 410).await.unwrap(); + test_ds.append_data_with_range(410, 420).await.unwrap(); - // Now we have 3 fragments: + // Now we have 4 fragments: // Fragment 0: i=0..200 (indexed) // Fragment 1: i=200..400 (indexed) // Fragment 2: i=400..410 (unindexed) + // Fragment 3: i=410..420 (unindexed) let fragments = test_ds.dataset.fragments(); - assert_eq!(fragments.len(), 3); + assert_eq!(fragments.len(), 4); + + // Test 1: Query without fragment filter - should get results from all fragments + let mut scanner = test_ds.dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("s-5".into())) + .unwrap(); + + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + + // "s-5" matches: s-5, s-50-59, s-150-159 (frag 0), s-250-259, s-350-359 (frag 1), s-405 (frag 2), s-415 (frag 3) + // Verify we get results from all fragments + let has_fragment_0 = i_array + .iter() + .any(|v| v.map_or(false, |val| (0..200).contains(&val))); + let has_fragment_1 = i_array + .iter() + .any(|v| v.map_or(false, |val| (200..400).contains(&val))); + let has_fragment_2 = i_array + .iter() + .any(|v| v.map_or(false, |val| (400..410).contains(&val))); + let has_fragment_3 = i_array + .iter() + .any(|v| v.map_or(false, |val| (410..420).contains(&val))); + assert!( + has_fragment_0 && has_fragment_1 && has_fragment_2 && has_fragment_3, + "Expected results from all fragments" + ); - // Test 1: Query only unindexed fragment (fragment 2) + // Test 2: Query only one unindexed fragment (fragment 2), excluding fragment 3 let fragment_2 = vec![fragments[2].clone()]; let mut scanner = test_ds.dataset.scan(); @@ -9202,29 +9315,78 @@ mod test { .unwrap() .with_fragments(fragment_2); - let batches: Vec<_> = scanner - .try_into_stream() - .await - .unwrap() - .try_collect::>() - .await - .unwrap(); + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); // Should only get results from fragment 2 (i=400..410) let mut has_results = false; - for batch in &batches { - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (400..410).contains(&val), - "Expected only values from fragment 2 (i=400..410), but got i={}", - val - ); - } + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (400..410).contains(&val), + "Expected only values from fragment 2 (i=400..410), but got i={}", + val + ); } assert!(has_results, "Expected some results from fragment 2"); + + // Test 3: Query only indexed fragments (fragments 0 and 1) + let indexed_fragments = vec![fragments[0].clone(), fragments[1].clone()]; + + let mut scanner = test_ds.dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("s-5".into())) + .unwrap() + .with_fragments(indexed_fragments); + + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + + // Should only get results from indexed fragments (i=0..400) + let mut has_results = false; + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (0..400).contains(&val), + "Expected only values from indexed fragments (i=0..400), but got i={}", + val + ); + } + assert!(has_results, "Expected some results from indexed fragments"); + + // Test 4: Query all indexed fragments (0, 1) plus one unindexed fragment (2), excluding fragment 3 + let mixed_fragments = vec![ + fragments[0].clone(), + fragments[1].clone(), + fragments[2].clone(), + ]; + + let mut scanner = test_ds.dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("s-5".into())) + .unwrap() + .with_fragments(mixed_fragments); + + let batch = scanner.try_into_batch().await.unwrap(); + let i_col = batch.column_by_name("i").unwrap(); + let i_array = i_col.as_any().downcast_ref::().unwrap(); + + // Should get results from fragments 0, 1, and 2 (i=0..410) only, excluding fragment 3 + // "s-5" matches: s-5, s-50-59, s-150-159 (frag 0), s-250-259, s-350-359 (frag 1), s-405 (frag 2) + let mut has_results = false; + for idx in 0..i_array.len() { + has_results = true; + let val = i_array.value(idx); + assert!( + (0..410).contains(&val), + "Expected only values from fragments 0, 1, and 2 (i=0..410), but got i={}", + val + ); + } + assert!(has_results, "Expected some results from mixed fragments"); } } From 1f5e3b85574c6b963cf57efbd7a655ef140dbad1 Mon Sep 17 00:00:00 2001 From: stevie9868 Date: Thu, 12 Feb 2026 08:50:34 -0800 Subject: [PATCH 3/4] fix: address clippy warnings - use is_some_and instead of map_or --- rust/lance/src/dataset/scanner.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 9f26aa44ba2..a80e41c6239 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -9156,16 +9156,16 @@ mod test { // Verify we get results from all fragments by checking all value ranges are present let has_fragment_0 = i_array .iter() - .any(|v| v.map_or(false, |val| (0..200).contains(&val))); + .any(|v| v.is_some_and(|val| (0..200).contains(&val))); let has_fragment_1 = i_array .iter() - .any(|v| v.map_or(false, |val| (200..400).contains(&val))); + .any(|v| v.is_some_and(|val| (200..400).contains(&val))); let has_fragment_2 = i_array .iter() - .any(|v| v.map_or(false, |val| (400..410).contains(&val))); + .any(|v| v.is_some_and(|val| (400..410).contains(&val))); let has_fragment_3 = i_array .iter() - .any(|v| v.map_or(false, |val| (410..420).contains(&val))); + .any(|v| v.is_some_and(|val| (410..420).contains(&val))); assert!( has_fragment_0 && has_fragment_1 && has_fragment_2 && has_fragment_3, "Expected results from all fragments" @@ -9291,16 +9291,16 @@ mod test { // Verify we get results from all fragments let has_fragment_0 = i_array .iter() - .any(|v| v.map_or(false, |val| (0..200).contains(&val))); + .any(|v| v.is_some_and(|val| (0..200).contains(&val))); let has_fragment_1 = i_array .iter() - .any(|v| v.map_or(false, |val| (200..400).contains(&val))); + .any(|v| v.is_some_and(|val| (200..400).contains(&val))); let has_fragment_2 = i_array .iter() - .any(|v| v.map_or(false, |val| (400..410).contains(&val))); + .any(|v| v.is_some_and(|val| (400..410).contains(&val))); let has_fragment_3 = i_array .iter() - .any(|v| v.map_or(false, |val| (410..420).contains(&val))); + .any(|v| v.is_some_and(|val| (410..420).contains(&val))); assert!( has_fragment_0 && has_fragment_1 && has_fragment_2 && has_fragment_3, "Expected results from all fragments" From 346749fb517cbd24675b69aa5f1132b3c3e1e3c0 Mon Sep 17 00:00:00 2001 From: stevie9868 Date: Thu, 12 Feb 2026 18:01:48 -0800 Subject: [PATCH 4/4] address comments --- rust/lance/src/dataset/scanner.rs | 328 ++++++++++-------------------- 1 file changed, 111 insertions(+), 217 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index a80e41c6239..fe25e9ba6a1 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -9120,6 +9120,103 @@ mod test { ); } + fn assert_values_in_range(array: &Int32Array, range: std::ops::Range, msg: &str) { + assert!(!array.is_empty(), "Expected some results but got none"); + assert!( + array + .iter() + .all(|v| v.is_some_and(|val| range.contains(&val))), + "{msg} (expected range {range:?})" + ); + } + + // Helper to assert that results exist from all fragment ranges + fn assert_has_all_fragments(array: &Int32Array) { + assert!( + array + .iter() + .any(|v| v.is_some_and(|val| (0..200).contains(&val))) + && array + .iter() + .any(|v| v.is_some_and(|val| (200..400).contains(&val))) + && array + .iter() + .any(|v| v.is_some_and(|val| (400..410).contains(&val))) + && array + .iter() + .any(|v| v.is_some_and(|val| (410..420).contains(&val))), + "Expected results from all fragments" + ); + } + + // Common test function for fragment list filtering + async fn test_fragment_list_filtering( + test_ds: &TestVectorDataset, + fragments: &[Fragment], + mut build_scanner: impl FnMut(&Dataset) -> Scanner, + ) { + // Test 1: Query without fragment filter - should get results from all fragments + let batch = build_scanner(&test_ds.dataset) + .try_into_batch() + .await + .unwrap(); + let i_array = batch + .column_by_name("i") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_has_all_fragments(i_array); + + // Test 2: Query only one unindexed fragment (fragment 2), excluding fragment 3 + let mut scanner = build_scanner(&test_ds.dataset); + scanner.with_fragments(vec![fragments[2].clone()]); + let batch = scanner.try_into_batch().await.unwrap(); + let i_array = batch + .column_by_name("i") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_values_in_range(i_array, 400..410, "Should only get results from fragment 2"); + + // Test 3: Query only indexed fragments (fragments 0 and 1) + let mut scanner = build_scanner(&test_ds.dataset); + scanner.with_fragments(vec![fragments[0].clone(), fragments[1].clone()]); + let batch = scanner.try_into_batch().await.unwrap(); + let i_array = batch + .column_by_name("i") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_values_in_range( + i_array, + 0..400, + "Should only get results from indexed fragments", + ); + + // Test 4: Query all indexed fragments (0, 1) plus one unindexed fragment (2), excluding fragment 3 + let mut scanner = build_scanner(&test_ds.dataset); + scanner.with_fragments(vec![ + fragments[0].clone(), + fragments[1].clone(), + fragments[2].clone(), + ]); + let batch = scanner.try_into_batch().await.unwrap(); + let i_array = batch + .column_by_name("i") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_values_in_range( + i_array, + 0..410, + "Should get results from fragments 0, 1, and 2, excluding fragment 3", + ); + } + #[tokio::test] async fn test_vector_search_respects_fragment_list() { // Create dataset with 2 initial fragments (400 rows, max 200 per file) @@ -9145,113 +9242,12 @@ mod test { let query: Float32Array = (0..32).map(|v| v as f32).collect(); - // Test 1: Query without fragment filter - should get results from all fragments - let mut scanner = test_ds.dataset.scan(); - scanner.nearest("vec", &query, 420).unwrap(); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Verify we get results from all fragments by checking all value ranges are present - let has_fragment_0 = i_array - .iter() - .any(|v| v.is_some_and(|val| (0..200).contains(&val))); - let has_fragment_1 = i_array - .iter() - .any(|v| v.is_some_and(|val| (200..400).contains(&val))); - let has_fragment_2 = i_array - .iter() - .any(|v| v.is_some_and(|val| (400..410).contains(&val))); - let has_fragment_3 = i_array - .iter() - .any(|v| v.is_some_and(|val| (410..420).contains(&val))); - assert!( - has_fragment_0 && has_fragment_1 && has_fragment_2 && has_fragment_3, - "Expected results from all fragments" - ); - - // Test 2: Query only one unindexed fragment (fragment 2), excluding fragment 3 - let fragment_2 = vec![fragments[2].clone()]; - - let mut scanner = test_ds.dataset.scan(); - scanner - .nearest("vec", &query, 10) - .unwrap() - .with_fragments(fragment_2); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Should only get results from fragment 2 (i=400..410) - let mut has_results = false; - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (400..410).contains(&val), - "Expected only values from fragment 2 (i=400..410), but got i={}", - val - ); - } - assert!(has_results, "Expected some results from fragment 2"); - - // Test 3: Query only indexed fragments (fragments 0 and 1) - let indexed_fragments = vec![fragments[0].clone(), fragments[1].clone()]; - - let mut scanner = test_ds.dataset.scan(); - scanner - .nearest("vec", &query, 400) - .unwrap() - .with_fragments(indexed_fragments); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Should only get results from indexed fragments (i=0..400) - let mut has_results = false; - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (0..400).contains(&val), - "Expected only values from indexed fragments (i=0..400), but got i={}", - val - ); - } - assert!(has_results, "Expected some results from indexed fragments"); - - // Test 4: Query all indexed fragments (0, 1) plus one unindexed fragment (2), excluding fragment 3 - let mixed_fragments = vec![ - fragments[0].clone(), - fragments[1].clone(), - fragments[2].clone(), - ]; - - let mut scanner = test_ds.dataset.scan(); - scanner - .nearest("vec", &query, 410) - .unwrap() - .with_fragments(mixed_fragments); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Should get results from fragments 0, 1, and 2 (i=0..410) only, excluding fragment 3 - let mut has_results = false; - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (0..410).contains(&val), - "Expected only values from fragments 0, 1, and 2 (i=0..410), but got i={}", - val - ); - } - assert!(has_results, "Expected some results from mixed fragments"); + test_fragment_list_filtering(&test_ds, fragments, |dataset| { + let mut scanner = dataset.scan(); + scanner.nearest("vec", &query, 420).unwrap(); + scanner + }) + .await; } #[tokio::test] @@ -9277,116 +9273,14 @@ mod test { let fragments = test_ds.dataset.fragments(); assert_eq!(fragments.len(), 4); - // Test 1: Query without fragment filter - should get results from all fragments - let mut scanner = test_ds.dataset.scan(); - scanner - .full_text_search(FullTextSearchQuery::new("s-5".into())) - .unwrap(); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - // "s-5" matches: s-5, s-50-59, s-150-159 (frag 0), s-250-259, s-350-359 (frag 1), s-405 (frag 2), s-415 (frag 3) - // Verify we get results from all fragments - let has_fragment_0 = i_array - .iter() - .any(|v| v.is_some_and(|val| (0..200).contains(&val))); - let has_fragment_1 = i_array - .iter() - .any(|v| v.is_some_and(|val| (200..400).contains(&val))); - let has_fragment_2 = i_array - .iter() - .any(|v| v.is_some_and(|val| (400..410).contains(&val))); - let has_fragment_3 = i_array - .iter() - .any(|v| v.is_some_and(|val| (410..420).contains(&val))); - assert!( - has_fragment_0 && has_fragment_1 && has_fragment_2 && has_fragment_3, - "Expected results from all fragments" - ); - - // Test 2: Query only one unindexed fragment (fragment 2), excluding fragment 3 - let fragment_2 = vec![fragments[2].clone()]; - - let mut scanner = test_ds.dataset.scan(); - scanner - .full_text_search(FullTextSearchQuery::new("s-405".into())) - .unwrap() - .with_fragments(fragment_2); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Should only get results from fragment 2 (i=400..410) - let mut has_results = false; - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (400..410).contains(&val), - "Expected only values from fragment 2 (i=400..410), but got i={}", - val - ); - } - assert!(has_results, "Expected some results from fragment 2"); - - // Test 3: Query only indexed fragments (fragments 0 and 1) - let indexed_fragments = vec![fragments[0].clone(), fragments[1].clone()]; - - let mut scanner = test_ds.dataset.scan(); - scanner - .full_text_search(FullTextSearchQuery::new("s-5".into())) - .unwrap() - .with_fragments(indexed_fragments); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Should only get results from indexed fragments (i=0..400) - let mut has_results = false; - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (0..400).contains(&val), - "Expected only values from indexed fragments (i=0..400), but got i={}", - val - ); - } - assert!(has_results, "Expected some results from indexed fragments"); - - // Test 4: Query all indexed fragments (0, 1) plus one unindexed fragment (2), excluding fragment 3 - let mixed_fragments = vec![ - fragments[0].clone(), - fragments[1].clone(), - fragments[2].clone(), - ]; - - let mut scanner = test_ds.dataset.scan(); - scanner - .full_text_search(FullTextSearchQuery::new("s-5".into())) - .unwrap() - .with_fragments(mixed_fragments); - - let batch = scanner.try_into_batch().await.unwrap(); - let i_col = batch.column_by_name("i").unwrap(); - let i_array = i_col.as_any().downcast_ref::().unwrap(); - - // Should get results from fragments 0, 1, and 2 (i=0..410) only, excluding fragment 3 - // "s-5" matches: s-5, s-50-59, s-150-159 (frag 0), s-250-259, s-350-359 (frag 1), s-405 (frag 2) - let mut has_results = false; - for idx in 0..i_array.len() { - has_results = true; - let val = i_array.value(idx); - assert!( - (0..410).contains(&val), - "Expected only values from fragments 0, 1, and 2 (i=0..410), but got i={}", - val - ); - } - assert!(has_results, "Expected some results from mixed fragments"); + test_fragment_list_filtering(&test_ds, fragments, |dataset| { + let mut scanner = dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("s-5".into())) + .unwrap(); + scanner + }) + .await; } }