diff --git a/Cargo.lock b/Cargo.lock index 2e68786d957..9f26e238541 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2302,7 +2302,7 @@ dependencies = [ [[package]] name = "fsst" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "lance-datagen", @@ -3002,7 +3002,7 @@ dependencies = [ [[package]] name = "lance" -version = "0.20.1" +version = "0.21.0" dependencies = [ "all_asserts", "approx", @@ -3082,7 +3082,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3099,7 +3099,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3138,7 +3138,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3166,7 +3166,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3183,7 +3183,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrayref", "arrow", @@ -3229,7 +3229,7 @@ dependencies = [ [[package]] name = "lance-encoding-datafusion" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3261,7 +3261,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-arith", "arrow-array", @@ -3303,7 +3303,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "0.20.1" +version = "0.21.0" dependencies = [ "approx", "arrow", @@ -3362,7 +3362,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-arith", @@ -3407,7 +3407,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-schema", @@ -3428,7 +3428,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "0.20.1" +version = "0.21.0" dependencies = [ "approx", "arrow-arith", @@ -3457,7 +3457,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3501,7 +3501,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "0.20.1" +version = "0.21.0" dependencies = [ "proc-macro2", "quote", @@ -3510,7 +3510,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-schema", diff --git a/Cargo.toml b/Cargo.toml index 94405a5c925..84c183579c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ exclude = ["python"] resolver = "2" [workspace.package] -version = "0.20.1" +version = "0.21.0" edition = "2021" authors = ["Lance Devs "] license = "Apache-2.0" @@ -44,21 +44,21 @@ categories = [ rust-version = "1.78" [workspace.dependencies] -lance = { version = "=0.20.1", path = "./rust/lance" } -lance-arrow = { version = "=0.20.1", path = "./rust/lance-arrow" } -lance-core = { version = "=0.20.1", path = "./rust/lance-core" } -lance-datafusion = { version = "=0.20.1", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=0.20.1", path = "./rust/lance-datagen" } -lance-encoding = { version = "=0.20.1", path = "./rust/lance-encoding" } -lance-encoding-datafusion = { version = "=0.20.1", path = "./rust/lance-encoding-datafusion" } -lance-file = { version = "=0.20.1", path = "./rust/lance-file" } -lance-index = { version = "=0.20.1", path = "./rust/lance-index" } -lance-io = { version = "=0.20.1", path = "./rust/lance-io" } -lance-jni = { version = "=0.20.1", path = "./java/core/lance-jni" } -lance-linalg = { version = "=0.20.1", path = "./rust/lance-linalg" } -lance-table = { version = "=0.20.1", path = "./rust/lance-table" } -lance-test-macros = { version = "=0.20.1", path = "./rust/lance-test-macros" } -lance-testing = { version = "=0.20.1", path = "./rust/lance-testing" } +lance = { version = "=0.21.0", path = "./rust/lance" } +lance-arrow = { version = "=0.21.0", path = "./rust/lance-arrow" } +lance-core = { version = "=0.21.0", path = "./rust/lance-core" } +lance-datafusion = { version = "=0.21.0", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=0.21.0", path = "./rust/lance-datagen" } +lance-encoding = { version = "=0.21.0", path = "./rust/lance-encoding" } +lance-encoding-datafusion = { version = "=0.21.0", path = "./rust/lance-encoding-datafusion" } +lance-file = { version = "=0.21.0", path = "./rust/lance-file" } +lance-index = { version = "=0.21.0", path = "./rust/lance-index" } +lance-io = { version = "=0.21.0", path = "./rust/lance-io" } +lance-jni = { version = "=0.21.0", path = "./java/core/lance-jni" } +lance-linalg = { version = "=0.21.0", path = "./rust/lance-linalg" } +lance-table = { version = "=0.21.0", path = "./rust/lance-table" } +lance-test-macros = { version = "=0.21.0", path = "./rust/lance-test-macros" } +lance-testing = { version = "=0.21.0", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "53.2", optional = false, features = ["prettyprint"] } @@ -111,7 +111,7 @@ datafusion-physical-expr = { version = "42.0", features = [ ] } deepsize = "0.2.0" either = "1.0" -fsst = { version = "=0.20.1", path = "./rust/lance-encoding/src/compression_algo/fsst" } +fsst = { version = "=0.21.0", path = "./rust/lance-encoding/src/compression_algo/fsst" } futures = "0.3" http = "1.1.0" hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } diff --git a/java/core/pom.xml b/java/core/pom.xml index 1c434cc1bf5..9b32dbd361f 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -8,7 +8,7 @@ com.lancedb lance-parent - 0.20.1 + 0.21.0 ../pom.xml diff --git a/java/pom.xml b/java/pom.xml index bd06179d7d6..6bfbb83ddfa 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -6,7 +6,7 @@ com.lancedb lance-parent - 0.20.1 + 0.21.0 pom Lance Parent diff --git a/java/spark/pom.xml b/java/spark/pom.xml index 3b4692a5b15..4c6f183f5e4 100644 --- a/java/spark/pom.xml +++ b/java/spark/pom.xml @@ -8,7 +8,7 @@ com.lancedb lance-parent - 0.20.1 + 0.21.0 ../pom.xml @@ -82,7 +82,7 @@ com.lancedb lance-core - 0.20.1 + 0.21.0 org.apache.spark diff --git a/python/Cargo.lock b/python/Cargo.lock index 89862bcdc19..fcd28fd2fd3 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1964,7 +1964,7 @@ dependencies = [ [[package]] name = "fsst" -version = "0.20.1" +version = "0.21.0" dependencies = [ "rand", ] @@ -2730,7 +2730,7 @@ dependencies = [ [[package]] name = "lance" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-arith", @@ -2792,7 +2792,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -2809,7 +2809,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -2845,7 +2845,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -2871,7 +2871,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -2886,7 +2886,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrayref", "arrow", @@ -2924,7 +2924,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-arith", "arrow-array", @@ -2958,7 +2958,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3009,7 +3009,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-arith", @@ -3048,7 +3048,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow-array", "arrow-ord", @@ -3071,7 +3071,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", @@ -3991,7 +3991,7 @@ checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.10.5", + "itertools 0.12.1", "log", "multimap", "once_cell", @@ -4012,7 +4012,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.10.5", + "itertools 0.13.0", "log", "multimap", "once_cell", @@ -4045,7 +4045,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.12.1", "proc-macro2", "quote", "syn 2.0.90", @@ -4058,7 +4058,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.13.0", "proc-macro2", "quote", "syn 2.0.90", @@ -4093,7 +4093,7 @@ dependencies = [ [[package]] name = "pylance" -version = "0.20.1" +version = "0.21.0" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index a56a87cba14..e9e9f867c4d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "0.20.1" +version = "0.21.0" edition = "2021" authors = ["Lance Devs "] rust-version = "1.65" diff --git a/python/python/tests/test_migration.py b/python/python/tests/test_migration.py index 1dcfa0dfffc..97ae4398e22 100644 --- a/python/python/tests/test_migration.py +++ b/python/python/tests/test_migration.py @@ -62,3 +62,19 @@ def test_fix_data_storage_version(tmp_path: Path): OSError, match="The dataset contains a mixture of file versions" ): ds.delete("false") + + +def test_old_btree_bitmap_indices(tmp_path: Path): + """ + In versions below 0.21.0 we used the legacy file format for btree and bitmap + indices. In version 0.21.0 we switched to the new format. This test ensures + that we can still read the old indices. + """ + ds = prep_dataset(tmp_path, "v0.20.0", "old_btree_bitmap_indices.lance") + + assert ds.to_table(filter="bitmap > 2") == pa.table( + {"bitmap": [3, 4], "btree": [3, 4]} + ) + assert ds.to_table(filter="btree > 2") == pa.table( + {"bitmap": [3, 4], "btree": [3, 4]} + ) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 2dad325f967..3777c90d489 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -319,6 +319,36 @@ def test_bitmap_index(tmp_path: Path): assert indices[0]["type"] == "Bitmap" +def test_null_handling(tmp_path: Path): + tbl = pa.table( + { + "x": [1, 2, None, 3], + } + ) + dataset = lance.write_dataset(tbl, tmp_path / "dataset") + + def check(has_index: bool): + assert dataset.to_table(filter="x IS NULL").num_rows == 1 + assert dataset.to_table(filter="x IS NOT NULL").num_rows == 3 + assert dataset.to_table(filter="x > 0").num_rows == 3 + assert dataset.to_table(filter="x < 5").num_rows == 3 + assert dataset.to_table(filter="x IN (1, 2)").num_rows == 2 + # Note: there is a bit of discrepancy here. Datafusion does not consider + # NULL==NULL when doing an IN operation due to classic SQL shenanigans. + # We should decide at some point which behavior we want and make this + # consistent. + if has_index: + assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 3 + else: + assert dataset.to_table(filter="x IN (1, 2, NULL)").num_rows == 2 + + check(False) + dataset.create_scalar_index("x", index_type="BITMAP") + check(True) + dataset.create_scalar_index("x", index_type="BTREE") + check(True) + + def test_label_list_index(tmp_path: Path): tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"]) tag_list = pa.ListArray.from_arrays([0, 2, 4], tags) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index effabea0be2..8493098ecf8 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -165,7 +165,7 @@ pub trait IndexWriter: Send { #[async_trait] pub trait IndexReader: Send + Sync { /// Read the n-th record batch from the file - async fn read_record_batch(&self, n: u32) -> Result; + async fn read_record_batch(&self, n: u64, batch_size: u64) -> Result; /// Read the range of rows from the file. /// If projection is Some, only return the columns in the projection, /// nested columns like Some(&["x.y"]) are not supported. diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index fc531b3bf7a..ca03d250181 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -37,6 +37,8 @@ pub const BITMAP_LOOKUP_NAME: &str = "bitmap_page_lookup.lance"; #[derive(Clone, Debug)] pub struct BitmapIndex { index_map: BTreeMap, + // We put null in its own map to avoid it matching range queries (arrow-rs considers null to come before minval) + null_map: RowIdTreeMap, // Memoized index_map size for DeepSizeOf index_map_size_bytes: usize, store: Arc, @@ -45,11 +47,13 @@ pub struct BitmapIndex { impl BitmapIndex { fn new( index_map: BTreeMap, + null_map: RowIdTreeMap, index_map_size_bytes: usize, store: Arc, ) -> Self { Self { index_map, + null_map, index_map_size_bytes, store, } @@ -74,6 +78,7 @@ impl BitmapIndex { let mut index_map: BTreeMap = BTreeMap::new(); let mut index_map_size_bytes = 0; + let mut null_map = RowIdTreeMap::default(); for idx in 0..data.num_rows() { let key = OrderableScalarValue(ScalarValue::try_from_array(dict_keys, idx)?); let bitmap_bytes = bitmap_binary_array.value(idx); @@ -82,10 +87,14 @@ impl BitmapIndex { index_map_size_bytes += key.deep_size_of(); // This should be a reasonable approximation of the RowIdTreeMap size index_map_size_bytes += bitmap_bytes.len(); - index_map.insert(key, bitmap); + if key.0.is_null() { + null_map = bitmap; + } else { + index_map.insert(key, bitmap); + } } - Ok(Self::new(index_map, index_map_size_bytes, store)) + Ok(Self::new(index_map, null_map, index_map_size_bytes, store)) } } @@ -152,8 +161,12 @@ impl ScalarIndex for BitmapIndex { let row_ids = match query { SargableQuery::Equals(val) => { - let key = OrderableScalarValue(val.clone()); - self.index_map.get(&key).cloned().unwrap_or_default() + if val.is_null() { + self.null_map.clone() + } else { + let key = OrderableScalarValue(val.clone()); + self.index_map.get(&key).cloned().unwrap_or_default() + } } SargableQuery::Range(start, end) => { let range_start = match start { @@ -179,26 +192,19 @@ impl ScalarIndex for BitmapIndex { SargableQuery::IsIn(values) => { let mut union_bitmap = RowIdTreeMap::default(); for val in values { - let key = OrderableScalarValue(val.clone()); - if let Some(bitmap) = self.index_map.get(&key) { - union_bitmap |= bitmap.clone(); + if val.is_null() { + union_bitmap |= self.null_map.clone(); + } else { + let key = OrderableScalarValue(val.clone()); + if let Some(bitmap) = self.index_map.get(&key) { + union_bitmap |= bitmap.clone(); + } } } union_bitmap } - SargableQuery::IsNull() => { - if let Some(array) = self - .index_map - .iter() - .find(|(key, _)| key.0.is_null()) - .map(|(_, value)| value) - { - array.clone() - } else { - RowIdTreeMap::default() - } - } + SargableQuery::IsNull() => self.null_map.clone(), SargableQuery::FullTextSearch(_) => { return Err(Error::NotSupported { source: "full text search is not supported for bitmap indexes".into(), diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index ce23f85d851..abbe65490f8 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -50,6 +50,8 @@ use super::{ const BTREE_LOOKUP_NAME: &str = "page_lookup.lance"; const BTREE_PAGES_NAME: &str = "page_data.lance"; +pub const DEFAULT_BTREE_BATCH_SIZE: u64 = 4096; +const BATCH_SIZE_META_KEY: &str = "batch_size"; /// Wraps a ScalarValue and implements Ord (ScalarValue only implements PartialOrd) #[derive(Clone, Debug)] @@ -573,7 +575,11 @@ impl BTreeLookup { // All pages that could have a value equal to val fn pages_eq(&self, query: &OrderableScalarValue) -> Vec { - self.pages_between((Bound::Included(query), Bound::Excluded(query))) + if query.0.is_null() { + self.pages_null() + } else { + self.pages_between((Bound::Included(query), Bound::Excluded(query))) + } } // All pages that could have a value equal to one of the values @@ -673,6 +679,7 @@ pub struct BTreeIndex { page_lookup: Arc, store: Arc, sub_index: Arc, + batch_size: u64, } impl BTreeIndex { @@ -681,12 +688,14 @@ impl BTreeIndex { null_pages: Vec, store: Arc, sub_index: Arc, + batch_size: u64, ) -> Self { let page_lookup = Arc::new(BTreeLookup::new(tree, null_pages)); Self { page_lookup, store, sub_index, + batch_size, } } @@ -696,7 +705,9 @@ impl BTreeIndex { page_number: u32, index_reader: Arc, ) -> Result { - let serialized_page = index_reader.read_record_batch(page_number).await?; + let serialized_page = index_reader + .read_record_batch(page_number as u64, self.batch_size) + .await?; let subindex = self.sub_index.load_subindex(serialized_page).await?; // TODO: If this is an IN query we can perhaps simplify the subindex query by restricting it to the // values that might be in the page. E.g. if we are searching for X IN [5, 3, 7] and five is in pages @@ -705,7 +716,11 @@ impl BTreeIndex { subindex.search(query).await } - fn try_from_serialized(data: RecordBatch, store: Arc) -> Result { + fn try_from_serialized( + data: RecordBatch, + store: Arc, + batch_size: u64, + ) -> Result { let mut map = BTreeMap::>::new(); let mut null_pages = Vec::::new(); @@ -735,9 +750,13 @@ impl BTreeIndex { let null_count = null_counts.values()[idx]; let page_number = page_numbers.values()[idx]; - map.entry(min) - .or_default() - .push(PageRecord { max, page_number }); + // If the page is entirely null don't even bother putting it in the tree + if !max.0.is_null() { + map.entry(min) + .or_default() + .push(PageRecord { max, page_number }); + } + if null_count > 0 { null_pages.push(page_number); } @@ -751,7 +770,7 @@ impl BTreeIndex { // TODO: Support other page types? let sub_index = Arc::new(FlatIndexMetadata::new(data_type.clone())); - Ok(Self::new(map, null_pages, store, sub_index)) + Ok(Self::new(map, null_pages, store, sub_index, batch_size)) } /// Create a stream of all the data in the index, in the same format used to train the index @@ -844,7 +863,9 @@ impl Index for BTreeIndex { let sub_index_reader = self.store.open_index_file(BTREE_PAGES_NAME).await?; for page_number in self.page_lookup.all_page_ids() { - let serialized = sub_index_reader.read_record_batch(page_number).await?; + let serialized = sub_index_reader + .read_record_batch(page_number as u64, self.batch_size) + .await?; let page = self.sub_index.load_subindex(serialized).await?; frag_ids |= page.calculate_included_frags().await?; } @@ -891,10 +912,20 @@ impl ScalarIndex for BTreeIndex { async fn load(store: Arc) -> Result> { let page_lookup_file = store.open_index_file(BTREE_LOOKUP_NAME).await?; - let serialized_lookup = page_lookup_file.read_record_batch(0).await?; + let num_rows_in_lookup = page_lookup_file.num_rows(); + let serialized_lookup = page_lookup_file + .read_range(0..num_rows_in_lookup, None) + .await?; + let file_schema = page_lookup_file.schema(); + let batch_size = file_schema + .metadata + .get(BATCH_SIZE_META_KEY) + .map(|bs| bs.parse().unwrap_or(DEFAULT_BTREE_BATCH_SIZE)) + .unwrap_or(DEFAULT_BTREE_BATCH_SIZE); Ok(Arc::new(Self::try_from_serialized( serialized_lookup, store, + batch_size, )?)) } @@ -911,7 +942,9 @@ impl ScalarIndex for BTreeIndex { let sub_index_reader = self.store.open_index_file(BTREE_PAGES_NAME).await?; for page_number in self.page_lookup.all_page_ids() { - let old_serialized = sub_index_reader.read_record_batch(page_number).await?; + let old_serialized = sub_index_reader + .read_record_batch(page_number as u64, self.batch_size) + .await?; let remapped = self .sub_index .remap_subindex(old_serialized, mapping) @@ -934,7 +967,13 @@ impl ScalarIndex for BTreeIndex { ) -> Result<()> { // Merge the existing index data with the new data and then retrain the index on the merged stream let merged_data_source = Box::new(BTreeUpdater::new(self.clone(), new_data)); - train_btree_index(merged_data_source, self.sub_index.as_ref(), dest_store).await + train_btree_index( + merged_data_source, + self.sub_index.as_ref(), + dest_store, + DEFAULT_BTREE_BATCH_SIZE as u32, + ) + .await } } @@ -1092,13 +1131,14 @@ pub async fn train_btree_index( data_source: Box, sub_index_trainer: &dyn BTreeSubIndex, index_store: &dyn IndexStore, + batch_size: u32, ) -> Result<()> { let mut sub_index_file = index_store .new_index_file(BTREE_PAGES_NAME, sub_index_trainer.schema().clone()) .await?; let mut encoded_batches = Vec::new(); let mut batch_idx = 0; - let mut batches_source = data_source.scan_ordered_chunks(4096).await?; + let mut batches_source = data_source.scan_ordered_chunks(batch_size).await?; while let Some(batch) = batches_source.try_next().await? { debug_assert_eq!(batch.num_columns(), 2); debug_assert_eq!(*batch.column(1).data_type(), DataType::UInt64); @@ -1109,8 +1149,12 @@ pub async fn train_btree_index( } sub_index_file.finish().await?; let record_batch = btree_stats_as_batch(encoded_batches)?; + let mut file_schema = record_batch.schema().as_ref().clone(); + file_schema + .metadata + .insert(BATCH_SIZE_META_KEY.to_string(), batch_size.to_string()); let mut btree_index_file = index_store - .new_index_file(BTREE_LOOKUP_NAME, record_batch.schema()) + .new_index_file(BTREE_LOOKUP_NAME, Arc::new(file_schema)) .await?; btree_index_file.write_record_batch(record_batch).await?; btree_index_file.finish().await?; @@ -1204,7 +1248,12 @@ impl Stream for IndexReaderStream { let page_number = this.pages[idx]; this.idx += 1; let reader_copy = this.reader.clone(); - let read_task = async move { reader_copy.read_record_batch(page_number).await }.boxed(); + let read_task = async move { + reader_copy + .read_record_batch(page_number as u64, DEFAULT_BTREE_BATCH_SIZE) + .await + } + .boxed(); std::task::Poll::Ready(Some(read_task)) } } diff --git a/rust/lance-index/src/scalar/flat.rs b/rust/lance-index/src/scalar/flat.rs index 66a69e95e53..709f4b38051 100644 --- a/rust/lance-index/src/scalar/flat.rs +++ b/rust/lance-index/src/scalar/flat.rs @@ -33,6 +33,7 @@ use super::{AnyQuery, SargableQuery}; #[derive(Debug)] pub struct FlatIndex { data: Arc, + has_nulls: bool, } impl DeepSizeOf for FlatIndex { @@ -132,8 +133,10 @@ impl BTreeSubIndex for FlatIndexMetadata { } async fn load_subindex(&self, serialized: RecordBatch) -> Result> { + let has_nulls = serialized.column(0).null_count() > 0; Ok(Arc::new(FlatIndex { data: Arc::new(serialized), + has_nulls, })) } @@ -196,13 +199,23 @@ impl ScalarIndex for FlatIndex { let query = query.as_any().downcast_ref::().unwrap(); // Since we have all the values in memory we can use basic arrow-rs compute // functions to satisfy scalar queries. - let predicate = match query { - SargableQuery::Equals(value) => arrow_ord::cmp::eq(self.values(), &value.to_scalar()?)?, + let mut predicate = match query { + SargableQuery::Equals(value) => { + if value.is_null() { + arrow::compute::is_null(self.values())? + } else { + arrow_ord::cmp::eq(self.values(), &value.to_scalar()?)? + } + } SargableQuery::IsNull() => arrow::compute::is_null(self.values())?, SargableQuery::IsIn(values) => { + let mut has_null = false; let choices = values .iter() - .map(|val| lit(val.clone())) + .map(|val| { + has_null |= val.is_null(); + lit(val.clone()) + }) .collect::>(); let in_list_expr = in_list( Arc::new(Column::new("values", 0)), @@ -211,12 +224,20 @@ impl ScalarIndex for FlatIndex { &self.data.schema(), )?; let result_col = in_list_expr.evaluate(&self.data)?; - result_col + let predicate = result_col .into_array(self.data.num_rows())? .as_any() .downcast_ref::() .expect("InList evaluation should return boolean array") - .clone() + .clone(); + + // Arrow's in_list does not handle nulls so we need to join them in here if user asked for them + if has_null && self.has_nulls { + let nulls = arrow::compute::is_null(self.values())?; + arrow::compute::or(&predicate, &nulls)? + } else { + predicate + } } SargableQuery::Range(lower_bound, upper_bound) => match (lower_bound, upper_bound) { (Bound::Unbounded, Bound::Unbounded) => { @@ -256,6 +277,12 @@ impl ScalarIndex for FlatIndex { location!(), )), }; + if self.has_nulls && matches!(query, SargableQuery::Range(_, _)) { + // Arrow's comparison kernels do not return false for nulls. They consider nulls to + // be less than any value. So we need to filter out the nulls manually. + let valid_values = arrow::compute::is_not_null(self.values())?; + predicate = arrow::compute::and(&valid_values, &predicate)?; + } let matching_ids = arrow_select::filter::filter(self.ids(), &predicate)?; let matching_ids = matching_ids .as_any() @@ -269,9 +296,12 @@ impl ScalarIndex for FlatIndex { // data as a single batch named data.lance async fn load(store: Arc) -> Result> { let batches = store.open_index_file("data.lance").await?; - let batch = batches.read_record_batch(0).await?; + let num_rows = batches.num_rows(); + let batch = batches.read_range(0..num_rows, None).await?; + let has_nulls = batch.column(0).null_count() > 0; Ok(Arc::new(Self { data: Arc::new(batch), + has_nulls, })) } @@ -319,6 +349,7 @@ mod tests { FlatIndex { data: Arc::new(batch), + has_nulls: false, } } diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 75639db33e9..865cc5a245f 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -16,7 +16,6 @@ use lance_core::{cache::FileMetadataCache, Error, Result}; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::v2; use lance_file::v2::reader::FileReaderOptions; -use lance_file::writer::FileWriterOptions; use lance_file::{ reader::FileReader, writer::{FileWriter, ManifestProvider}, @@ -24,7 +23,6 @@ use lance_file::{ use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use lance_io::{object_store::ObjectStore, ReadBatchParams}; use lance_table::format::SelfDescribingFileReader; -use lance_table::io::manifest::ManifestDescribing; use object_store::path::Path; use super::{IndexReader, IndexStore, IndexWriter}; @@ -40,7 +38,6 @@ pub struct LanceIndexStore { index_dir: Path, metadata_cache: FileMetadataCache, scheduler: Arc, - use_legacy_format: bool, } impl DeepSizeOf for LanceIndexStore { @@ -68,14 +65,8 @@ impl LanceIndexStore { index_dir, metadata_cache, scheduler, - use_legacy_format: false, } } - - pub fn with_legacy_format(mut self, use_legacy_format: bool) -> Self { - self.use_legacy_format = use_legacy_format; - self - } } #[async_trait] @@ -119,7 +110,7 @@ impl IndexWriter for v2::writer::FileWriter { #[async_trait] impl IndexReader for FileReader { - async fn read_record_batch(&self, offset: u32) -> Result { + async fn read_record_batch(&self, offset: u64, _batch_size: u64) -> Result { self.read_batch(offset as i32, ReadBatchParams::RangeFull, self.schema()) .await } @@ -151,8 +142,11 @@ impl IndexReader for FileReader { #[async_trait] impl IndexReader for v2::reader::FileReader { - async fn read_record_batch(&self, _offset: u32) -> Result { - unimplemented!("v2 format has no concept of row groups") + async fn read_record_batch(&self, offset: u64, batch_size: u64) -> Result { + let start = offset * batch_size; + let end = start + batch_size; + let end = end.min(self.num_rows()); + self.read_range(start as usize..end as usize, None).await } async fn read_range( @@ -219,24 +213,13 @@ impl IndexStore for LanceIndexStore { ) -> Result> { let path = self.index_dir.child(name); let schema = schema.as_ref().try_into()?; - if self.use_legacy_format { - let writer = FileWriter::::try_new( - &self.object_store, - &path, - schema, - &FileWriterOptions::default(), - ) - .await?; - Ok(Box::new(writer)) - } else { - let writer = self.object_store.create(&path).await?; - let writer = v2::writer::FileWriter::try_new( - writer, - schema, - v2::writer::FileWriterOptions::default(), - )?; - Ok(Box::new(writer)) - } + let writer = self.object_store.create(&path).await?; + let writer = v2::writer::FileWriter::try_new( + writer, + schema, + v2::writer::FileWriterOptions::default(), + )?; + Ok(Box::new(writer)) } async fn open_index_file(&self, name: &str) -> Result> { @@ -305,7 +288,7 @@ mod tests { use crate::scalar::{ bitmap::{train_bitmap_index, BitmapIndex}, - btree::{train_btree_index, BTreeIndex, TrainingSource}, + btree::{train_btree_index, BTreeIndex, TrainingSource, DEFAULT_BTREE_BATCH_SIZE}, flat::FlatIndexMetadata, label_list::{train_label_list_index, LabelListIndex}, LabelListQuery, SargableQuery, ScalarIndex, @@ -335,14 +318,6 @@ mod tests { Arc::new(LanceIndexStore::new(object_store, test_path, cache)) } - fn legacy_test_store(tempdir: &TempDir) -> Arc { - let test_path: &Path = tempdir.path(); - let cache = FileMetadataCache::with_capacity(128 * 1024 * 1024, CapacityMode::Bytes); - let (object_store, test_path) = - ObjectStore::from_path(test_path.as_os_str().to_str().unwrap()).unwrap(); - Arc::new(LanceIndexStore::new(object_store, test_path, cache).with_legacy_format(true)) - } - struct MockTrainingSource { data: SendableRecordBatchStream, } @@ -376,24 +351,31 @@ mod tests { index_store: &Arc, data: impl RecordBatchReader + Send + Sync + 'static, value_type: DataType, + custom_batch_size: Option, ) { let sub_index_trainer = FlatIndexMetadata::new(value_type); let data = Box::new(MockTrainingSource::new(data).await); - train_btree_index(data, &sub_index_trainer, index_store.as_ref()) - .await - .unwrap(); + let batch_size = custom_batch_size.unwrap_or(DEFAULT_BTREE_BATCH_SIZE); + train_btree_index( + data, + &sub_index_trainer, + index_store.as_ref(), + batch_size as u32, + ) + .await + .unwrap(); } #[tokio::test] async fn test_basic_btree() { let tempdir = tempdir().unwrap(); - let index_store = legacy_test_store(&tempdir); + let index_store = test_store(&tempdir); let data = gen() .col("values", array::step::()) .col("row_ids", array::step::()) .into_reader_rows(RowCount::from(4096), BatchCount::from(100)); - train_index(&index_store, data, DataType::Int32).await; + train_index(&index_store, data, DataType::Int32, None).await; let index = BTreeIndex::load(index_store).await.unwrap(); let row_ids = index @@ -428,12 +410,12 @@ mod tests { #[tokio::test] async fn test_btree_update() { let index_dir = tempdir().unwrap(); - let index_store = legacy_test_store(&index_dir); + let index_store = test_store(&index_dir); let data = gen() .col("values", array::step::()) .col("row_ids", array::step::()) .into_reader_rows(RowCount::from(4096), BatchCount::from(100)); - train_index(&index_store, data, DataType::Int32).await; + train_index(&index_store, data, DataType::Int32, None).await; let index = BTreeIndex::load(index_store).await.unwrap(); let data = gen() @@ -442,7 +424,7 @@ mod tests { .into_reader_rows(RowCount::from(4096), BatchCount::from(100)); let updated_index_dir = tempdir().unwrap(); - let updated_index_store = legacy_test_store(&updated_index_dir); + let updated_index_store = test_store(&updated_index_dir); index .update( lance_datafusion::utils::reader_to_stream(Box::new(data)), @@ -478,7 +460,7 @@ mod tests { #[tokio::test] async fn test_btree_with_gaps() { let tempdir = tempdir().unwrap(); - let index_store = legacy_test_store(&tempdir); + let index_store = test_store(&tempdir); let batch_one = gen() .col("values", array::cycle::(vec![0, 1, 4, 5])) .col("row_ids", array::cycle::(vec![0, 1, 2, 3])) @@ -507,7 +489,7 @@ mod tests { Field::new("row_ids", DataType::UInt64, false), ])); let data = RecordBatchIterator::new(batches, schema); - train_index(&index_store, data, DataType::Int32).await; + train_index(&index_store, data, DataType::Int32, Some(4)).await; let index = BTreeIndex::load(index_store).await.unwrap(); // The above should create four pages @@ -703,7 +685,7 @@ mod tests { // DataType::Duration(TimeUnit::Nanosecond), ] { let tempdir = tempdir().unwrap(); - let index_store = legacy_test_store(&tempdir); + let index_store = test_store(&tempdir); let data: RecordBatch = gen() .col("values", array::rand_type(data_type)) .col("row_ids", array::step::()) @@ -742,7 +724,7 @@ mod tests { data.schema().clone(), ); - train_index(&index_store, training_data, data_type.clone()).await; + train_index(&index_store, training_data, data_type.clone(), None).await; let index = BTreeIndex::load(index_store).await.unwrap(); let row_ids = index @@ -761,7 +743,7 @@ mod tests { #[tokio::test] async fn btree_reject_nan() { let tempdir = tempdir().unwrap(); - let index_store = legacy_test_store(&tempdir); + let index_store = test_store(&tempdir); let batch = gen() .col("values", array::cycle::(vec![0.0, f32::NAN])) .col("row_ids", array::cycle::(vec![0, 1])) @@ -777,17 +759,20 @@ mod tests { let data = Box::new(MockTrainingSource::new(data).await); // Until DF handles NaN reliably we need to make sure we reject input // containing NaN - assert!( - train_btree_index(data, &sub_index_trainer, index_store.as_ref()) - .await - .is_err() - ); + assert!(train_btree_index( + data, + &sub_index_trainer, + index_store.as_ref(), + DEFAULT_BTREE_BATCH_SIZE as u32 + ) + .await + .is_err()); } #[tokio::test] async fn btree_entire_null_page() { let tempdir = tempdir().unwrap(); - let index_store = legacy_test_store(&tempdir); + let index_store = test_store(&tempdir); let batch = gen() .col( "values", @@ -805,9 +790,14 @@ mod tests { let sub_index_trainer = FlatIndexMetadata::new(DataType::Utf8); let data = Box::new(MockTrainingSource::new(data).await); - train_btree_index(data, &sub_index_trainer, index_store.as_ref()) - .await - .unwrap(); + train_btree_index( + data, + &sub_index_trainer, + index_store.as_ref(), + DEFAULT_BTREE_BATCH_SIZE as u32, + ) + .await + .unwrap(); let index = BTreeIndex::load(index_store).await.unwrap(); diff --git a/rust/lance/benches/scalar_index.rs b/rust/lance/benches/scalar_index.rs index 58c261ccf50..f14dea1983e 100644 --- a/rust/lance/benches/scalar_index.rs +++ b/rust/lance/benches/scalar_index.rs @@ -16,7 +16,7 @@ use lance_core::{cache::FileMetadataCache, Result}; use lance_datafusion::utils::reader_to_stream; use lance_datagen::{array, gen, BatchCount, RowCount}; use lance_index::scalar::{ - btree::{train_btree_index, BTreeIndex, TrainingSource}, + btree::{train_btree_index, BTreeIndex, TrainingSource, DEFAULT_BTREE_BATCH_SIZE}, flat::FlatIndexMetadata, lance_format::LanceIndexStore, IndexStore, SargableQuery, ScalarIndex, @@ -60,7 +60,6 @@ impl TrainingSource for BenchmarkDataSource { } impl BenchmarkFixture { - #[allow(dead_code)] fn test_store(tempdir: &TempDir) -> Arc { let test_path = tempdir.path(); let (object_store, test_path) = @@ -72,16 +71,6 @@ impl BenchmarkFixture { )) } - fn legacy_test_store(tempdir: &TempDir) -> Arc { - let test_path = tempdir.path(); - let (object_store, test_path) = - ObjectStore::from_path(test_path.as_os_str().to_str().unwrap()).unwrap(); - Arc::new( - LanceIndexStore::new(object_store, test_path, FileMetadataCache::no_cache()) - .with_legacy_format(true), - ) - } - async fn write_baseline_data(tempdir: &TempDir) -> Arc { let test_path = tempdir.path().as_os_str().to_str().unwrap(); Arc::new( @@ -98,6 +87,7 @@ impl BenchmarkFixture { Box::new(BenchmarkDataSource {}), &sub_index_trainer, index_store.as_ref(), + DEFAULT_BTREE_BATCH_SIZE as u32, ) .await .unwrap(); @@ -105,7 +95,7 @@ impl BenchmarkFixture { async fn open() -> Self { let tempdir = tempfile::tempdir().unwrap(); - let index_store = Self::legacy_test_store(&tempdir); + let index_store = Self::test_store(&tempdir); let baseline_dataset = Self::write_baseline_data(&tempdir).await; Self::train_scalar_index(&index_store).await; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index ace9906d5cc..c65a30df332 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -111,13 +111,7 @@ pub(crate) async fn remap_index( match generic.index_type() { it if it.is_scalar() => { - let new_store = match it { - IndexType::Scalar | IndexType::BTree => { - LanceIndexStore::from_dataset(dataset, &new_id.to_string()) - .with_legacy_format(true) - } - _ => LanceIndexStore::from_dataset(dataset, &new_id.to_string()), - }; + let new_store = LanceIndexStore::from_dataset(dataset, &new_id.to_string()); let scalar_index = dataset .open_scalar_index(&field.name, &index_id.to_string()) diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 3c6a377dd5a..9381b8b88f1 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -98,15 +98,7 @@ pub async fn merge_indices<'a>( let new_uuid = Uuid::new_v4(); - // The BTree index implementation leverages the legacy format's batch offset, - // which has been removed from new format, so keep using the legacy format for now. - let new_store = match index.index_type() { - IndexType::Scalar | IndexType::BTree => { - LanceIndexStore::from_dataset(&dataset, &new_uuid.to_string()) - .with_legacy_format(true) - } - _ => LanceIndexStore::from_dataset(&dataset, &new_uuid.to_string()), - }; + let new_store = LanceIndexStore::from_dataset(&dataset, &new_uuid.to_string()); index.update(new_data_stream.into(), &new_store).await?; Ok((new_uuid, 1)) diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index c0394bdb65e..32bf1cb7a41 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -11,6 +11,7 @@ use async_trait::async_trait; use datafusion::physical_plan::SendableRecordBatchStream; use lance_core::{Error, Result}; use lance_datafusion::{chunker::chunk_concat_stream, exec::LanceExecutionOptions}; +use lance_index::scalar::btree::DEFAULT_BTREE_BATCH_SIZE; use lance_index::scalar::InvertedIndexParams; use lance_index::scalar::{ bitmap::{train_bitmap_index, BitmapIndex, BITMAP_LOOKUP_NAME}, @@ -224,11 +225,14 @@ pub(super) async fn build_scalar_index( Ok(inverted_index_details()) } _ => { - // The BTree index implementation leverages the legacy format's batch offset, - // which has been removed from new format, so keep using the legacy format for now. - let index_store = index_store.with_legacy_format(true); let flat_index_trainer = FlatIndexMetadata::new(field.data_type()); - train_btree_index(training_request, &flat_index_trainer, &index_store).await?; + train_btree_index( + training_request, + &flat_index_trainer, + &index_store, + DEFAULT_BTREE_BATCH_SIZE as u32, + ) + .await?; Ok(btree_index_details()) } } diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/bed6140c-b15a-454e-83a4-d66520397899/bitmap_page_lookup.lance b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/bed6140c-b15a-454e-83a4-d66520397899/bitmap_page_lookup.lance new file mode 100644 index 00000000000..5b3983fead5 Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/bed6140c-b15a-454e-83a4-d66520397899/bitmap_page_lookup.lance differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/e034c4d8-77cd-422c-8855-209eed8deff8/page_data.lance b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/e034c4d8-77cd-422c-8855-209eed8deff8/page_data.lance new file mode 100644 index 00000000000..d97d872a3fe Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/e034c4d8-77cd-422c-8855-209eed8deff8/page_data.lance differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/e034c4d8-77cd-422c-8855-209eed8deff8/page_lookup.lance b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/e034c4d8-77cd-422c-8855-209eed8deff8/page_lookup.lance new file mode 100644 index 00000000000..deeb36ca9a9 Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_indices/e034c4d8-77cd-422c-8855-209eed8deff8/page_lookup.lance differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/0-ca14443d-4119-474d-a32d-ae6c59288e9a.txn b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/0-ca14443d-4119-474d-a32d-ae6c59288e9a.txn new file mode 100644 index 00000000000..d880fea9082 Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/0-ca14443d-4119-474d-a32d-ae6c59288e9a.txn differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/1-6c1bfc70-d75f-4b58-84ec-aee73e2389d6.txn b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/1-6c1bfc70-d75f-4b58-84ec-aee73e2389d6.txn new file mode 100644 index 00000000000..8575b67ce2b Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/1-6c1bfc70-d75f-4b58-84ec-aee73e2389d6.txn differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/2-70cf21e4-8f6d-4d41-b303-3dc1ee959c0b.txn b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/2-70cf21e4-8f6d-4d41-b303-3dc1ee959c0b.txn new file mode 100644 index 00000000000..97aed3d6daf Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_transactions/2-70cf21e4-8f6d-4d41-b303-3dc1ee959c0b.txn differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/1.manifest b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/1.manifest new file mode 100644 index 00000000000..4b8b0703d6a Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/1.manifest differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/2.manifest b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/2.manifest new file mode 100644 index 00000000000..f92dab11396 Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/2.manifest differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/3.manifest b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/3.manifest new file mode 100644 index 00000000000..5f747931c41 Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/_versions/3.manifest differ diff --git a/test_data/v0.20.0/old_btree_bitmap_indices.lance/data/1f29c4b8-24ba-4f50-8d07-3b0b5c1b4f3f.lance b/test_data/v0.20.0/old_btree_bitmap_indices.lance/data/1f29c4b8-24ba-4f50-8d07-3b0b5c1b4f3f.lance new file mode 100644 index 00000000000..e6e9d742cfd Binary files /dev/null and b/test_data/v0.20.0/old_btree_bitmap_indices.lance/data/1f29c4b8-24ba-4f50-8d07-3b0b5c1b4f3f.lance differ