diff --git a/Cargo.lock b/Cargo.lock index ed11ebbe949..c4bebaa67a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7578,9 +7578,9 @@ checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" [[package]] name = "roaring" -version = "0.10.12" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 101b19eb093..1c34baec418 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -169,7 +169,7 @@ rand_distr = { version = "0.5.1" } rand_xoshiro = "0.7.0" rangemap = { version = "1.0" } rayon = "1.10" -roaring = "0.10.1" +roaring = "0.11" rstest = "0.23.0" rustc_version = "0.4" serde = { version = "^1" } diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index c62c7dd234c..5d2c626bb8b 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -5292,9 +5292,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.12" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" dependencies = [ "bytemuck", "byteorder", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 078bb73bbb3..3f1b1bdf7dc 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -45,7 +45,7 @@ log = "0.4" env_logger = "0.11.7" uuid = { version = "1.17.0", features = ["v4"] } prost = "0.14.1" -roaring = "0.10.1" +roaring = "0.11" prost-types = "0.14.1" chrono = "0.4.41" diff --git a/python/Cargo.lock b/python/Cargo.lock index e61b043a9bb..f2c5eb504bf 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -6269,9 +6269,9 @@ checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" [[package]] name = "roaring" -version = "0.10.12" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" dependencies = [ "bytemuck", "byteorder", diff --git a/python/Cargo.toml b/python/Cargo.toml index 81bc8590d13..235ee69900b 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -62,7 +62,7 @@ pythonize = "0.26" tokio = { version = "1.48", features = ["rt-multi-thread"] } uuid = "1.3.0" regex = "1" -roaring = "0.10.1" +roaring = "0.11" serde_json = "1" serde = "1.0.197" serde_yaml = "0.9.34" diff --git a/rust/lance-core/src/utils/mask.rs b/rust/lance-core/src/utils/mask.rs index a36f579f08b..a04184b07c4 100644 --- a/rust/lance-core/src/utils/mask.rs +++ b/rust/lance-core/src/utils/mask.rs @@ -934,32 +934,42 @@ impl Extend for RowAddrTreeMap { } } -/// Convert a RoaringBitmap to a vector of contiguous ranges. -/// -/// This is more efficient than iterating over individual bits and coalescing, -/// as it builds ranges directly in a single pass. pub fn bitmap_to_ranges(bitmap: &RoaringBitmap) -> Vec> { - if bitmap.is_empty() { - return vec![]; - } - let mut ranges = Vec::new(); let mut iter = bitmap.iter(); - let first = iter.next().unwrap(); - let mut start = first; - let mut end = first; + while let Some(r) = iter.next_range() { + ranges.push(*r.start() as u64..(*r.end() as u64 + 1)); + } + ranges +} - for val in iter { - if val == end + 1 { - end = val; - } else { - ranges.push(start as u64..(end + 1) as u64); - start = val; - end = val; +pub fn ranges_to_bitmap(ranges: &[Range], sorted: bool) -> RoaringBitmap { + if ranges.is_empty() { + return RoaringBitmap::new(); + } + if sorted { + let sample_size = ranges.len().min(10); + let avg_len: u64 = ranges + .iter() + .take(sample_size) + .map(|r| r.end - r.start) + .sum::() + / sample_size as u64; + // from_sorted_iter appends each value in O(1) but must visit every u32. + // insert_range bulk-fills containers but does a binary search per call. + // Crossover is ~6: below that, iterating all values is cheaper. + if avg_len <= 6 { + return RoaringBitmap::from_sorted_iter( + ranges.iter().flat_map(|r| r.start as u32..r.end as u32), + ) + .unwrap(); } } - ranges.push(start as u64..(end + 1) as u64); - ranges + let mut bm = RoaringBitmap::new(); + for r in ranges { + bm.insert_range(r.start as u32..r.end as u32); + } + bm } /// A set of stable row ids backed by a 64-bit Roaring bitmap. @@ -2036,6 +2046,85 @@ mod tests { } } + // ============================================================================ + // Tests for bitmap_to_ranges / ranges_to_bitmap + // ============================================================================ + + #[test] + fn test_bitmap_to_ranges_empty() { + let bm = RoaringBitmap::new(); + assert!(bitmap_to_ranges(&bm).is_empty()); + } + + #[test] + fn test_bitmap_to_ranges_single() { + let bm = RoaringBitmap::from_iter([5]); + assert_eq!(bitmap_to_ranges(&bm), vec![5..6]); + } + + #[test] + fn test_bitmap_to_ranges_contiguous() { + let mut bm = RoaringBitmap::new(); + bm.insert_range(10..20); + assert_eq!(bitmap_to_ranges(&bm), vec![10..20]); + } + + #[test] + fn test_bitmap_to_ranges_multiple() { + let mut bm = RoaringBitmap::new(); + bm.insert_range(0..3); + bm.insert_range(10..15); + bm.insert(100); + assert_eq!(bitmap_to_ranges(&bm), vec![0..3, 10..15, 100..101]); + } + + #[test] + fn test_ranges_to_bitmap_empty() { + let bm = ranges_to_bitmap(&[], true); + assert!(bm.is_empty()); + } + + #[test] + fn test_ranges_to_bitmap_sorted_short_ranges() { + // avg len = 1, uses from_sorted_iter path + let ranges = vec![0..1, 5..6, 10..11]; + let bm = ranges_to_bitmap(&ranges, true); + assert!(bm.contains(0) && bm.contains(5) && bm.contains(10)); + assert_eq!(bm.len(), 3); + } + + #[test] + fn test_ranges_to_bitmap_sorted_long_ranges() { + // avg len = 100, uses insert_range path + let ranges = vec![0..100, 200..300]; + let bm = ranges_to_bitmap(&ranges, true); + assert_eq!(bm.len(), 200); + assert!(bm.contains(0) && bm.contains(99)); + assert!(!bm.contains(100)); + assert!(bm.contains(200) && bm.contains(299)); + } + + #[test] + fn test_ranges_to_bitmap_unsorted() { + let ranges = vec![200..300, 0..100]; + let bm = ranges_to_bitmap(&ranges, false); + assert_eq!(bm.len(), 200); + assert!(bm.contains(0) && bm.contains(250)); + } + + #[test] + fn test_bitmap_ranges_roundtrip() { + let mut original = RoaringBitmap::new(); + original.insert_range(0..50); + original.insert_range(100..200); + original.insert(500); + original.insert_range(1000..1010); + + let ranges = bitmap_to_ranges(&original); + let reconstructed = ranges_to_bitmap(&ranges, true); + assert_eq!(original, reconstructed); + } + // ============================================================================ // Tests for RowIdSet // ============================================================================ diff --git a/rust/lance/src/io/exec/filtered_read.rs b/rust/lance/src/io/exec/filtered_read.rs index 74be5330588..aab830c594c 100644 --- a/rust/lance/src/io/exec/filtered_read.rs +++ b/rust/lance/src/io/exec/filtered_read.rs @@ -32,7 +32,9 @@ use lance_arrow::RecordBatchExt; use lance_core::datatypes::OnMissing; use lance_core::utils::deletion::DeletionVector; use lance_core::utils::futures::FinallyStreamExt; -use lance_core::utils::mask::{bitmap_to_ranges, RowAddrMask, RowAddrSelection, RowAddrTreeMap}; +use lance_core::utils::mask::{ + bitmap_to_ranges, ranges_to_bitmap, RowAddrMask, RowAddrSelection, RowAddrTreeMap, +}; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::{datatypes::Projection, Error, Result}; use lance_datafusion::planner::Planner; @@ -1501,11 +1503,7 @@ impl FilteredReadInternalPlan { let mut rows = RowAddrTreeMap::new(); for (fragment_id, ranges) in &self.rows { if !ranges.is_empty() { - let mut bitmap = RoaringBitmap::new(); - for range in ranges { - bitmap.insert_range(range.start as u32..range.end as u32); - } - rows.insert_bitmap(*fragment_id, bitmap); + rows.insert_bitmap(*fragment_id, ranges_to_bitmap(ranges, true)); } } FilteredReadPlan {