diff --git a/rust/lance-encoding/src/statistics.rs b/rust/lance-encoding/src/statistics.rs index 7d39537c4a9..b5cb65ee879 100644 --- a/rust/lance-encoding/src/statistics.rs +++ b/rust/lance-encoding/src/statistics.rs @@ -102,11 +102,6 @@ impl ComputeStat for FixedWidthDataBlock { let max_len = self.bits_per_value / 8; let max_len_array = Arc::new(UInt64Array::from(vec![max_len])); - let cardidinality_array = match self.bits_per_value { - 64 | 128 => Some(self.cardinality()), - _ => None, - }; - // compute run count let run_count_array = self.run_count(); @@ -119,9 +114,6 @@ impl ComputeStat for FixedWidthDataBlock { info.insert(Stat::MaxLength, max_len_array); info.insert(Stat::RunCount, run_count_array); info.insert(Stat::BytePositionEntropy, byte_position_entropy); - if let Some(cardinality_array) = cardidinality_array { - info.insert(Stat::Cardinality, cardinality_array); - } } } @@ -315,12 +307,30 @@ impl GetStat for AllNullDataBlock { impl GetStat for FixedWidthDataBlock { fn get_stat(&self, stat: Stat) -> Option> { - let block_info = self.block_info.0.read().unwrap(); + { + let block_info = self.block_info.0.read().unwrap(); - if block_info.is_empty() { - panic!("get_stat should be called after statistics are computed."); + if block_info.is_empty() { + panic!("get_stat should be called after statistics are computed."); + } + + if let Some(stat_value) = block_info.get(&stat) { + return Some(stat_value.clone()); + } + } + + if stat == Stat::Cardinality && (self.bits_per_value == 64 || self.bits_per_value == 128) { + let computed = self.cardinality(); + let mut block_info = self.block_info.0.write().unwrap(); + Some( + block_info + .entry(stat) + .or_insert_with(|| computed.clone()) + .clone(), + ) + } else { + None } - block_info.get(&stat).cloned() } } @@ -379,7 +389,7 @@ impl FixedWidthDataBlock { } } - fn cardinality(&mut self) -> Arc { + fn cardinality(&self) -> Arc { match self.bits_per_value { 64 => { let u64_slice_ref = self.data.borrow_to_typed_slice::(); @@ -1186,4 +1196,35 @@ mod tests { let actual_run_count = block.expect_single_stat::(Stat::RunCount); assert_eq!(actual_run_count, expected_run_count); } + + #[test] + fn test_fixed_width_cardinality_is_lazy() { + let int64_array = Int64Array::from(vec![1, 2, 3, 1, 2, 3, 1]); + let block = DataBlock::from_array(int64_array); + + let DataBlock::FixedWidth(fixed) = &block else { + panic!("Expected FixedWidth datablock"); + }; + + let info = fixed.block_info.0.read().unwrap(); + assert!(info.contains_key(&Stat::DataSize)); + assert!(info.contains_key(&Stat::BitWidth)); + assert!(!info.contains_key(&Stat::Cardinality)); + } + + #[test] + fn test_fixed_width_cardinality_computed_on_demand() { + let int64_array = Int64Array::from(vec![1, 2, 3, 1, 2, 3, 1]); + let block = DataBlock::from_array(int64_array); + + let cardinality = block.expect_single_stat::(Stat::Cardinality); + assert_eq!(cardinality, 3); + + let DataBlock::FixedWidth(fixed) = &block else { + panic!("Expected FixedWidth datablock"); + }; + + let info = fixed.block_info.0.read().unwrap(); + assert!(info.contains_key(&Stat::Cardinality)); + } }