Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 54 additions & 13 deletions rust/lance-encoding/src/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,6 @@ impl ComputeStat for FixedWidthDataBlock {
let max_len = self.bits_per_value / 8;
let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));

let cardidinality_array = match self.bits_per_value {
64 | 128 => Some(self.cardinality()),
_ => None,
};

// compute run count
let run_count_array = self.run_count();

Expand All @@ -119,9 +114,6 @@ impl ComputeStat for FixedWidthDataBlock {
info.insert(Stat::MaxLength, max_len_array);
info.insert(Stat::RunCount, run_count_array);
info.insert(Stat::BytePositionEntropy, byte_position_entropy);
if let Some(cardinality_array) = cardidinality_array {
info.insert(Stat::Cardinality, cardinality_array);
}
}
}

Expand Down Expand Up @@ -315,12 +307,30 @@ impl GetStat for AllNullDataBlock {

impl GetStat for FixedWidthDataBlock {
fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
let block_info = self.block_info.0.read().unwrap();
{
let block_info = self.block_info.0.read().unwrap();

if block_info.is_empty() {
panic!("get_stat should be called after statistics are computed.");
if block_info.is_empty() {
panic!("get_stat should be called after statistics are computed.");
}

if let Some(stat_value) = block_info.get(&stat) {
return Some(stat_value.clone());
}
}

if stat == Stat::Cardinality && (self.bits_per_value == 64 || self.bits_per_value == 128) {
let computed = self.cardinality();
let mut block_info = self.block_info.0.write().unwrap();
Some(
block_info
.entry(stat)
.or_insert_with(|| computed.clone())
.clone(),
)
} else {
None
}
block_info.get(&stat).cloned()
}
}

Expand Down Expand Up @@ -379,7 +389,7 @@ impl FixedWidthDataBlock {
}
}

fn cardinality(&mut self) -> Arc<dyn Array> {
fn cardinality(&self) -> Arc<dyn Array> {
match self.bits_per_value {
64 => {
let u64_slice_ref = self.data.borrow_to_typed_slice::<u64>();
Expand Down Expand Up @@ -1186,4 +1196,35 @@ mod tests {
let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
assert_eq!(actual_run_count, expected_run_count);
}

#[test]
fn test_fixed_width_cardinality_is_lazy() {
let int64_array = Int64Array::from(vec![1, 2, 3, 1, 2, 3, 1]);
let block = DataBlock::from_array(int64_array);

let DataBlock::FixedWidth(fixed) = &block else {
panic!("Expected FixedWidth datablock");
};

let info = fixed.block_info.0.read().unwrap();
assert!(info.contains_key(&Stat::DataSize));
assert!(info.contains_key(&Stat::BitWidth));
assert!(!info.contains_key(&Stat::Cardinality));
}

#[test]
fn test_fixed_width_cardinality_computed_on_demand() {
let int64_array = Int64Array::from(vec![1, 2, 3, 1, 2, 3, 1]);
let block = DataBlock::from_array(int64_array);

let cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
assert_eq!(cardinality, 3);

let DataBlock::FixedWidth(fixed) = &block else {
panic!("Expected FixedWidth datablock");
};

let info = fixed.block_info.0.read().unwrap();
assert!(info.contains_key(&Stat::Cardinality));
}
}