diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index b83be5f520a..c633c9b93fa 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -66,9 +66,8 @@ use crate::{ use lance_core::{datatypes::Field, utils::tokio::spawn_cpu, Result}; use crate::constants::DICT_SIZE_RATIO_META_KEY; -use crate::encodings::logical::primitive::dict::{ - DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE, -}; +use crate::encodings::logical::primitive::dict::DICT_INDICES_BITS_PER_VALUE; +use crate::version::LanceFileVersion; use crate::{ buffer::LanceBuffer, data::{BlockInfo, DataBlockBuilder, FixedWidthDataBlock}, @@ -3419,6 +3418,7 @@ pub struct PrimitiveStructuralEncoder { column_index: u32, field: Field, encoding_metadata: Arc>, + version: LanceFileVersion, } struct CompressedLevelsChunk { @@ -3459,6 +3459,7 @@ impl PrimitiveStructuralEncoder { compression_strategy, field, encoding_metadata, + version: options.version, }) } @@ -4284,7 +4285,7 @@ impl PrimitiveStructuralEncoder { /// - Dictionary values: cardinality × avg_value_size (actual data) /// - Dictionary offsets: cardinality × offset_size (32 or 64 bits) /// - Indices: num_values × offset_size (same as dictionary offsets) - fn estimate_dict_size(data_block: &DataBlock) -> Option { + fn estimate_dict_size(data_block: &DataBlock, version: LanceFileVersion) -> Option { let cardinality = if let Some(cardinality_array) = data_block.get_stat(Stat::Cardinality) { cardinality_array.as_primitive::().value(0) } else { @@ -4294,9 +4295,23 @@ impl PrimitiveStructuralEncoder { let num_values = data_block.num_values(); match data_block { - DataBlock::FixedWidth(_) => { - // Dictionary: cardinality unique values at 128 bits each - let dict_size = cardinality * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8); + DataBlock::FixedWidth(fixed) => { + if fixed.bits_per_value == 64 && version < LanceFileVersion::V2_2 { + return None; + } + // The current fixed-width dictionary encoding uses i32 indices. + if cardinality > i32::MAX as u64 { + return None; + } + // We currently only support dictionary encoding for 64-bit and 128-bit fixed-width values. + if fixed.bits_per_value != 64 && fixed.bits_per_value != 128 { + return None; + } + if fixed.bits_per_value % 8 != 0 { + return None; + } + // Dictionary: cardinality unique values at value bit width + let dict_size = cardinality * (fixed.bits_per_value / 8); // Indices: num_values indices at 32 bits each let indices_size = num_values * (DICT_INDICES_BITS_PER_VALUE / 8); Some(dict_size + indices_size) @@ -4307,6 +4322,11 @@ impl PrimitiveStructuralEncoder { return None; } let bits_per_offset = var.bits_per_offset as u64; + if (bits_per_offset == 32 && cardinality > i32::MAX as u64) + || (bits_per_offset == 64 && cardinality > i64::MAX as u64) + { + return None; + } let data_size = data_block.data_size(); let avg_value_size = data_size / num_values; @@ -4324,14 +4344,24 @@ impl PrimitiveStructuralEncoder { } } - fn should_dictionary_encode(data_block: &DataBlock, field: &Field) -> bool { + fn should_dictionary_encode( + data_block: &DataBlock, + field: &Field, + version: LanceFileVersion, + ) -> bool { // Since we only dictionary encode FixedWidth and VariableWidth blocks for now, we skip // estimating the size - if !matches!( - data_block, - DataBlock::FixedWidth(_) | DataBlock::VariableWidth(_) - ) { - return false; + match data_block { + DataBlock::FixedWidth(fixed) => { + if fixed.bits_per_value == 64 && version < LanceFileVersion::V2_2 { + return false; + } + if fixed.bits_per_value != 64 && fixed.bits_per_value != 128 { + return false; + } + } + DataBlock::VariableWidth(_) => {} + _ => return false, } // Don't dictionary encode tiny arrays @@ -4367,7 +4397,7 @@ impl PrimitiveStructuralEncoder { let data_size = data_block.data_size(); // Estimate dictionary-encoded size - let Some(encoded_size) = Self::estimate_dict_size(data_block) else { + let Some(encoded_size) = Self::estimate_dict_size(data_block, version) else { return false; }; @@ -4392,8 +4422,9 @@ impl PrimitiveStructuralEncoder { let field = self.field.clone(); let encoding_metadata = self.encoding_metadata.clone(); let support_large_chunk = self.support_large_chunk; + let version = self.version; let task = spawn_cpu(move || { - let num_values = arrays.iter().map(|arr| arr.len() as u64).sum(); + let num_values = arrays.iter().map(|arr| arr.len() as u64).sum(); if num_values == 0 { // We should not encode empty arrays. So if we get here that should mean that we @@ -4485,7 +4516,7 @@ impl PrimitiveStructuralEncoder { num_rows, support_large_chunk, ) - } else if Self::should_dictionary_encode(&data_block, &field) { + } else if Self::should_dictionary_encode(&data_block, &field, version) { log::debug!( "Encoding column {} with {} items using dictionary encoding (mini-block layout)", column_idx, @@ -5739,10 +5770,86 @@ mod tests { check_round_trip_encoding_of_data(vec![string_array], &test_cases, HashMap::new()).await; } + #[tokio::test] + async fn test_dictionary_encode_int64() { + use crate::constants::{DICT_SIZE_RATIO_META_KEY, STRUCTURAL_ENCODING_META_KEY}; + use crate::testing::{check_round_trip_encoding_of_data, TestCases}; + use crate::version::LanceFileVersion; + use arrow_array::{ArrayRef, Int64Array}; + use std::collections::HashMap; + use std::sync::Arc; + + // Low cardinality with poor RLE opportunity. + let values = (0..1000) + .map(|i| match i % 3 { + 0 => 10i64, + 1 => 20i64, + _ => 30i64, + }) + .collect::>(); + let array = Arc::new(Int64Array::from(values)) as ArrayRef; + + let mut metadata = HashMap::new(); + metadata.insert( + STRUCTURAL_ENCODING_META_KEY.to_string(), + STRUCTURAL_ENCODING_MINIBLOCK.to_string(), + ); + metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.99".to_string()); + + let test_cases = TestCases::default() + .with_min_file_version(LanceFileVersion::V2_2) + .with_batch_size(1000) + .with_range(0..1000) + .with_indices(vec![0, 1, 10, 999]) + .with_expected_encoding("dictionary"); + + check_round_trip_encoding_of_data(vec![array], &test_cases, metadata).await; + } + + #[tokio::test] + async fn test_dictionary_encode_float64() { + use crate::constants::{DICT_SIZE_RATIO_META_KEY, STRUCTURAL_ENCODING_META_KEY}; + use crate::testing::{check_round_trip_encoding_of_data, TestCases}; + use crate::version::LanceFileVersion; + use arrow_array::{ArrayRef, Float64Array}; + use std::collections::HashMap; + use std::sync::Arc; + + // Low cardinality with poor RLE opportunity. + let values = (0..1000) + .map(|i| match i % 3 { + 0 => 0.1f64, + 1 => 0.2f64, + _ => 0.3f64, + }) + .collect::>(); + let array = Arc::new(Float64Array::from(values)) as ArrayRef; + + let mut metadata = HashMap::new(); + metadata.insert( + STRUCTURAL_ENCODING_META_KEY.to_string(), + STRUCTURAL_ENCODING_MINIBLOCK.to_string(), + ); + metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.99".to_string()); + + let test_cases = TestCases::default() + .with_min_file_version(LanceFileVersion::V2_2) + .with_batch_size(1000) + .with_range(0..1000) + .with_indices(vec![0, 1, 10, 999]) + .with_expected_encoding("dictionary"); + + check_round_trip_encoding_of_data(vec![array], &test_cases, metadata).await; + } + // Dictionary encoding decision tests /// Helper to create FixedWidth test data block with exact cardinality stat injected /// to ensure consistent test behavior (avoids HLL estimation error) - fn create_test_fixed_data_block(num_values: u64, cardinality: u64) -> DataBlock { + fn create_test_fixed_data_block( + num_values: u64, + cardinality: u64, + bits_per_value: u64, + ) -> DataBlock { use crate::statistics::Stat; let block_info = BlockInfo::default(); @@ -5755,9 +5862,14 @@ mod tests { .unwrap() .insert(Stat::Cardinality, cardinality_array); + assert_eq!(bits_per_value % 8, 0); + let bytes_per_value = bits_per_value / 8; DataBlock::FixedWidth(FixedWidthDataBlock { - bits_per_value: 32, - data: crate::buffer::LanceBuffer::from(vec![0u8; (num_values * 4) as usize]), + bits_per_value, + data: crate::buffer::LanceBuffer::from(vec![ + 0u8; + (num_values * bytes_per_value) as usize + ]), num_values, block_info, }) @@ -5793,16 +5905,16 @@ mod tests { #[test] fn test_estimate_dict_size_fixed_width() { - use crate::encodings::logical::primitive::dict::{ - DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE, - }; + use crate::encodings::logical::primitive::dict::DICT_INDICES_BITS_PER_VALUE; - let block = create_test_fixed_data_block(1000, 400); - let estimated_size = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap(); + let bits_per_value = 128; + let block = create_test_fixed_data_block(1000, 400, bits_per_value); + let estimated_size = + PrimitiveStructuralEncoder::estimate_dict_size(&block, LanceFileVersion::V2_1).unwrap(); // Dictionary: 400 * 16 bytes (128-bit values) // Indices: 1000 * 4 bytes (32-bit i32) - let expected_dict_size = 400 * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8); + let expected_dict_size = 400 * (bits_per_value / 8); let expected_indices_size = 1000 * (DICT_INDICES_BITS_PER_VALUE / 8); let expected_total = expected_dict_size + expected_indices_size; @@ -5812,7 +5924,8 @@ mod tests { #[test] fn test_estimate_dict_size_variable_width() { let block = create_test_variable_width_block(1000, 400); - let estimated_size = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap(); + let estimated_size = + PrimitiveStructuralEncoder::estimate_dict_size(&block, LanceFileVersion::V2_1).unwrap(); // Get actual data size let data_size = block.data_size(); @@ -5837,7 +5950,11 @@ mod tests { arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata); let field = LanceField::try_from(&arrow_field).unwrap(); - let result = PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field); + let result = PrimitiveStructuralEncoder::should_dictionary_encode( + &block, + &field, + LanceFileVersion::V2_1, + ); assert!(result, "Should use dictionary encode based on size"); } @@ -5847,7 +5964,7 @@ mod tests { use crate::constants::DICT_SIZE_RATIO_META_KEY; use lance_core::datatypes::Field as LanceField; - let block = create_test_fixed_data_block(1000, 10); + let block = create_test_fixed_data_block(1000, 1000, 128); let mut metadata = HashMap::new(); metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string()); @@ -5855,7 +5972,11 @@ mod tests { arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata); let field = LanceField::try_from(&arrow_field).unwrap(); - let result = PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field); + let result = PrimitiveStructuralEncoder::should_dictionary_encode( + &block, + &field, + LanceFileVersion::V2_1, + ); assert!(!result, "Should not use dictionary encode based on size"); } diff --git a/rust/lance-encoding/src/encodings/logical/primitive/dict.rs b/rust/lance-encoding/src/encodings/logical/primitive/dict.rs index b0de1191cbf..3557c1651a4 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive/dict.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive/dict.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, sync::Arc}; -/// Bits per value for FixedWidth dictionary values (currently only 128-bit is supported) +/// Bits per value for FixedWidth dictionary values (legacy default for 128-bit values) pub const DICT_FIXED_WIDTH_BITS_PER_VALUE: u64 = 128; /// Bits per index for dictionary indices (always i32) pub const DICT_INDICES_BITS_PER_VALUE: u64 = 32; @@ -114,7 +114,7 @@ pub fn normalize_dict_nulls(array: Arc) -> Result> { /// Dictionary encodes a data block /// -/// Currently only supported for some common cases (string / binary / u128) +/// Currently only supported for some common cases (string / binary / 64-bit / 128-bit) /// /// Returns a block of indices (will always be a fixed width data block) and a block of dictionary pub fn dictionary_encode(mut data_block: DataBlock) -> (DataBlock, DataBlock) { @@ -125,39 +125,77 @@ pub fn dictionary_encode(mut data_block: DataBlock) -> (DataBlock, DataBlock) { .value(0); match data_block { DataBlock::FixedWidth(ref mut fixed_width_data_block) => { - // Currently FixedWidth DataBlock with only bits_per_value 128 has cardinality - // TODO: a follow up PR to support `FixedWidth DataBlock with bits_per_value == 256`. - let mut map = HashMap::new(); - let u128_slice = fixed_width_data_block.data.borrow_to_typed_slice::(); - let u128_slice = u128_slice.as_ref(); - let mut dictionary_buffer = Vec::with_capacity(cardinality as usize); - let mut indices_buffer = Vec::with_capacity(fixed_width_data_block.num_values as usize); - let mut curr_idx: i32 = 0; - u128_slice.iter().for_each(|&value| { - let idx = *map.entry(value).or_insert_with(|| { - dictionary_buffer.push(value); - curr_idx += 1; - curr_idx - 1 - }); - indices_buffer.push(idx); - }); - let dictionary_data_block = DataBlock::FixedWidth(FixedWidthDataBlock { - data: LanceBuffer::reinterpret_vec(dictionary_buffer), - bits_per_value: DICT_FIXED_WIDTH_BITS_PER_VALUE, - num_values: curr_idx as u64, - block_info: BlockInfo::default(), - }); - let mut indices_data_block = DataBlock::FixedWidth(FixedWidthDataBlock { - data: LanceBuffer::reinterpret_vec(indices_buffer), - bits_per_value: DICT_INDICES_BITS_PER_VALUE, - num_values: fixed_width_data_block.num_values, - block_info: BlockInfo::default(), - }); - // Todo: if we decide to do eager statistics computing, wrap statistics computing - // in DataBlock constructor. - indices_data_block.compute_stat(); - - (indices_data_block, dictionary_data_block) + match fixed_width_data_block.bits_per_value { + 64 => { + let mut map = HashMap::new(); + let u64_slice = fixed_width_data_block.data.borrow_to_typed_slice::(); + let u64_slice = u64_slice.as_ref(); + let mut dictionary_buffer = Vec::with_capacity(cardinality as usize); + let mut indices_buffer = + Vec::with_capacity(fixed_width_data_block.num_values as usize); + let mut curr_idx: i32 = 0; + u64_slice.iter().for_each(|&value| { + let idx = *map.entry(value).or_insert_with(|| { + dictionary_buffer.push(value); + curr_idx += 1; + curr_idx - 1 + }); + indices_buffer.push(idx); + }); + let mut dictionary_data_block = DataBlock::FixedWidth(FixedWidthDataBlock { + data: LanceBuffer::reinterpret_vec(dictionary_buffer), + bits_per_value: 64, + num_values: curr_idx as u64, + block_info: BlockInfo::default(), + }); + dictionary_data_block.compute_stat(); + let mut indices_data_block = DataBlock::FixedWidth(FixedWidthDataBlock { + data: LanceBuffer::reinterpret_vec(indices_buffer), + bits_per_value: DICT_INDICES_BITS_PER_VALUE, + num_values: fixed_width_data_block.num_values, + block_info: BlockInfo::default(), + }); + indices_data_block.compute_stat(); + (indices_data_block, dictionary_data_block) + } + 128 => { + // TODO: a follow up PR to support `FixedWidth DataBlock with bits_per_value == 256`. + let mut map = HashMap::new(); + let u128_slice = fixed_width_data_block.data.borrow_to_typed_slice::(); + let u128_slice = u128_slice.as_ref(); + let mut dictionary_buffer = Vec::with_capacity(cardinality as usize); + let mut indices_buffer = + Vec::with_capacity(fixed_width_data_block.num_values as usize); + let mut curr_idx: i32 = 0; + u128_slice.iter().for_each(|&value| { + let idx = *map.entry(value).or_insert_with(|| { + dictionary_buffer.push(value); + curr_idx += 1; + curr_idx - 1 + }); + indices_buffer.push(idx); + }); + let mut dictionary_data_block = DataBlock::FixedWidth(FixedWidthDataBlock { + data: LanceBuffer::reinterpret_vec(dictionary_buffer), + bits_per_value: DICT_FIXED_WIDTH_BITS_PER_VALUE, + num_values: curr_idx as u64, + block_info: BlockInfo::default(), + }); + dictionary_data_block.compute_stat(); + let mut indices_data_block = DataBlock::FixedWidth(FixedWidthDataBlock { + data: LanceBuffer::reinterpret_vec(indices_buffer), + bits_per_value: DICT_INDICES_BITS_PER_VALUE, + num_values: fixed_width_data_block.num_values, + block_info: BlockInfo::default(), + }); + indices_data_block.compute_stat(); + (indices_data_block, dictionary_data_block) + } + other => unreachable!( + "dictionary encode called with FixedWidth DataBlock bits_per_value={}", + other + ), + } } DataBlock::VariableWidth(ref mut variable_width_data_block) => { match variable_width_data_block.bits_per_offset { diff --git a/rust/lance-encoding/src/statistics.rs b/rust/lance-encoding/src/statistics.rs index f1c7be1934f..7d39537c4a9 100644 --- a/rust/lance-encoding/src/statistics.rs +++ b/rust/lance-encoding/src/statistics.rs @@ -102,10 +102,9 @@ impl ComputeStat for FixedWidthDataBlock { let max_len = self.bits_per_value / 8; let max_len_array = Arc::new(UInt64Array::from(vec![max_len])); - let cardidinality_array = if self.bits_per_value == 128 { - Some(self.cardinality()) - } else { - None + let cardidinality_array = match self.bits_per_value { + 64 | 128 => Some(self.cardinality()), + _ => None, }; // compute run count @@ -382,6 +381,20 @@ impl FixedWidthDataBlock { fn cardinality(&mut self) -> Arc { match self.bits_per_value { + 64 => { + let u64_slice_ref = self.data.borrow_to_typed_slice::(); + let u64_slice = u64_slice_ref.as_ref(); + + const PRECISION: u8 = 4; + let mut hll: HyperLogLogPlus = + HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()) + .unwrap(); + for val in u64_slice { + hll.insert(val); + } + let cardinality = hll.count() as u64; + Arc::new(UInt64Array::from(vec![cardinality])) + } 128 => { let u128_slice_ref = self.data.borrow_to_typed_slice::(); let u128_slice = u128_slice_ref.as_ref(); diff --git a/rust/lance-encoding/src/testing.rs b/rust/lance-encoding/src/testing.rs index 9161bcef99f..9b4cadfd8c7 100644 --- a/rust/lance-encoding/src/testing.rs +++ b/rust/lance-encoding/src/testing.rs @@ -640,6 +640,9 @@ fn collect_page_encoding(layout: &PageLayout, actual_chain: &mut Vec) -> if let Some(ref layout_type) = layout.layout { match layout_type { Layout::MiniBlockLayout(mini_block) => { + if mini_block.dictionary.is_some() { + actual_chain.push("dictionary".to_string()); + } // Check value compression if let Some(ref value_comp) = mini_block.value_compression { let chain = extract_array_encoding_chain(value_comp);