Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 150 additions & 29 deletions rust/lance-encoding/src/encodings/logical/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,8 @@ use crate::{
use lance_core::{datatypes::Field, utils::tokio::spawn_cpu, Result};

use crate::constants::DICT_SIZE_RATIO_META_KEY;
use crate::encodings::logical::primitive::dict::{
DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE,
};
use crate::encodings::logical::primitive::dict::DICT_INDICES_BITS_PER_VALUE;
use crate::version::LanceFileVersion;
use crate::{
buffer::LanceBuffer,
data::{BlockInfo, DataBlockBuilder, FixedWidthDataBlock},
Expand Down Expand Up @@ -3419,6 +3418,7 @@ pub struct PrimitiveStructuralEncoder {
column_index: u32,
field: Field,
encoding_metadata: Arc<HashMap<String, String>>,
version: LanceFileVersion,
}

struct CompressedLevelsChunk {
Expand Down Expand Up @@ -3459,6 +3459,7 @@ impl PrimitiveStructuralEncoder {
compression_strategy,
field,
encoding_metadata,
version: options.version,
})
}

Expand Down Expand Up @@ -4284,7 +4285,7 @@ impl PrimitiveStructuralEncoder {
/// - Dictionary values: cardinality × avg_value_size (actual data)
/// - Dictionary offsets: cardinality × offset_size (32 or 64 bits)
/// - Indices: num_values × offset_size (same as dictionary offsets)
fn estimate_dict_size(data_block: &DataBlock) -> Option<u64> {
fn estimate_dict_size(data_block: &DataBlock, version: LanceFileVersion) -> Option<u64> {
let cardinality = if let Some(cardinality_array) = data_block.get_stat(Stat::Cardinality) {
cardinality_array.as_primitive::<UInt64Type>().value(0)
} else {
Expand All @@ -4294,9 +4295,23 @@ impl PrimitiveStructuralEncoder {
let num_values = data_block.num_values();

match data_block {
DataBlock::FixedWidth(_) => {
// Dictionary: cardinality unique values at 128 bits each
let dict_size = cardinality * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8);
DataBlock::FixedWidth(fixed) => {
if fixed.bits_per_value == 64 && version < LanceFileVersion::V2_2 {
return None;
}
// The current fixed-width dictionary encoding uses i32 indices.
if cardinality > i32::MAX as u64 {
return None;
}
// We currently only support dictionary encoding for 64-bit and 128-bit fixed-width values.
if fixed.bits_per_value != 64 && fixed.bits_per_value != 128 {
return None;
}
if fixed.bits_per_value % 8 != 0 {
return None;
}
// Dictionary: cardinality unique values at value bit width
let dict_size = cardinality * (fixed.bits_per_value / 8);
// Indices: num_values indices at 32 bits each
let indices_size = num_values * (DICT_INDICES_BITS_PER_VALUE / 8);
Some(dict_size + indices_size)
Expand All @@ -4307,6 +4322,11 @@ impl PrimitiveStructuralEncoder {
return None;
}
let bits_per_offset = var.bits_per_offset as u64;
if (bits_per_offset == 32 && cardinality > i32::MAX as u64)
|| (bits_per_offset == 64 && cardinality > i64::MAX as u64)
{
return None;
}

let data_size = data_block.data_size();
let avg_value_size = data_size / num_values;
Expand All @@ -4324,14 +4344,24 @@ impl PrimitiveStructuralEncoder {
}
}

fn should_dictionary_encode(data_block: &DataBlock, field: &Field) -> bool {
fn should_dictionary_encode(
data_block: &DataBlock,
field: &Field,
version: LanceFileVersion,
) -> bool {
// Since we only dictionary encode FixedWidth and VariableWidth blocks for now, we skip
// estimating the size
if !matches!(
data_block,
DataBlock::FixedWidth(_) | DataBlock::VariableWidth(_)
) {
return false;
match data_block {
DataBlock::FixedWidth(fixed) => {
if fixed.bits_per_value == 64 && version < LanceFileVersion::V2_2 {
return false;
}
if fixed.bits_per_value != 64 && fixed.bits_per_value != 128 {
return false;
}
}
DataBlock::VariableWidth(_) => {}
_ => return false,
}

// Don't dictionary encode tiny arrays
Expand Down Expand Up @@ -4367,7 +4397,7 @@ impl PrimitiveStructuralEncoder {
let data_size = data_block.data_size();

// Estimate dictionary-encoded size
let Some(encoded_size) = Self::estimate_dict_size(data_block) else {
let Some(encoded_size) = Self::estimate_dict_size(data_block, version) else {
return false;
};

Expand All @@ -4392,8 +4422,9 @@ impl PrimitiveStructuralEncoder {
let field = self.field.clone();
let encoding_metadata = self.encoding_metadata.clone();
let support_large_chunk = self.support_large_chunk;
let version = self.version;
let task = spawn_cpu(move || {
let num_values = arrays.iter().map(|arr| arr.len() as u64).sum();
let num_values = arrays.iter().map(|arr| arr.len() as u64).sum();

if num_values == 0 {
// We should not encode empty arrays. So if we get here that should mean that we
Expand Down Expand Up @@ -4485,7 +4516,7 @@ impl PrimitiveStructuralEncoder {
num_rows,
support_large_chunk,
)
} else if Self::should_dictionary_encode(&data_block, &field) {
} else if Self::should_dictionary_encode(&data_block, &field, version) {
log::debug!(
"Encoding column {} with {} items using dictionary encoding (mini-block layout)",
column_idx,
Expand Down Expand Up @@ -5739,10 +5770,86 @@ mod tests {
check_round_trip_encoding_of_data(vec![string_array], &test_cases, HashMap::new()).await;
}

#[tokio::test]
async fn test_dictionary_encode_int64() {
use crate::constants::{DICT_SIZE_RATIO_META_KEY, STRUCTURAL_ENCODING_META_KEY};
use crate::testing::{check_round_trip_encoding_of_data, TestCases};
use crate::version::LanceFileVersion;
use arrow_array::{ArrayRef, Int64Array};
use std::collections::HashMap;
use std::sync::Arc;

// Low cardinality with poor RLE opportunity.
let values = (0..1000)
.map(|i| match i % 3 {
0 => 10i64,
1 => 20i64,
_ => 30i64,
})
.collect::<Vec<_>>();
let array = Arc::new(Int64Array::from(values)) as ArrayRef;

let mut metadata = HashMap::new();
metadata.insert(
STRUCTURAL_ENCODING_META_KEY.to_string(),
STRUCTURAL_ENCODING_MINIBLOCK.to_string(),
);
metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.99".to_string());

let test_cases = TestCases::default()
.with_min_file_version(LanceFileVersion::V2_2)
.with_batch_size(1000)
.with_range(0..1000)
.with_indices(vec![0, 1, 10, 999])
.with_expected_encoding("dictionary");

check_round_trip_encoding_of_data(vec![array], &test_cases, metadata).await;
}

#[tokio::test]
async fn test_dictionary_encode_float64() {
use crate::constants::{DICT_SIZE_RATIO_META_KEY, STRUCTURAL_ENCODING_META_KEY};
use crate::testing::{check_round_trip_encoding_of_data, TestCases};
use crate::version::LanceFileVersion;
use arrow_array::{ArrayRef, Float64Array};
use std::collections::HashMap;
use std::sync::Arc;

// Low cardinality with poor RLE opportunity.
let values = (0..1000)
.map(|i| match i % 3 {
0 => 0.1f64,
1 => 0.2f64,
_ => 0.3f64,
})
.collect::<Vec<_>>();
let array = Arc::new(Float64Array::from(values)) as ArrayRef;

let mut metadata = HashMap::new();
metadata.insert(
STRUCTURAL_ENCODING_META_KEY.to_string(),
STRUCTURAL_ENCODING_MINIBLOCK.to_string(),
);
metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.99".to_string());

let test_cases = TestCases::default()
.with_min_file_version(LanceFileVersion::V2_2)
.with_batch_size(1000)
.with_range(0..1000)
.with_indices(vec![0, 1, 10, 999])
.with_expected_encoding("dictionary");

check_round_trip_encoding_of_data(vec![array], &test_cases, metadata).await;
}

// Dictionary encoding decision tests
/// Helper to create FixedWidth test data block with exact cardinality stat injected
/// to ensure consistent test behavior (avoids HLL estimation error)
fn create_test_fixed_data_block(num_values: u64, cardinality: u64) -> DataBlock {
fn create_test_fixed_data_block(
num_values: u64,
cardinality: u64,
bits_per_value: u64,
) -> DataBlock {
use crate::statistics::Stat;

let block_info = BlockInfo::default();
Expand All @@ -5755,9 +5862,14 @@ mod tests {
.unwrap()
.insert(Stat::Cardinality, cardinality_array);

assert_eq!(bits_per_value % 8, 0);
let bytes_per_value = bits_per_value / 8;
DataBlock::FixedWidth(FixedWidthDataBlock {
bits_per_value: 32,
data: crate::buffer::LanceBuffer::from(vec![0u8; (num_values * 4) as usize]),
bits_per_value,
data: crate::buffer::LanceBuffer::from(vec![
0u8;
(num_values * bytes_per_value) as usize
]),
num_values,
block_info,
})
Expand Down Expand Up @@ -5793,16 +5905,16 @@ mod tests {

#[test]
fn test_estimate_dict_size_fixed_width() {
use crate::encodings::logical::primitive::dict::{
DICT_FIXED_WIDTH_BITS_PER_VALUE, DICT_INDICES_BITS_PER_VALUE,
};
use crate::encodings::logical::primitive::dict::DICT_INDICES_BITS_PER_VALUE;

let block = create_test_fixed_data_block(1000, 400);
let estimated_size = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap();
let bits_per_value = 128;
let block = create_test_fixed_data_block(1000, 400, bits_per_value);
let estimated_size =
PrimitiveStructuralEncoder::estimate_dict_size(&block, LanceFileVersion::V2_1).unwrap();

// Dictionary: 400 * 16 bytes (128-bit values)
// Indices: 1000 * 4 bytes (32-bit i32)
let expected_dict_size = 400 * (DICT_FIXED_WIDTH_BITS_PER_VALUE / 8);
let expected_dict_size = 400 * (bits_per_value / 8);
let expected_indices_size = 1000 * (DICT_INDICES_BITS_PER_VALUE / 8);
let expected_total = expected_dict_size + expected_indices_size;

Expand All @@ -5812,7 +5924,8 @@ mod tests {
#[test]
fn test_estimate_dict_size_variable_width() {
let block = create_test_variable_width_block(1000, 400);
let estimated_size = PrimitiveStructuralEncoder::estimate_dict_size(&block).unwrap();
let estimated_size =
PrimitiveStructuralEncoder::estimate_dict_size(&block, LanceFileVersion::V2_1).unwrap();

// Get actual data size
let data_size = block.data_size();
Expand All @@ -5837,7 +5950,11 @@ mod tests {
arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata);
let field = LanceField::try_from(&arrow_field).unwrap();

let result = PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field);
let result = PrimitiveStructuralEncoder::should_dictionary_encode(
&block,
&field,
LanceFileVersion::V2_1,
);

assert!(result, "Should use dictionary encode based on size");
}
Expand All @@ -5847,15 +5964,19 @@ mod tests {
use crate::constants::DICT_SIZE_RATIO_META_KEY;
use lance_core::datatypes::Field as LanceField;

let block = create_test_fixed_data_block(1000, 10);
let block = create_test_fixed_data_block(1000, 1000, 128);

let mut metadata = HashMap::new();
metadata.insert(DICT_SIZE_RATIO_META_KEY.to_string(), "0.8".to_string());
let arrow_field =
arrow_schema::Field::new("test", DataType::Int32, false).with_metadata(metadata);
let field = LanceField::try_from(&arrow_field).unwrap();

let result = PrimitiveStructuralEncoder::should_dictionary_encode(&block, &field);
let result = PrimitiveStructuralEncoder::should_dictionary_encode(
&block,
&field,
LanceFileVersion::V2_1,
);

assert!(!result, "Should not use dictionary encode based on size");
}
Expand Down
Loading