Skip to content
3 changes: 3 additions & 0 deletions protos/encodings_v2_1.proto
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ message MiniBlockLayout {
// The page already records how many rows are in the page. For mini-block we also need to know how
// many "items" are in the page. A row and an item are the same thing unless the page has lists.
uint64 num_items = 9;

// Since Lance 2.2, miniblocks have larger chunk sizes (>= 64KB)
bool has_large_chunk = 10;
}

// A layout used for pages where the data is large
Expand Down
17 changes: 5 additions & 12 deletions rust/lance-encoding/benches/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,6 @@ const PRIMITIVE_TYPES: &[DataType] = &[
// schema doesn't yet parse them in the context of a fixed size list.
const PRIMITIVE_TYPES_FOR_FSL: &[DataType] = &[DataType::Int8, DataType::Float32];

const ENCODING_OPTIONS: EncodingOptions = EncodingOptions {
cache_bytes_per_column: 8 * 1024 * 1024,
max_page_bytes: 32 * 1024 * 1024,
keep_original_array: true,
buffer_alignment: 64,
};

fn bench_decode(c: &mut Criterion) {
let rt = tokio::runtime::Runtime::new().unwrap();
let mut group = c.benchmark_group("decode_primitive");
Expand All @@ -73,7 +66,7 @@ fn bench_decode(c: &mut Criterion) {
&data,
lance_schema,
encoding_strategy.as_ref(),
&ENCODING_OPTIONS,
&EncodingOptions::default(),
))
.unwrap();

Expand Down Expand Up @@ -138,7 +131,7 @@ fn bench_decode_fsl(c: &mut Criterion) {
&data,
lance_schema,
encoding_strategy.as_ref(),
&ENCODING_OPTIONS,
&EncodingOptions::default(),
))
.unwrap();
b.iter(|| {
Expand Down Expand Up @@ -204,7 +197,7 @@ fn bench_decode_str_with_dict_encoding(c: &mut Criterion) {
&data,
lance_schema,
encoding_strategy.as_ref(),
&ENCODING_OPTIONS,
&EncodingOptions::default(),
))
.unwrap();
b.iter(|| {
Expand Down Expand Up @@ -279,7 +272,7 @@ fn bench_decode_packed_struct(c: &mut Criterion) {
&data,
lance_schema,
encoding_strategy.as_ref(),
&ENCODING_OPTIONS,
&EncodingOptions::default(),
))
.unwrap();

Expand Down Expand Up @@ -336,7 +329,7 @@ fn bench_decode_str_with_fixed_size_binary_encoding(c: &mut Criterion) {
&data,
lance_schema,
encoding_strategy.as_ref(),
&ENCODING_OPTIONS,
&EncodingOptions::default(),
))
.unwrap();
b.iter(|| {
Expand Down
37 changes: 31 additions & 6 deletions rust/lance-encoding/src/compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ impl DefaultCompressionStrategy {
}

/// Parse compression parameters from field metadata
fn parse_field_metadata(field: &Field) -> CompressionFieldParams {
fn parse_field_metadata(field: &Field, version: &LanceFileVersion) -> CompressionFieldParams {
let mut params = CompressionFieldParams::default();

// Parse compression method
Expand Down Expand Up @@ -335,6 +335,27 @@ impl DefaultCompressionStrategy {
}
}

// Parse minichunk size
if let Some(minichunk_size_str) = field
Comment thread
niyue marked this conversation as resolved.
.metadata
.get(super::constants::MINICHUNK_SIZE_META_KEY)
{
if let Ok(minichunk_size) = minichunk_size_str.parse::<i64>() {
// for lance v2.1, only 32kb or smaller is supported
if minichunk_size >= 32 * 1024 && *version <= LanceFileVersion::V2_1 {
log::warn!(
"minichunk_size '{}' too large for version '{}', using default",
minichunk_size,
version
);
} else {
params.minichunk_size = Some(minichunk_size);
}
} else {
log::warn!("Invalid minichunk_size '{}', skipping", minichunk_size_str);
}
}

params
}

Expand Down Expand Up @@ -377,22 +398,22 @@ impl DefaultCompressionStrategy {

// 1. Check for explicit "none" compression
if params.compression.as_deref() == Some("none") {
return Ok(Box::new(BinaryMiniBlockEncoder::default()));
return Ok(Box::new(BinaryMiniBlockEncoder::new(params.minichunk_size)));
}

// 2. Check for explicit "fsst" compression
if params.compression.as_deref() == Some("fsst") {
return Ok(Box::new(FsstMiniBlockEncoder::default()));
return Ok(Box::new(FsstMiniBlockEncoder::new(params.minichunk_size)));
}

// 3. Choose base encoder (FSST or Binary) based on data characteristics
let mut base_encoder: Box<dyn MiniBlockCompressor> = if max_len
>= FSST_LEAST_INPUT_MAX_LENGTH
&& data_size >= FSST_LEAST_INPUT_SIZE as u64
{
Box::new(FsstMiniBlockEncoder::default())
Box::new(FsstMiniBlockEncoder::new(params.minichunk_size))
} else {
Box::new(BinaryMiniBlockEncoder::default())
Box::new(BinaryMiniBlockEncoder::new(params.minichunk_size))
};

// 4. Apply general compression if configured
Expand All @@ -415,7 +436,7 @@ impl DefaultCompressionStrategy {
.get_field_params(&field.name, &field.data_type());

// Override with field metadata if present (highest priority)
let metadata_params = Self::parse_field_metadata(field);
let metadata_params = Self::parse_field_metadata(field, &self.version);
field_params.merge(&metadata_params);

field_params
Expand Down Expand Up @@ -1105,6 +1126,7 @@ mod tests {
compression: Some("lz4".to_string()),
compression_level: None,
bss: Some(BssMode::Off), // Explicitly disable BSS to test RLE
minichunk_size: None,
},
);

Expand Down Expand Up @@ -1136,6 +1158,7 @@ mod tests {
compression: Some("zstd".to_string()),
compression_level: Some(3),
bss: Some(BssMode::Off), // Disable BSS to test RLE
minichunk_size: None,
},
);

Expand Down Expand Up @@ -1259,6 +1282,7 @@ mod tests {
compression: Some("zstd".to_string()),
compression_level: Some(6),
bss: None,
minichunk_size: None,
},
);

Expand Down Expand Up @@ -1401,6 +1425,7 @@ mod tests {
compression: Some("lz4".to_string()),
compression_level: None,
bss: None,
minichunk_size: None,
},
);

Expand Down
9 changes: 9 additions & 0 deletions rust/lance-encoding/src/compression_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ pub struct CompressionFieldParams {

/// Byte stream split mode for floating point data
pub bss: Option<BssMode>,

/// Minichunk size threshold for encoding
pub minichunk_size: Option<i64>,
}

impl CompressionParams {
Expand Down Expand Up @@ -131,6 +134,9 @@ impl CompressionFieldParams {
if other.bss.is_some() {
self.bss = other.bss;
}
if other.minichunk_size.is_some() {
self.minichunk_size = other.minichunk_size;
}
}
}

Expand Down Expand Up @@ -197,6 +203,7 @@ mod tests {
compression: Some("lz4".to_string()),
compression_level: None,
bss: Some(BssMode::On),
minichunk_size: None,
};

params.merge(&other);
Expand All @@ -210,6 +217,7 @@ mod tests {
compression: Some("zstd".to_string()),
compression_level: Some(3),
bss: Some(BssMode::Auto),
minichunk_size: None,
};

params.merge(&another);
Expand Down Expand Up @@ -241,6 +249,7 @@ mod tests {
compression: Some("zstd".to_string()),
compression_level: Some(3),
bss: None,
minichunk_size: None,
},
);

Expand Down
2 changes: 2 additions & 0 deletions rust/lance-encoding/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ pub const COMPRESSION_META_KEY: &str = "lance-encoding:compression";
pub const COMPRESSION_LEVEL_META_KEY: &str = "lance-encoding:compression-level";
/// Metadata key for specifying RLE (Run-Length Encoding) threshold
pub const RLE_THRESHOLD_META_KEY: &str = "lance-encoding:rle-threshold";
/// Metadata key for specifying minichunk size
pub const MINICHUNK_SIZE_META_KEY: &str = "lance-encoding:minichunk-size";

// Dictionary encoding metadata keys
/// Metadata key for specifying dictionary encoding threshold divisor
Expand Down
14 changes: 14 additions & 0 deletions rust/lance-encoding/src/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,9 @@ pub struct EncodingOptions {
/// The encoder needs to know this so it figures the position of out-of-line
/// buffers correctly
pub buffer_alignment: u64,

/// The Lance file version being written
pub version: LanceFileVersion,
}

impl Default for EncodingOptions {
Expand All @@ -243,10 +246,20 @@ impl Default for EncodingOptions {
max_page_bytes: 32 * 1024 * 1024,
keep_original_array: true,
buffer_alignment: 64,
version: LanceFileVersion::default(),
}
}
}

impl EncodingOptions {
/// If true (for Lance file version 2.2+), miniblock chunk sizes are u32,
/// to allow storing larger chunks and their sizes for better compression.
/// For Lance file version 2.1, miniblock chunk sizes are u16.
pub fn support_large_chunk(&self) -> bool {
self.version >= LanceFileVersion::V2_2
}
}

/// A trait to pick which kind of field encoding to use for a field
///
/// Unlike the ArrayEncodingStrategy, the field encoding strategy is
Expand Down Expand Up @@ -758,6 +771,7 @@ mod tests {
compression: Some("lz4".to_string()),
compression_level: None,
bss: None,
minichunk_size: None,
},
);

Expand Down
Loading