diff --git a/Cargo.lock b/Cargo.lock index cfcc4899c96..518320fdf12 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4992,6 +4992,7 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ipc", "arrow-schema", "arrow-select", "async-recursion", @@ -4999,7 +5000,9 @@ dependencies = [ "byteorder", "bytes", "criterion", + "datafusion", "datafusion-common", + "datafusion-expr", "deepsize", "futures", "lance-arrow", diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 3193de8daa4..9100857bb49 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3562,13 +3562,16 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ipc", "arrow-schema", "arrow-select", "async-recursion", "async-trait", "byteorder", "bytes", + "datafusion", "datafusion-common", + "datafusion-expr", "deepsize", "futures", "lance-arrow", diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index ea5996aaeed..03c3b956740 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -491,6 +491,7 @@ fn convert_to_java_operation_inner<'local>( table_metadata_updates, schema_metadata_updates, field_metadata_updates, + column_stats: _, } => { let config_updates_obj = export_update_map(env, &config_updates)?; let table_metadata_updates_obj = export_update_map(env, &table_metadata_updates)?; @@ -812,6 +813,7 @@ fn convert_to_rust_operation( table_metadata_updates, schema_metadata_updates, field_metadata_updates, + column_stats: None, } } "Append" => { diff --git a/java/src/test/java/org/lance/FileReaderWriterTest.java b/java/src/test/java/org/lance/FileReaderWriterTest.java index c645acdcaa2..1e93011b767 100644 --- a/java/src/test/java/org/lance/FileReaderWriterTest.java +++ b/java/src/test/java/org/lance/FileReaderWriterTest.java @@ -13,10 +13,18 @@ */ package org.lance; -import org.lance.file.LanceFileReader; -import org.lance.file.LanceFileWriter; -import org.lance.util.Range; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; @@ -30,28 +38,49 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; +import org.lance.file.LanceFileReader; +import org.lance.file.LanceFileWriter; +import org.lance.util.Range; public class FileReaderWriterTest { + /** + * Schema metadata keys written by the file format when column stats are present (must match + * Rust). + */ + private static final String COLUMN_STATS_BUFFER_INDEX_KEY = "lance:column_stats:buffer_index"; + + private static final String COLUMN_STATS_VERSION_KEY = "lance:column_stats:version"; + + /** + * Expected schema for a simple file with x (Int64) and y (Utf8), including column-stats metadata. + */ + private static Schema expectedSchemaWithColumnStats() { + Map metadata = new HashMap<>(); + metadata.put(COLUMN_STATS_BUFFER_INDEX_KEY, "1"); + metadata.put(COLUMN_STATS_VERSION_KEY, "1"); + return new Schema(Arrays.asList(Field.nullable("x", new ArrowType.Int(64, true)), + Field.nullable("y", new ArrowType.Utf8())), + metadata); + } + + /** + * Assert reader schema has same fields and column-stats metadata as expected (avoids + * Schema.equals quirks). + */ + private static void assertSchemaWithColumnStats(Schema expected, Schema actual) { + assertEquals(expected.getFields(), actual.getFields()); + assertNotNull( + actual.getMetadata(), "Schema metadata should be present when column stats are written"); + assertEquals(expected.getMetadata().get(COLUMN_STATS_BUFFER_INDEX_KEY), + actual.getMetadata().get(COLUMN_STATS_BUFFER_INDEX_KEY)); + assertEquals(expected.getMetadata().get(COLUMN_STATS_VERSION_KEY), + actual.getMetadata().get(COLUMN_STATS_VERSION_KEY)); + } private VectorSchemaRoot createBatch(BufferAllocator allocator) throws IOException { - Schema schema = - new Schema( - Arrays.asList( - Field.nullable("x", new ArrowType.Int(64, true)), - Field.nullable("y", new ArrowType.Utf8())), - null); + Schema schema = new Schema(Arrays.asList(Field.nullable("x", new ArrowType.Int(64, true)), + Field.nullable("y", new ArrowType.Utf8())), + null); VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator); root.allocateNew(); BigIntVector iVector = (BigIntVector) root.getVector("x"); @@ -82,15 +111,10 @@ void testBasicRead(@TempDir Path tempDir) throws Exception { createSimpleFile(filePath); LanceFileReader reader = LanceFileReader.open(filePath, allocator); - Schema expectedSchema = - new Schema( - Arrays.asList( - Field.nullable("x", new ArrowType.Int(64, true)), - Field.nullable("y", new ArrowType.Utf8())), - null); + Schema expectedSchema = expectedSchemaWithColumnStats(); assertEquals(100, reader.numRows()); - assertEquals(expectedSchema, reader.schema()); + assertSchemaWithColumnStats(expectedSchema, reader.schema()); try (ArrowReader batches = reader.readAll(null, null, 100)) { assertTrue(batches.loadNextBatch()); @@ -120,7 +144,7 @@ void testBasicRead(@TempDir Path tempDir) throws Exception { } // Ok to call schema after close - assertEquals(expectedSchema, reader.schema()); + assertSchemaWithColumnStats(expectedSchema, reader.schema()); // close should be idempotent reader.close(); @@ -133,15 +157,10 @@ void testReadWithProjection(@TempDir Path tempDir) throws Exception { createSimpleFile(filePath); LanceFileReader reader = LanceFileReader.open(filePath, allocator); - Schema expectedSchema = - new Schema( - Arrays.asList( - Field.nullable("x", new ArrowType.Int(64, true)), - Field.nullable("y", new ArrowType.Utf8())), - null); + Schema expectedSchema = expectedSchemaWithColumnStats(); assertEquals(100, reader.numRows()); - assertEquals(expectedSchema, reader.schema()); + assertSchemaWithColumnStats(expectedSchema, reader.schema()); try (ArrowReader batches = reader.readAll(Collections.singletonList("x"), null, 100)) { assertTrue(batches.loadNextBatch()); @@ -161,9 +180,8 @@ void testReadWithProjection(@TempDir Path tempDir) throws Exception { assertFalse(batches.loadNextBatch()); } - try (ArrowReader batches = - reader.readAll( - null, Arrays.asList(Range.of(1, 11), Range.of(14, 19), Range.of(20, 21)), 100)) { + try (ArrowReader batches = reader.readAll( + null, Arrays.asList(Range.of(1, 11), Range.of(14, 19), Range.of(20, 21)), 100)) { assertTrue(batches.loadNextBatch()); VectorSchemaRoot batch = batches.getVectorSchemaRoot(); assertEquals(16, batch.getRowCount()); @@ -171,11 +189,9 @@ void testReadWithProjection(@TempDir Path tempDir) throws Exception { assertFalse(batches.loadNextBatch()); } - try (ArrowReader batches = - reader.readAll( - Collections.singletonList("x"), - Arrays.asList(Range.of(23, 25), Range.of(27, 29)), - 100)) { + try (ArrowReader batches = reader.readAll(Collections.singletonList("x"), + Arrays.asList(Range.of(23, 25), Range.of(27, 29)), + 100)) { assertTrue(batches.loadNextBatch()); VectorSchemaRoot batch = batches.getVectorSchemaRoot(); assertEquals(4, batch.getRowCount()); @@ -183,11 +199,9 @@ void testReadWithProjection(@TempDir Path tempDir) throws Exception { assertFalse(batches.loadNextBatch()); } - try (ArrowReader batches = - reader.readAll( - Collections.singletonList("y"), - Arrays.asList(Range.of(23, 25), Range.of(27, 29)), - 100)) { + try (ArrowReader batches = reader.readAll(Collections.singletonList("y"), + Arrays.asList(Range.of(23, 25), Range.of(27, 29)), + 100)) { assertTrue(batches.loadNextBatch()); VectorSchemaRoot batch = batches.getVectorSchemaRoot(); assertEquals(4, batch.getRowCount()); @@ -227,11 +241,8 @@ void testWriteWithStorage(@TempDir Path tempDir) throws IOException { try { LanceFileWriter.open(filePath, allocator, null, storageOptions); } catch (IllegalArgumentException e) { - assertTrue( - e.getMessage() - .contains( - "Unable to find object store prefix: no Azure account " - + "name in URI, and no storage account configured.")); + assertTrue(e.getMessage().contains("Unable to find object store prefix: no Azure account " + + "name in URI, and no storage account configured.")); } storageOptions.put("account_name", "some_account"); @@ -295,11 +306,9 @@ void testWriteNullSchemaMetadata(@TempDir Path tempDir) throws Exception { try (LanceFileWriter writer = LanceFileWriter.open(filePath, allocator, null)) { try (VectorSchemaRoot batch = createBatch(allocator)) { writer.write(batch); - Assertions.assertThrows( - Exception.class, + Assertions.assertThrows(Exception.class, () -> writer.addSchemaMetadata(Collections.singletonMap("someKey", null))); - Assertions.assertThrows( - Exception.class, + Assertions.assertThrows(Exception.class, () -> writer.addSchemaMetadata(Collections.singletonMap(null, "someValue"))); } } diff --git a/protos/table.proto b/protos/table.proto index e7de867e46e..4a903d76198 100644 --- a/protos/table.proto +++ b/protos/table.proto @@ -176,6 +176,12 @@ message Manifest { // appropriately. map config = 16; + // Column statistics metadata. + // + // If present, indicates that consolidated column statistics are available + // for this dataset version. + optional ColumnStats column_stats = 22; + // Metadata associated with the table. // // This is a key-value map that can be used to store arbitrary metadata @@ -228,6 +234,19 @@ message VersionAuxData { map metadata = 3; } +// Column statistics metadata. +// +// Stores information about consolidated column statistics for the dataset. +message ColumnStats { + // Path to the consolidated column statistics file, relative to the dataset root. + // For example: "_stats/column_stats.lance" + string path = 1; + // Version of the column statistics format. + // This allows for future evolution of the format (e.g., different directory + // structure, different schema, etc.) + uint32 version = 2; +} + // Metadata describing an index. message IndexMetadata { // Unique ID of an index. It is unique across all the dataset versions. diff --git a/protos/transaction.proto b/protos/transaction.proto index 17d96486736..4186119bbc6 100644 --- a/protos/transaction.proto +++ b/protos/transaction.proto @@ -51,7 +51,7 @@ message Transaction { repeated uint64 deleted_fragment_ids = 2; // The predicate that was evaluated // - // This may be used to determine whether the delete would have affected + // This may be used to determine whether the delete would have affected // files written by a concurrent transaction. string predicate = 3; } @@ -183,7 +183,7 @@ message Transaction { // if the target dataset is a branch, this is the branch name of the target dataset optional string branch_name = 5; } - + // Exact set of key hashes for conflict detection. // Used when the number of inserted rows is small. message ExactKeySetFilter { @@ -275,7 +275,7 @@ message Transaction { // If false, the new entries will be merged with the existing map. bool replace = 2; } - + // An operation that updates the table config, table metadata, schema metadata, // or field metadata. message UpdateConfig { @@ -283,6 +283,9 @@ message Transaction { UpdateMap table_metadata_updates = 7; UpdateMap schema_metadata_updates = 8; map field_metadata_updates = 9; + // Column statistics metadata update. + // If present, updates the column_stats field in the manifest. + optional lance.table.ColumnStats column_stats = 10; // Deprecated ------------------------------- map upsert_values = 1; diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 7c15aa9e0ba..74dd84588cb 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -5610,6 +5610,7 @@ def write_dataset( transaction_properties: Optional[Dict[str, str]] = None, initial_bases: Optional[List[DatasetBasePath]] = None, target_bases: Optional[List[str]] = None, + disable_column_stats: bool = False, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, ) -> LanceDataset: @@ -5862,6 +5863,7 @@ def write_dataset( "transaction_properties": merged_properties, "initial_bases": initial_bases, "target_bases": target_bases, + "disable_column_stats": disable_column_stats, } # Add storage_options_provider if created from namespace diff --git a/python/python/tests/compat/test_file_formats.py b/python/python/tests/compat/test_file_formats.py index f65c8611ff6..af5cedfe72f 100644 --- a/python/python/tests/compat/test_file_formats.py +++ b/python/python/tests/compat/test_file_formats.py @@ -99,7 +99,12 @@ def __init__(self, path: Path): def create(self): batch = build_basic_types() - lance.write_dataset(batch, self.path, data_storage_version="0.1") + lance.write_dataset( + batch, + self.path, + data_storage_version="0.1", + disable_column_stats=True, + ) def check_read(self): ds = lance.dataset(self.path) @@ -110,5 +115,9 @@ def check_write(self): ds = lance.dataset(self.path) ds.delete("true") lance.write_dataset( - build_basic_types(), self.path, data_storage_version="0.1", mode="append" + build_basic_types(), + self.path, + data_storage_version="0.1", + mode="append", + disable_column_stats=True, ) diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 4e0ef9f92c0..98da1133d1a 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -1454,16 +1454,18 @@ def test_config_update_auto_cleanup(tmp_path): def test_access_config(tmp_path): + # We assert only on the test key's presence/absence, not on len(ds.config()), + # because the manifest config may contain other keys (e.g. column stats). table = pa.Table.from_pydict({"a": range(100), "b": range(100)}) base_dir = tmp_path / "test" ds = lance.write_dataset(table, base_dir, mode="create") ds.update_config({"test_key": "test_value"}) config_value = ds.config()["test_key"] assert config_value == "test_value" - assert 1 == len(ds.config()) + assert "test_key" in ds.config() ds.delete_config_keys(["test_key"]) - assert 0 == len(ds.config()) + assert "test_key" not in ds.config() def test_auto_cleanup_invalid(tmp_path): diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index 1f23f3bac48..4f3b62641f2 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -33,7 +33,8 @@ def test_dataset_optimize(tmp_path: Path): assert metrics.files_removed == 10 assert metrics.files_added == 1 - assert dataset.version == 3 + # compact_files creates an extra commit for column stats metadata, so version is 4. + assert dataset.version == 4 def test_blob_compaction(tmp_path: Path): @@ -343,8 +344,9 @@ def test_dataset_distributed_optimize(tmp_path: Path): metrics = Compaction.commit(dataset, [result1]) assert metrics.fragments_removed == 2 assert metrics.fragments_added == 1 - # Compaction occurs in two transactions so it increments the version by 2. - assert dataset.version == 3 + # With default options (e.g. consolidate_column_stats), compaction uses multiple + # transactions (rewrite + column stats update), so version increments by 3. + assert dataset.version == 4 def test_migration_via_fragment_apis(tmp_path): diff --git a/python/src/dataset.rs b/python/src/dataset.rs index f180a5dd145..8bc852b7041 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3065,6 +3065,9 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult(options, "disable_column_stats")? { + p.disable_column_stats = disable_column_stats; + } if let Some(auto_cleanup) = get_dict_opt::>(options, "auto_cleanup_options")? { let mut auto_cleanup_params = AutoCleanupParams::default(); diff --git a/python/src/transaction.rs b/python/src/transaction.rs index 4f57bf3dd49..5509b2cf2db 100644 --- a/python/src/transaction.rs +++ b/python/src/transaction.rs @@ -320,6 +320,7 @@ impl FromPyObject<'_> for PyLance { table_metadata_updates, schema_metadata_updates, field_metadata_updates, + column_stats: None, }; Ok(Self(op)) } @@ -493,6 +494,7 @@ impl<'py> IntoPyObject<'py> for PyLance<&Operation> { ref table_metadata_updates, ref schema_metadata_updates, ref field_metadata_updates, + column_stats: _, } => { if let Ok(cls) = namespace.getattr("UpdateConfig") { let config = export_update_map(py, config_updates)?; diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs index 663454e001b..e006325b41d 100644 --- a/rust/lance-core/src/utils.rs +++ b/rust/lance-core/src/utils.rs @@ -17,3 +17,4 @@ pub mod tempfile; pub mod testing; pub mod tokio; pub mod tracing; +pub mod zone; diff --git a/rust/lance-core/src/utils/zone.rs b/rust/lance-core/src/utils/zone.rs new file mode 100644 index 00000000000..d1b53a76bc3 --- /dev/null +++ b/rust/lance-core/src/utils/zone.rs @@ -0,0 +1,491 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Zone-related utilities for Lance data structures + +use crate::Result; +use arrow_array::ArrayRef; + +/// Zone bound within a fragment +/// +/// # Example +/// +/// Suppose we have two fragments, each with 4 rows: +/// - Fragment 0: start = 0, length = 4 // covers rows 0, 1, 2, 3 +/// - Fragment 1: start = 0, length = 4 // covers rows 0, 1, 2, 3 +/// +/// After deleting rows 0 and 1 from fragment 0, and rows 1 and 2 from fragment 1: +/// - Fragment 0: start = 2, length = 2 // covers rows 2, 3 +/// - Fragment 1: start = 0, length = 4 // covers rows 0, 3 (with gaps) +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ZoneBound { + /// Fragment ID containing this zone + /// + /// For file-level operations (e.g., `FileZoneBuilder`), this is typically 0 + /// since the fragment ID is assigned during commit, not during file writing. + pub fragment_id: u64, + /// Start row offset within the fragment (local offset) + /// + /// To get the actual first row address, use `(fragment_id << 32) | start`. + pub start: u64, + /// Physical row count in the zone (includes deleted rows) + /// + /// Calculated as (last_row_offset - first_row_offset + 1) + pub length: usize, +} + +/// Trait for processing data in zones and computing zone-level statistics. +/// +/// This trait provides a common interface for zone-based processing used in +/// both scalar indexing (ZoneMap) and file-level column statistics. +/// +/// Implementors accumulate statistics as chunks of data are processed, then +/// emit final statistics when a zone is complete. +pub trait ZoneProcessor { + /// The type of statistics produced for each zone + type ZoneStatistics; + + /// Process a slice of values that belongs to the current zone. + /// + /// This method is called repeatedly with chunks of data. Implementations + /// should accumulate statistics incrementally. + fn process_chunk(&mut self, values: &ArrayRef) -> Result<()>; + + /// Emit statistics when the zone is full or the fragment changes. + /// + /// The provided `bound` describes the row range covered by this zone. + /// Implementations should automatically reset internal state after emitting + /// statistics, preparing for the next zone. + fn finish_zone(&mut self, bound: ZoneBound) -> Result; +} + +/// Builds zones from batches during file writing. +/// +/// `FileZoneBuilder` manages zone boundaries and statistics collection for file-level +/// operations. It processes data synchronously in batches without requiring row addresses, +/// making it ideal for writing new data files. +/// +pub struct FileZoneBuilder { + processor: P, + zone_size: u64, + current_zone_rows: u64, + zone_start: u64, + zones: Vec, +} + +impl FileZoneBuilder

{ + pub fn new(processor: P, zone_size: u64) -> Result { + if zone_size == 0 { + return Err(crate::Error::invalid_input( + "zone size must be greater than zero", + snafu::location!(), + )); + } + Ok(Self { + processor, + zone_size, + current_zone_rows: 0, + zone_start: 0, + zones: Vec::new(), + }) + } + + /// Processes a chunk of data, automatically flushing zones when full. + /// + /// This method accumulates data into the current zone and automatically flushes + /// when the zone reaches capacity. If a chunk exceeds the zone size, it is split + /// across multiple zones. The underlying processor's `process_chunk` is called + /// for statistics computation. + pub fn process_chunk(&mut self, array: &ArrayRef) -> Result<()> { + let total_rows = array.len() as u64; + let mut offset = 0usize; + + while offset < total_rows as usize { + // Calculate how many rows we can add to the current zone + let remaining_capacity = self.zone_size - self.current_zone_rows; + let rows_to_process = (total_rows as usize - offset).min(remaining_capacity as usize); + + // Process the slice + let slice = array.slice(offset, rows_to_process); + self.processor.process_chunk(&slice)?; + self.current_zone_rows += rows_to_process as u64; + offset += rows_to_process; + + // If zone is full, flush it and start a new one + if self.current_zone_rows >= self.zone_size { + self.flush_zone()?; + } + } + + Ok(()) + } + + /// Flushes the current zone if it contains any data. + /// + /// Creates a `ZoneBound` with the current zone's position and length, + /// calls the processor's `finish_zone` to compute final statistics + fn flush_zone(&mut self) -> Result<()> { + if self.current_zone_rows > 0 { + let bound = ZoneBound { + fragment_id: 0, // Placeholder; actual fragment ID assigned during commit + start: self.zone_start, + length: self.current_zone_rows as usize, + }; + let stats = self.processor.finish_zone(bound)?; + self.zones.push(stats); + + self.zone_start += self.current_zone_rows; + self.current_zone_rows = 0; + } + Ok(()) + } + + /// Finalizes processing and returns all collected zone statistics. + /// + /// Flushes any remaining partial zone and consumes the builder, + /// returning ownership of all zone statistics collected during processing. + pub fn finalize(mut self) -> Result> { + self.flush_zone()?; + Ok(self.zones) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{ArrayRef, Int32Array}; + use std::sync::Arc; + + #[derive(Debug, Clone, PartialEq)] + struct MockStats { + sum: i32, + bound: ZoneBound, + } + + #[derive(Debug)] + struct MockProcessor { + current_sum: i32, + } + + impl MockProcessor { + fn new() -> Self { + Self { current_sum: 0 } + } + } + + impl ZoneProcessor for MockProcessor { + type ZoneStatistics = MockStats; + + fn process_chunk(&mut self, values: &ArrayRef) -> Result<()> { + let arr = values.as_any().downcast_ref::().unwrap(); + self.current_sum += arr.iter().map(|v| v.unwrap_or(0)).sum::(); + Ok(()) + } + + fn finish_zone(&mut self, bound: ZoneBound) -> Result { + let stats = MockStats { + sum: self.current_sum, + bound, + }; + // Auto-reset for next zone + self.current_sum = 0; + Ok(stats) + } + } + + fn array_from_vec(values: Vec) -> ArrayRef { + Arc::new(Int32Array::from(values)) + } + + #[test] + fn test_exact_zone_size() { + // Data that exactly fills one zone + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 4).unwrap(); + + let arr = array_from_vec(vec![1, 2, 3, 4]); + builder.process_chunk(&arr).unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 1); + assert_eq!(zones[0].sum, 10); // 1+2+3+4 + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[0].bound.length, 4); + } + + #[test] + fn test_multiple_full_zones() { + // Data that fills multiple zones exactly + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 3).unwrap(); + + // First zone: 3 rows + builder + .process_chunk(&array_from_vec(vec![1, 2, 3])) + .unwrap(); + + // Second zone: 3 rows + builder + .process_chunk(&array_from_vec(vec![4, 5, 6])) + .unwrap(); + + // Third zone: 3 rows + builder + .process_chunk(&array_from_vec(vec![7, 8, 9])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 3); + assert_eq!(zones[0].sum, 6); // 1+2+3 + assert_eq!(zones[1].sum, 15); // 4+5+6 + assert_eq!(zones[2].sum, 24); // 7+8+9 + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[1].bound.start, 3); + assert_eq!(zones[2].bound.start, 6); + } + + #[test] + fn test_partial_final_zone() { + // Data that doesn't fill the last zone completely + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 4).unwrap(); + + // First zone: exactly 4 rows + builder + .process_chunk(&array_from_vec(vec![1, 2, 3, 4])) + .unwrap(); + + // Second zone: only 2 rows (partial) + builder.process_chunk(&array_from_vec(vec![5, 6])).unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 2); + assert_eq!(zones[0].sum, 10); // 1+2+3+4 + assert_eq!(zones[1].sum, 11); // 5+6 + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[0].bound.length, 4); + assert_eq!(zones[1].bound.start, 4); + assert_eq!(zones[1].bound.length, 2); + } + + #[test] + fn test_just_under_zone_size() { + // Data that is just one row short of zone size + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 5).unwrap(); + + builder + .process_chunk(&array_from_vec(vec![1, 2, 3, 4])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 1); + assert_eq!(zones[0].sum, 10); // 1+2+3+4 + assert_eq!(zones[0].bound.length, 4); + } + + #[test] + fn test_just_over_zone_size() { + // Data that exceeds zone size by a few rows + // Chunk should be split across multiple zones + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 4).unwrap(); + + // 6 rows in one chunk: should create two zones [1,2,3,4] and [5,6] + builder + .process_chunk(&array_from_vec(vec![1, 2, 3, 4, 5, 6])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 2); + assert_eq!(zones[0].sum, 10); // 1+2+3+4 + assert_eq!(zones[0].bound.length, 4); + assert_eq!(zones[1].sum, 11); // 5+6 + assert_eq!(zones[1].bound.start, 4); + assert_eq!(zones[1].bound.length, 2); + } + + #[test] + fn test_multiple_chunks_exceeding_zone() { + // Multiple small chunks that together exceed zone size + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 5).unwrap(); + + // Chunk 1: 2 rows + builder.process_chunk(&array_from_vec(vec![1, 2])).unwrap(); + + // Chunk 2: 2 rows (total: 4, still under) + builder.process_chunk(&array_from_vec(vec![3, 4])).unwrap(); + + // Chunk 3: 2 rows (total: 6, exceeds zone size) + builder.process_chunk(&array_from_vec(vec![5, 6])).unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 2); + assert_eq!(zones[0].sum, 15); // 1+2+3+4+5 + assert_eq!(zones[0].bound.length, 5); + assert_eq!(zones[1].sum, 6); // Just row 6 + assert_eq!(zones[1].bound.start, 5); + assert_eq!(zones[1].bound.length, 1); + } + + #[test] + fn test_zone_size_one() { + // With zone size = 1, each row triggers a flush + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 1).unwrap(); + + // Process one row at a time + builder.process_chunk(&array_from_vec(vec![10])).unwrap(); + builder.process_chunk(&array_from_vec(vec![20])).unwrap(); + builder.process_chunk(&array_from_vec(vec![30])).unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 3); + assert_eq!(zones[0].sum, 10); + assert_eq!(zones[1].sum, 20); + assert_eq!(zones[2].sum, 30); + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[1].bound.start, 1); + assert_eq!(zones[2].bound.start, 2); + } + + #[test] + fn test_large_zone_size() { + // Zone size larger than total data - all data in one zone + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 100).unwrap(); + + builder.process_chunk(&array_from_vec(vec![1; 10])).unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 1); + assert_eq!(zones[0].sum, 10); // 10 ones + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[0].bound.length, 10); + } + + #[test] + fn test_empty_array() { + // Empty arrays should be handled gracefully + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 4).unwrap(); + + builder.process_chunk(&array_from_vec(vec![])).unwrap(); + + // Add some real data + builder + .process_chunk(&array_from_vec(vec![1, 2, 3, 4])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 1); + assert_eq!(zones[0].sum, 10); + } + + #[test] + fn test_zone_boundaries_sequential() { + // Verify zone start positions are sequential + // Process in chunks that don't exceed zone size + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 3).unwrap(); + + // Process in chunks of 3 (exactly zone size) + builder + .process_chunk(&array_from_vec(vec![1, 2, 3])) + .unwrap(); + + builder + .process_chunk(&array_from_vec(vec![4, 5, 6])) + .unwrap(); + + // Last chunk: 2 rows (partial) + builder.process_chunk(&array_from_vec(vec![7, 8])).unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 3); + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[1].bound.start, 3); + assert_eq!(zones[2].bound.start, 6); + assert_eq!(zones[0].bound.length, 3); + assert_eq!(zones[1].bound.length, 3); + assert_eq!(zones[2].bound.length, 2); // Last partial zone + } + + #[test] + fn test_rejects_zero_zone_size() { + let processor = MockProcessor::new(); + let result = FileZoneBuilder::new(processor, 0); + assert!(result.is_err()); + let err_msg = format!("{}", result.err().unwrap()); + assert!(err_msg.contains("zone size must be greater than zero")); + } + + #[test] + fn test_fragment_id_placeholder() { + // Verify fragment_id is set to 0 (placeholder) for file-level operations + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 3).unwrap(); + + builder + .process_chunk(&array_from_vec(vec![1, 2, 3])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones[0].bound.fragment_id, 0); + } + + #[test] + fn test_edge_case_one_row_short() { + // Zone size = 5, data = 4 rows (exactly one short) + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 5).unwrap(); + + builder + .process_chunk(&array_from_vec(vec![1, 2, 3, 4])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 1); + assert_eq!(zones[0].bound.length, 4); + } + + #[test] + fn test_edge_case_one_row_over() { + // Zone size = 4, data = 5 rows (exactly one over) + // Should create two zones: [1,2,3,4] and [5] + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 4).unwrap(); + + builder + .process_chunk(&array_from_vec(vec![1, 2, 3, 4, 5])) + .unwrap(); + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 2); + assert_eq!(zones[0].sum, 10); // 1+2+3+4 + assert_eq!(zones[0].bound.length, 4); + assert_eq!(zones[1].sum, 5); // Just row 5 + assert_eq!(zones[1].bound.start, 4); + assert_eq!(zones[1].bound.length, 1); + } + + #[test] + fn test_large_number_of_small_chunks() { + // Many small chunks that accumulate + let processor = MockProcessor::new(); + let mut builder = FileZoneBuilder::new(processor, 10).unwrap(); + + // Add 20 chunks of 1 row each + for i in 1..=20 { + builder.process_chunk(&array_from_vec(vec![i])).unwrap(); + } + + let zones = builder.finalize().unwrap(); + assert_eq!(zones.len(), 2); + assert_eq!(zones[0].sum, 55); // Sum of 1..=10 + assert_eq!(zones[1].sum, 155); // Sum of 11..=20 + assert_eq!(zones[0].bound.start, 0); + assert_eq!(zones[1].bound.start, 10); + } +} diff --git a/rust/lance-file/Cargo.toml b/rust/lance-file/Cargo.toml index abf3ea07bf1..fc81e069569 100644 --- a/rust/lance-file/Cargo.toml +++ b/rust/lance-file/Cargo.toml @@ -20,6 +20,7 @@ arrow-arith.workspace = true arrow-array.workspace = true arrow-buffer.workspace = true arrow-data.workspace = true +arrow-ipc.workspace = true arrow-schema.workspace = true arrow-select.workspace = true async-recursion.workspace = true @@ -27,6 +28,8 @@ async-trait.workspace = true byteorder.workspace = true bytes.workspace = true datafusion-common.workspace = true +datafusion-expr.workspace = true +datafusion.workspace = true deepsize.workspace = true futures.workspace = true log.workspace = true diff --git a/rust/lance-file/src/reader.rs b/rust/lance-file/src/reader.rs index 4c48edf5e9e..bf66a4c3c95 100644 --- a/rust/lance-file/src/reader.rs +++ b/rust/lance-file/src/reader.rs @@ -10,6 +10,7 @@ use std::{ }; use arrow_array::RecordBatchReader; +use arrow_ipc; use arrow_schema::Schema as ArrowSchema; use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; use bytes::{Bytes, BytesMut}; @@ -47,7 +48,10 @@ use crate::{ datatypes::{Fields, FieldsWithMeta}, format::{pb, pbfile, MAGIC, MAJOR_VERSION, MINOR_VERSION}, io::LanceEncodingsIo, - writer::PAGE_BUFFER_ALIGNMENT, + writer::{ + COLUMN_STATS_BUFFER_INDEX_KEY, COLUMN_STATS_VERSION, COLUMN_STATS_VERSION_KEY, + PAGE_BUFFER_ALIGNMENT, + }, }; /// Default chunk size for reading large pages (8MiB) @@ -767,7 +771,14 @@ impl FileReader { )); } if *column_index >= metadata.column_infos.len() as u32 { - return Err(Error::invalid_input(format!("The projection specified the column index {} but there are only {} columns in the file", column_index, metadata.column_infos.len()), location!())); + return Err(Error::invalid_input( + format!( + "The projection specified the column index {} but there are only {} columns in the file", + column_index, + metadata.column_infos.len() + ), + location!(), + )); } } Ok(()) @@ -1400,6 +1411,110 @@ impl FileReader { pub fn schema(&self) -> &Arc { &self.metadata.file_schema } + + /// Check if the file contains column statistics. + /// + /// Column statistics are stored in the schema metadata. If the metadata + /// contains the buffer index key, the file has column statistics that can + /// be read with `read_column_stats()`. + /// + pub fn has_column_stats(&self) -> bool { + self.metadata + .file_schema + .metadata + .contains_key(COLUMN_STATS_BUFFER_INDEX_KEY) + } + + /// Read column statistics from the file. + /// + /// Column statistics are stored as a global buffer containing an Arrow IPC + /// encoded RecordBatch. The batch uses a **columnar layout**: one column per + /// dataset column (each of type `ColumnZoneStatistics` struct), one row per zone. + /// See details in writer.rs + /// + pub async fn read_column_stats(&self) -> Result> { + // Check if column stats exist + let Some(buffer_index_str) = self + .metadata + .file_schema + .metadata + .get(COLUMN_STATS_BUFFER_INDEX_KEY) + else { + return Ok(None); + }; + + // Check version for forward compatibility + let version = self + .metadata + .file_schema + .metadata + .get(COLUMN_STATS_VERSION_KEY) + .and_then(|v| v.parse::().ok()) + .unwrap_or(0); + + // Skip stats from newer versions for forward compatibility + if version > COLUMN_STATS_VERSION { + log::warn!( + "Column stats version {} is newer than supported version {}. \ + Skipping column stats for forward compatibility.", + version, + COLUMN_STATS_VERSION + ); + return Ok(None); + } + + // Parse the buffer index + let buffer_index: usize = buffer_index_str.parse().map_err(|_| Error::Internal { + message: format!( + "Invalid column stats buffer index in metadata: {}", + buffer_index_str + ), + location: location!(), + })?; + + // Check bounds + if buffer_index >= self.metadata.file_buffers.len() { + return Err(Error::Internal { + message: format!( + "Column stats buffer index {} out of bounds (only {} buffers)", + buffer_index, + self.metadata.file_buffers.len() + ), + location: location!(), + }); + } + + // Read the buffer + let buffer_descriptor = &self.metadata.file_buffers[buffer_index]; + let stats_bytes_vec = self + .scheduler + .submit_request( + vec![ + buffer_descriptor.position..buffer_descriptor.position + buffer_descriptor.size, + ], + 0, + ) + .await?; + + // The buffer is returned as a single chunk since we requested one range + let stats_bytes = stats_bytes_vec.into_iter().next().unwrap(); + + // Decode Arrow IPC format + let cursor = Cursor::new(stats_bytes.as_ref()); + let mut reader = + arrow_ipc::reader::FileReader::try_new(cursor, None).map_err(|e| Error::Internal { + message: format!("Failed to decode column stats Arrow IPC: {}", e), + location: location!(), + })?; + + // Read the single batch + let batch = reader.next().transpose().map_err(|e| Error::Internal { + message: format!("Failed to read column stats batch: {}", e), + location: location!(), + })?; + + Ok(batch) + } } /// Inspects a page and returns a String describing the page's encoding @@ -2274,4 +2389,213 @@ pub mod tests { let buf = file_reader.read_global_buffer(1).await.unwrap(); assert_eq!(buf, test_bytes); } + + #[tokio::test] + async fn test_column_stats_reading() { + use arrow_array::{Int32Array, RecordBatch}; + use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use std::sync::Arc; + + let fs = FsFixture::default(); + + // Create a schema with metadata indicating column stats + let lance_schema = + lance_core::datatypes::Schema::try_from(&ArrowSchema::new(vec![ArrowField::new( + "data", + DataType::Int32, + false, + )])) + .unwrap(); + + let mut file_writer = FileWriter::try_new( + fs.object_store.create(&fs.tmp_path).await.unwrap(), + lance_schema.clone(), + FileWriterOptions { + disable_column_stats: false, // Stats enabled + ..Default::default() + }, + ) + .unwrap(); + + // Write some data (this will trigger column stats generation) + let data_batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ArrowField::new( + "data", + DataType::Int32, + false, + )])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + ) + .unwrap(); + + file_writer.write_batch(&data_batch).await.unwrap(); + file_writer.finish().await.unwrap(); + + // Read the file and check column stats + let file_scheduler = fs + .scheduler + .open_file(&fs.tmp_path, &CachedFileSize::unknown()) + .await + .unwrap(); + let file_reader = FileReader::try_open( + file_scheduler.clone(), + None, + Arc::::default(), + &test_cache(), + FileReaderOptions::default(), + ) + .await + .unwrap(); + + // Check that column stats exist + assert!( + file_reader.has_column_stats(), + "File should have column stats" + ); + + // Read the column stats + let stats_batch = file_reader + .read_column_stats() + .await + .unwrap() + .expect("Expected column stats to be present"); + + // Columnar layout: one column per dataset column, each of type ColumnZoneStatistics struct. + // One row per zone. Schema has one column "data" (Struct: min, max, null_count, nan_count, bound). + assert_eq!(stats_batch.num_columns(), 1); + assert_eq!( + stats_batch.schema().field(0).name(), + "data", + "Single column should be named after the dataset column" + ); + + assert!( + stats_batch.num_rows() > 0, + "Should have at least one row (one per zone)" + ); + + let data_column = stats_batch.column_by_name("data").unwrap(); + let data_struct = data_column + .as_any() + .downcast_ref::() + .unwrap(); + + use arrow_array::{UInt32Array, UInt64Array}; + let min_val: i32 = data_struct + .column_by_name("min") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + let max_val: i32 = data_struct + .column_by_name("max") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + let null_counts = data_struct + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let nan_counts = data_struct + .column_by_name("nan_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let bound_column = data_struct.column_by_name("bound").unwrap(); + let bound_struct = bound_column + .as_any() + .downcast_ref::() + .unwrap(); + let zone_starts = bound_struct + .column_by_name("start") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let zone_lengths = bound_struct + .column_by_name("length") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(zone_starts.value(0), 0, "Zone should start at row 0"); + assert_eq!(zone_lengths.value(0), 5, "Zone should have 5 rows"); + assert_eq!(null_counts.value(0), 0, "Should have 0 nulls"); + assert_eq!(nan_counts.value(0), 0, "Should have 0 NaNs (Int32 type)"); + assert_eq!(min_val, 1, "Min value should be 1"); + assert_eq!(max_val, 5, "Max value should be 5"); + } + + #[tokio::test] + async fn test_no_column_stats() { + use arrow_array::{Int32Array, RecordBatch}; + use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use std::sync::Arc; + + let fs = FsFixture::default(); + + let lance_schema = + lance_core::datatypes::Schema::try_from(&ArrowSchema::new(vec![ArrowField::new( + "foo", + DataType::Int32, + false, + )])) + .unwrap(); + + let mut file_writer = FileWriter::try_new( + fs.object_store.create(&fs.tmp_path).await.unwrap(), + lance_schema.clone(), + FileWriterOptions { + disable_column_stats: true, // Explicitly disable + ..Default::default() + }, + ) + .unwrap(); + + // Write some data + let data_batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ArrowField::new( + "foo", + DataType::Int32, + false, + )])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + file_writer.write_batch(&data_batch).await.unwrap(); + file_writer.finish().await.unwrap(); + + // Read the file + let file_scheduler = fs + .scheduler + .open_file(&fs.tmp_path, &CachedFileSize::unknown()) + .await + .unwrap(); + let file_reader = FileReader::try_open( + file_scheduler.clone(), + None, + Arc::::default(), + &test_cache(), + FileReaderOptions::default(), + ) + .await + .unwrap(); + + // Verify no column stats + assert!( + !file_reader.has_column_stats(), + "File should not have column stats" + ); + + let stats = file_reader.read_column_stats().await.unwrap(); + assert!(stats.is_none(), "Should return None when no stats present"); + } } diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index ea753f463f9..7defae6367a 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -6,7 +6,9 @@ use std::collections::HashMap; use std::sync::atomic::AtomicBool; use std::sync::Arc; -use arrow_array::RecordBatch; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_schema::{DataType, Field as ArrowField, Fields, Schema as ArrowSchema}; +use lance_core::utils::zone::FileZoneBuilder; use arrow_data::ArrayData; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -33,6 +35,8 @@ use snafu::location; use tokio::io::AsyncWriteExt; use tracing::instrument; +use datafusion_common::ScalarValue; + use crate::datatypes::FieldsWithMeta; use crate::format::pb; use crate::format::pbfile; @@ -50,6 +54,13 @@ const PAD_BUFFER: [u8; PAGE_BUFFER_ALIGNMENT] = [72; PAGE_BUFFER_ALIGNMENT]; const MAX_PAGE_BYTES: usize = 32 * 1024 * 1024; const ENV_LANCE_FILE_WRITER_MAX_PAGE_BYTES: &str = "LANCE_FILE_WRITER_MAX_PAGE_BYTES"; +/// Metadata key for column statistics buffer index +pub const COLUMN_STATS_BUFFER_INDEX_KEY: &str = "lance:column_stats:buffer_index"; +/// Metadata key for column statistics version +pub const COLUMN_STATS_VERSION_KEY: &str = "lance:column_stats:version"; +/// Current version of column statistics format +pub const COLUMN_STATS_VERSION: u32 = 1; + #[derive(Debug, Clone, Default)] pub struct FileWriterOptions { /// How many bytes to use for buffering column data @@ -98,6 +109,11 @@ pub struct FileWriterOptions { /// versions may have more efficient encodings. However, newer format versions will /// require more up-to-date readers to read the data. pub format_version: Option, + + /// If true, disable column statistics generation when writing data files. + /// Column statistics can be used for planning optimization and filtering. + /// Default is false (column stats are enabled by default). + pub disable_column_stats: bool, } // Total in-memory budget for buffering serialized page metadata before flushing @@ -181,6 +197,112 @@ impl PageMetadataSpill { Ok(()) } } +/// Column statistics for a single zone +#[derive(Debug, Clone)] +struct ColumnZoneStatistics { + min: ScalarValue, + max: ScalarValue, + null_count: u32, + nan_count: u32, + // TODO: add more stats like mean, avg_len and dist_cnt + bound: ZoneBound, +} + +/// Statistics processor for a single column that implements ZoneProcessor trait +struct ColumnStatisticsProcessor { + data_type: DataType, + min: MinAccumulator, + max: MaxAccumulator, + null_count: u32, + nan_count: u32, +} + +impl ColumnStatisticsProcessor { + fn new(data_type: DataType) -> Result { + // TODO: Upstream DataFusion accumulators does not handle many nested types + let min = MinAccumulator::try_new(&data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + let max = MaxAccumulator::try_new(&data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + Ok(Self { + data_type, + min, + max, + null_count: 0, + nan_count: 0, + }) + } + + fn count_nans(array: &ArrayRef) -> u32 { + match array.data_type() { + DataType::Float16 => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + array.values().iter().filter(|&&x| x.is_nan()).count() as u32 + } + DataType::Float32 => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + array.values().iter().filter(|&&x| x.is_nan()).count() as u32 + } + DataType::Float64 => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + array.values().iter().filter(|&&x| x.is_nan()).count() as u32 + } + _ => 0, + } + } +} + +/// Implement ZoneProcessor trait for ColumnStatisticsProcessor +impl ZoneProcessor for ColumnStatisticsProcessor { + type ZoneStatistics = ColumnZoneStatistics; + + fn process_chunk(&mut self, array: &ArrayRef) -> Result<()> { + self.null_count += array.null_count() as u32; + self.nan_count += Self::count_nans(array); + self.min + .update_batch(std::slice::from_ref(array)) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + self.max + .update_batch(std::slice::from_ref(array)) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + Ok(()) + } + + fn finish_zone(&mut self, bound: ZoneBound) -> Result { + Ok(ColumnZoneStatistics { + min: self + .min + .evaluate() + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?, + max: self + .max + .evaluate() + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?, + null_count: self.null_count, + nan_count: self.nan_count, + bound, + }) + } + + fn reset(&mut self) -> Result<()> { + self.min = MinAccumulator::try_new(&self.data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + self.max = MaxAccumulator::try_new(&self.data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + self.null_count = 0; + self.nan_count = 0; + Ok(()) + } +} fn decode_spilled_chunk(data: &Bytes) -> Result> { let mut pages = Vec::new(); @@ -203,6 +325,46 @@ enum PageSpillState { Active(PageMetadataSpill), } +/// Convert ScalarValue to string, extracting only the value without type prefix +/// E.g., Int32(42) -> "42", Float64(3.14) -> "3.14", Utf8("hello") -> "hello" +fn scalar_value_to_string(value: &ScalarValue) -> String { + let debug_str = format!("{:?}", value); + + // For string types, extract the quoted value + if debug_str.starts_with("Utf8(") || debug_str.starts_with("LargeUtf8(") { + // Extract content between quotes: Utf8("hello") -> "hello" + if let Some(start) = debug_str.find('"') { + if let Some(end) = debug_str.rfind('"') { + if end > start { + return debug_str[start + 1..end].to_string(); + } + } + } + } + + // For numeric types, extract content between parentheses + // Int32(42) -> "42", Float64(3.14) -> "3.14" + if let Some(start) = debug_str.find('(') { + if let Some(end) = debug_str.rfind(')') { + return debug_str[start + 1..end].to_string(); + } + } + + // Fallback: return the whole debug string (shouldn't happen for supported types) + debug_str +} + +/// Zone size for column statistics (1 million rows per zone) +const COLUMN_STATS_ZONE_SIZE: u64 = 1_000_000; +// Column statistics types and processors are defined in the column_stats submodule +mod column_stats; +use column_stats::{ + create_column_zone_statistics_struct_type, ColumnStatisticsProcessor, COLUMN_STATS_ZONE_SIZE, +}; + +// Re-export for use in consolidation +pub use column_stats::create_consolidated_zone_struct_type; + pub struct FileWriter { writer: ObjectWriter, schema: Option, @@ -215,6 +377,8 @@ pub struct FileWriter { schema_metadata: HashMap, options: FileWriterOptions, page_spill: Option, + /// Column statistics processors (one per column; None for types that don't support min/max, e.g. List) + column_stats_processors: Option>>>, } fn initial_column_metadata() -> pbfile::ColumnMetadata { @@ -256,7 +420,9 @@ impl FileWriter { ) .is_ok() { - warn!("You have requested an unstable format version. Files written with this format version may not be readable in the future! This is a development feature and should only be used for experimentation and never for production data."); + warn!( + "You have requested an unstable format version. Files written with this format version may not be readable in the future! This is a development feature and should only be used for experimentation and never for production data." + ); } } Self { @@ -271,6 +437,7 @@ impl FileWriter { schema_metadata: HashMap::new(), page_spill: None, options, + column_stats_processors: None, } } @@ -392,7 +559,13 @@ impl FileWriter { fn verify_field_nullability(arr: &ArrayData, field: &Field) -> Result<()> { if !field.nullable && arr.null_count() > 0 { - return Err(Error::invalid_input(format!("The field `{}` contained null values even though the field is marked non-null in the schema", field.name), location!())); + return Err(Error::invalid_input( + format!( + "The field `{}` contained null values even though the field is marked non-null in the schema", + field.name + ), + location!(), + )); } for (child_field, child_arr) in field.children.iter().zip(arr.child_data()) { @@ -459,6 +632,22 @@ impl FileWriter { self.schema_metadata .extend(std::mem::take(&mut schema.metadata)); self.schema = Some(schema); + + // Initialize column statistics processors if enabled; skip columns for which DataFusion + // min/max is not supported (try_new fails), so we stay in sync with DataFusion upgrades. + if !self.options.disable_column_stats { + let mut processors = Vec::new(); + for field in &self.schema.as_ref().unwrap().fields { + let data_type = field.data_type().clone(); + let opt_processor = match ColumnStatisticsProcessor::new(data_type) { + Ok(processor) => Some(FileZoneBuilder::new(processor, COLUMN_STATS_ZONE_SIZE)?), + Err(_) => None, + }; + processors.push(opt_processor); + } + self.column_stats_processors = Some(processors); + } + Ok(()) } @@ -553,6 +742,25 @@ impl FileWriter { self.write_pages(encoding_tasks).await?; + // TODO: Reuse the other read path so that we dont need to do the calculation twice + // Accumulate column statistics if enabled (skip columns with None processor, set at init from try_new). + if let Some(ref mut processors) = self.column_stats_processors { + for (field, opt_processor) in self + .schema + .as_ref() + .unwrap() + .fields + .iter() + .zip(processors.iter_mut()) + { + if let (Some(processor), Some(array)) = + (opt_processor, batch.column_by_name(&field.name)) + { + processor.process_chunk(array)?; + } + } + } + Ok(()) } @@ -777,6 +985,10 @@ impl FileWriter { } // 3. write global buffers (we write the schema here) + // Build the column statistics if enabled + if !self.options.disable_column_stats { + self.build_column_statistics().await?; + } let global_buffer_offsets = self.write_global_buffers().await?; let num_global_buffers = global_buffer_offsets.len() as u32; @@ -819,6 +1031,220 @@ impl FileWriter { self.writer.abort().await; } + /// Build column statistics for the written data. + /// + /// Statistics are serialized as an Arrow RecordBatch and stored in a global buffer. + /// This format is forward/backward compatible - new statistics fields can be added + /// without breaking older readers. + /// + async fn build_column_statistics(&mut self) -> Result<()> { + let Some(processors) = self.column_stats_processors.take() else { + return Ok(()); // Statistics not enabled + }; + + let schema = self.schema.as_ref().ok_or_else(|| { + Error::invalid_input( + "Cannot build statistics: schema not initialized", + location!(), + ) + })?; + + // Columnar layout: one column per dataset column, each containing ColumnZoneStatistics structs + // Rows = zones (one row per zone) + // + // Example layout for a dataset with 2 columns ("id", "price") and 2 zones: + // ┌─────────────────────────────────────┬─────────────────────────────────────┐ + // │ id (ColumnZoneStatistics) │ price (ColumnZoneStatistics) │ + // ├─────────────────────────────────────┼─────────────────────────────────────┤ + // │ {min:"1", max:"1000000", ...} │ {min:"9.99", max:"99.99", ...} │ + // │ {min:"1000001", max:"2000000", ...} │ {min:"10.50", max:"100.50", ...} │ + // └─────────────────────────────────────┴─────────────────────────────────────┘ + // + // Each row represents one zone. Each column contains ColumnZoneStatistics for that dataset column. + + use arrow_array::StructArray; + + // Collect zones per column (name, zones). Arrow type is looked up from schema by name when writing. + let mut column_zones: Vec<(String, Vec)> = Vec::new(); + let mut num_zones = None; + + for (field, opt_processor) in schema.fields.iter().zip(processors.into_iter()) { + let Some(processor) = opt_processor else { + continue; // Unsupported type (e.g. List), skip column stats + }; + let zones = processor.finalize()?; + + // Skip columns with no zones + if zones.is_empty() { + continue; + } + + // All columns should have the same number of zones in a single file + if let Some(expected_zones) = num_zones { + if zones.len() != expected_zones { + return Err(Error::Internal { + message: format!( + "Column statistics mismatch: column '{}' has {} zones but expected {}", + field.name, + zones.len(), + expected_zones + ), + location: location!(), + }); + } + } else { + num_zones = Some(zones.len()); + } + + column_zones.push((field.name.clone(), zones)); + } + + // If no statistics were collected, return early + if column_zones.is_empty() { + return Ok(()); + } + + let num_zones = num_zones.unwrap(); + + // Build struct arrays for each column (min/max use column's actual type) + let mut column_arrays: Vec = Vec::new(); + let mut schema_fields: Vec = Vec::new(); + + for (col_name, zones) in &column_zones { + let field = schema.field(col_name).ok_or_else(|| Error::Internal { + message: format!( + "Column '{}' not found in schema when building column stats", + col_name + ), + location: location!(), + })?; + let data_type = field.data_type(); + + // Build min/max arrays from zone scalars; array type is inferred from ScalarValue + let min_array = ScalarValue::iter_to_array(zones.iter().map(|z| z.min.clone())) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + let max_array = ScalarValue::iter_to_array(zones.iter().map(|z| z.max.clone())) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + + let mut null_counts = Vec::with_capacity(num_zones); + let mut nan_counts = Vec::with_capacity(num_zones); + let mut fragment_ids = Vec::with_capacity(num_zones); + let mut zone_starts = Vec::with_capacity(num_zones); + let mut zone_lengths = Vec::with_capacity(num_zones); + + for zone in zones { + null_counts.push(zone.null_count); + nan_counts.push(zone.nan_count); + fragment_ids.push(zone.bound.fragment_id); + zone_starts.push(zone.bound.start); + zone_lengths.push(zone.bound.length as u64); + } + + let column_zone_stats_type = create_column_zone_statistics_struct_type(&data_type); + + // Build ZoneBound struct array + let zone_bound_struct = StructArray::from(vec![ + ( + Arc::new(ArrowField::new("fragment_id", DataType::UInt64, false)), + Arc::new(arrow_array::UInt64Array::from(fragment_ids)) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("start", DataType::UInt64, false)), + Arc::new(arrow_array::UInt64Array::from(zone_starts)) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("length", DataType::UInt64, false)), + Arc::new(arrow_array::UInt64Array::from(zone_lengths)) as ArrayRef, + ), + ]); + + // Build ColumnZoneStatistics struct array (min/max are typed, nullable) + let column_stats_struct = StructArray::from(vec![ + ( + Arc::new(ArrowField::new("min", data_type.clone(), true)), + min_array, + ), + ( + Arc::new(ArrowField::new("max", data_type.clone(), true)), + max_array, + ), + ( + Arc::new(ArrowField::new("null_count", DataType::UInt32, false)), + Arc::new(arrow_array::UInt32Array::from(null_counts)) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("nan_count", DataType::UInt32, false)), + Arc::new(arrow_array::UInt32Array::from(nan_counts)) as ArrayRef, + ), + ( + Arc::new(ArrowField::new( + "bound", + DataType::Struct(Fields::from(vec![ + ArrowField::new("fragment_id", DataType::UInt64, false), + ArrowField::new("start", DataType::UInt64, false), + ArrowField::new("length", DataType::UInt64, false), + ])), + false, + )), + Arc::new(zone_bound_struct) as ArrayRef, + ), + ]); + + schema_fields.push(ArrowField::new( + col_name, + column_zone_stats_type.clone(), + false, + )); + column_arrays.push(Arc::new(column_stats_struct) as ArrayRef); + } + + // Create schema for the statistics RecordBatch (columnar: one column per dataset column) + let stats_schema = Arc::new(ArrowSchema::new(schema_fields)); + + // Create RecordBatch (columnar structure: one row per zone, one column per dataset column) + let stats_batch = RecordBatch::try_new(stats_schema, column_arrays).map_err(|e| { + Error::invalid_input( + format!("Failed to create statistics batch: {}", e), + location!(), + ) + })?; + + // Serialize to Arrow IPC format + let mut buffer = Vec::new(); + { + let mut writer = + arrow_ipc::writer::FileWriter::try_new(&mut buffer, &stats_batch.schema()) + .map_err(|e| { + Error::invalid_input( + format!("Failed to create IPC writer: {}", e), + location!(), + ) + })?; + writer.write(&stats_batch).map_err(|e| { + Error::invalid_input(format!("Failed to write statistics: {}", e), location!()) + })?; + writer.finish().map_err(|e| { + Error::invalid_input(format!("Failed to finish IPC writer: {}", e), location!()) + })?; + } + + // Store as global buffer + let buffer_bytes = Bytes::from(buffer); + let buffer_index = self.add_global_buffer(buffer_bytes).await?; + + // Store the buffer index in schema metadata so readers can find it + self.schema_metadata.insert( + COLUMN_STATS_BUFFER_INDEX_KEY.to_string(), + buffer_index.to_string(), + ); + self.schema_metadata.insert( + COLUMN_STATS_VERSION_KEY.to_string(), + COLUMN_STATS_VERSION.to_string(), + ); + + Ok(()) + } + pub async fn tell(&mut self) -> Result { Ok(self.writer.tell().await? as u64) } @@ -1710,4 +2136,380 @@ mod tests { .await; assert_eq!(baseline, spilled); } + + #[tokio::test] + async fn test_column_stats_flat_layout() { + // Test that column statistics use columnar layout: one column per dataset column, + // each of type ColumnZoneStatistics struct, one row per zone. + use arrow_array::{Float64Array, Int32Array, StructArray, UInt64Array}; + use arrow_schema::Schema; + + let arrow_schema = Arc::new(Schema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("value", DataType::Float64, false), + ])); + let lance_schema = LanceSchema::try_from(arrow_schema.as_ref()).unwrap(); + + // Create data with 2.5M rows (will create 3 zones at 1M rows each) + let id_data: Vec = (0..2_500_000).collect(); + let value_data: Vec = (0..2_500_000).map(|i| i as f64 * 0.5).collect(); + + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int32Array::from(id_data)), + Arc::new(Float64Array::from(value_data)), + ], + ) + .unwrap(); + + let path = TempObjFile::default(); + let object_store = ObjectStore::local(); + + let options = FileWriterOptions { + disable_column_stats: false, + ..Default::default() + }; + + let mut writer = FileWriter::try_new( + object_store.create(&path).await.unwrap(), + lance_schema.clone(), + options, + ) + .unwrap(); + + writer.write_batch(&batch).await.unwrap(); + writer.finish().await.unwrap(); + + // Read back and verify the columnar layout + let fs = FsFixture::default(); + let file_scheduler = fs + .scheduler + .open_file(&path, &CachedFileSize::unknown()) + .await + .unwrap(); + + let file_reader = FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await + .unwrap(); + + let stats_batch = file_reader + .read_column_stats() + .await + .unwrap() + .expect("Should have column stats"); + + // Columnar layout: one column per dataset column (id, value), one row per zone + let schema = stats_batch.schema(); + assert_eq!( + schema.fields().len(), + 2, + "Schema: {:?}", + schema.fields().iter().map(|f| f.name()).collect::>() + ); + assert_eq!(schema.field(0).name(), "id"); + assert_eq!(schema.field(1).name(), "value"); + + // 3 zones → 3 rows + assert_eq!(stats_batch.num_rows(), 3); + + // Each column is a StructArray (ColumnZoneStatistics: min, max, null_count, nan_count, bound) + let id_col = stats_batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let bound_col = id_col.column_by_name("bound").unwrap(); + let bound_struct = bound_col.as_any().downcast_ref::().unwrap(); + let starts = bound_struct + .column_by_name("start") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let lengths = bound_struct + .column_by_name("length") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(starts.value(0), 0); + assert_eq!(lengths.value(0), 1_000_000); + assert_eq!(starts.value(1), 1_000_000); + assert_eq!(lengths.value(1), 1_000_000); + assert_eq!(starts.value(2), 2_000_000); + assert_eq!(lengths.value(2), 500_000); + } + + #[tokio::test] + async fn test_column_stats_multiple_columns() { + // Test that stats are correctly computed for multiple columns with multiple zones. + // Columnar layout: one column per dataset column (col1, col2, col3), one row per zone. + use arrow_array::{Float64Array, Int32Array}; + use arrow_schema::Schema; + + let arrow_schema = Arc::new(Schema::new(vec![ + ArrowField::new("col1", DataType::Int32, false), + ArrowField::new("col2", DataType::Int32, false), + ArrowField::new("col3", DataType::Float64, false), + ])); + let lance_schema = LanceSchema::try_from(arrow_schema.as_ref()).unwrap(); + + // Create data with 1.5M rows (will create 2 zones) + let rows = 1_500_000; + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..rows)), + Arc::new(Int32Array::from_iter_values((0..rows).map(|i| i * 2))), + Arc::new(Float64Array::from_iter_values( + (0..rows).map(|i| i as f64 * 0.5), + )), + ], + ) + .unwrap(); + + let path = TempObjFile::default(); + let object_store = ObjectStore::local(); + + let options = FileWriterOptions { + disable_column_stats: false, + ..Default::default() + }; + + let mut writer = FileWriter::try_new( + object_store.create(&path).await.unwrap(), + lance_schema.clone(), + options, + ) + .unwrap(); + + writer.write_batch(&batch).await.unwrap(); + writer.finish().await.unwrap(); + + // Read back and verify stats + let fs = FsFixture::default(); + let file_scheduler = fs + .scheduler + .open_file(&path, &CachedFileSize::unknown()) + .await + .unwrap(); + + let file_reader = FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await + .unwrap(); + + let stats_batch = file_reader + .read_column_stats() + .await + .unwrap() + .expect("Should have column stats"); + + // Columnar layout: 3 columns (col1, col2, col3), 2 rows (one per zone) + assert_eq!(stats_batch.num_columns(), 3); + assert_eq!(stats_batch.num_rows(), 2); + + assert!(stats_batch.column_by_name("col1").is_some()); + assert!(stats_batch.column_by_name("col2").is_some()); + assert!(stats_batch.column_by_name("col3").is_some()); + + // Each column is a StructArray (ColumnZoneStatistics) with min, max, null_count, nan_count, bound + for col_name in ["col1", "col2", "col3"] { + let col = stats_batch.column_by_name(col_name).unwrap(); + let struct_arr = col + .as_any() + .downcast_ref::() + .unwrap(); + assert!(struct_arr.column_by_name("min").is_some()); + assert!(struct_arr.column_by_name("max").is_some()); + assert!(struct_arr.column_by_name("null_count").is_some()); + assert!(struct_arr.column_by_name("bound").is_some()); + } + } + + #[tokio::test] + async fn test_column_stats_with_nulls_and_nans() { + // Test that null_count and nan_count are correctly tracked. + // Columnar layout: one column per dataset column (id, value), one row per zone. + use arrow_array::{Float64Array, Int32Array, StructArray, UInt32Array}; + use arrow_schema::Schema; + + let arrow_schema = Arc::new(Schema::new(vec![ + ArrowField::new("id", DataType::Int32, true), // nullable + ArrowField::new("value", DataType::Float64, false), + ])); + let lance_schema = LanceSchema::try_from(arrow_schema.as_ref()).unwrap(); + + // Create data with nulls and NaNs + let id_data = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + let value_data = Float64Array::from(vec![1.0, f64::NAN, 3.0, f64::NAN, 5.0]); + + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(id_data), Arc::new(value_data)], + ) + .unwrap(); + + let path = TempObjFile::default(); + let object_store = ObjectStore::local(); + + let options = FileWriterOptions { + disable_column_stats: false, + ..Default::default() + }; + + let mut writer = FileWriter::try_new( + object_store.create(&path).await.unwrap(), + lance_schema.clone(), + options, + ) + .unwrap(); + + writer.write_batch(&batch).await.unwrap(); + writer.finish().await.unwrap(); + + // Read back and verify null/nan counts + let fs = FsFixture::default(); + let file_scheduler = fs + .scheduler + .open_file(&path, &CachedFileSize::unknown()) + .await + .unwrap(); + + let file_reader = FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await + .unwrap(); + + let stats_batch = file_reader + .read_column_stats() + .await + .unwrap() + .expect("Should have column stats"); + + // Columnar layout: 2 columns (id, value), 1 row (one zone for 5 rows) + assert_eq!(stats_batch.num_columns(), 2); + assert_eq!(stats_batch.num_rows(), 1); + + let id_col = stats_batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value_col = stats_batch + .column_by_name("value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let id_null_counts = id_col + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_nan_counts = id_col + .column_by_name("nan_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value_null_counts = value_col + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value_nan_counts = value_col + .column_by_name("nan_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(id_null_counts.value(0), 2); // 2 nulls in id column + assert_eq!(id_nan_counts.value(0), 0); // No NaNs in int column + assert_eq!(value_null_counts.value(0), 0); // No nulls in value column + assert_eq!(value_nan_counts.value(0), 2); // 2 NaNs in value column + } + + #[tokio::test] + async fn test_column_stats_disabled() { + // Test that no stats are written when disabled + use arrow_array::Int32Array; + use arrow_schema::Schema; + + let arrow_schema = Arc::new(Schema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let lance_schema = LanceSchema::try_from(arrow_schema.as_ref()).unwrap(); + + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1000))], + ) + .unwrap(); + + let path = TempObjFile::default(); + let object_store = ObjectStore::local(); + + let options = FileWriterOptions { + disable_column_stats: true, // Disabled + ..Default::default() + }; + + let mut writer = FileWriter::try_new( + object_store.create(&path).await.unwrap(), + lance_schema.clone(), + options, + ) + .unwrap(); + + writer.write_batch(&batch).await.unwrap(); + writer.finish().await.unwrap(); + + // Read back and verify no stats + let fs = FsFixture::default(); + let file_scheduler = fs + .scheduler + .open_file(&path, &CachedFileSize::unknown()) + .await + .unwrap(); + + let file_reader = FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &LanceCache::no_cache(), + FileReaderOptions::default(), + ) + .await + .unwrap(); + + let stats_batch = file_reader.read_column_stats().await.unwrap(); + assert!(stats_batch.is_none(), "Should not have column stats"); + } } diff --git a/rust/lance-file/src/writer/column_stats.rs b/rust/lance-file/src/writer/column_stats.rs new file mode 100644 index 00000000000..4827a69df4e --- /dev/null +++ b/rust/lance-file/src/writer/column_stats.rs @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Column statistics collection for Lance data files. +//! +//! This module provides per-zone column statistics +//! that are collected during file writing and stored in the file metadata +//! as a global buffer + +use arrow_array::ArrayRef; +use arrow_schema::{DataType, Field as ArrowField, Fields}; +use datafusion::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; +use datafusion_common::ScalarValue; +use datafusion_expr::Accumulator; +use lance_core::utils::zone::{ZoneBound, ZoneProcessor}; +use lance_core::{Error, Result}; +use snafu::location; + +/// Zone size for column statistics (1 million rows per zone) +pub(super) const COLUMN_STATS_ZONE_SIZE: u64 = 1_000_000; + +/// Column statistics for a single zone +#[derive(Debug, Clone)] +pub(super) struct ColumnZoneStatistics { + pub min: ScalarValue, + pub max: ScalarValue, + pub null_count: u32, + pub nan_count: u32, + pub bound: ZoneBound, +} + +/// Statistics processor for a single column that implements ZoneProcessor trait +pub(super) struct ColumnStatisticsProcessor { + data_type: DataType, + min: MinAccumulator, + max: MaxAccumulator, + null_count: u32, + nan_count: u32, +} + +/// Returns true for types that support min/max aggregation. +/// We exclude nested types (Struct, List, etc.) because DataFusion's try_new can succeed +/// for them but comparison fails at runtime. For other types we delegate to try_new. +fn supports_min_max(data_type: &DataType) -> bool { + // Exclude types that try_new accepts but fail at runtime when comparing. + // FixedSizeList is excluded because extension types (e.g. bfloat16) use it as storage; + // min/max arrays then lack extension metadata and cause schema mismatch. + if matches!( + data_type, + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Struct(_) + | DataType::Map(_, _) + | DataType::RunEndEncoded(_, _) + | DataType::Dictionary(_, _) + ) { + return false; + } + MinAccumulator::try_new(data_type).is_ok() && MaxAccumulator::try_new(data_type).is_ok() +} + +impl ColumnStatisticsProcessor { + pub(super) fn new(data_type: DataType) -> Result { + if !supports_min_max(&data_type) { + return Err(Error::invalid_input( + format!( + "Column statistics (min/max) not supported for type {:?}", + data_type + ), + location!(), + )); + } + let min = MinAccumulator::try_new(&data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + let max = MaxAccumulator::try_new(&data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + Ok(Self { + data_type, + min, + max, + null_count: 0, + nan_count: 0, + }) + } + + fn count_nans(array: &ArrayRef) -> u32 { + match array.data_type() { + DataType::Float16 => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + array.values().iter().filter(|&&x| x.is_nan()).count() as u32 + } + DataType::Float32 => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + array.values().iter().filter(|&&x| x.is_nan()).count() as u32 + } + DataType::Float64 => { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + array.values().iter().filter(|&&x| x.is_nan()).count() as u32 + } + _ => 0, + } + } +} + +impl ZoneProcessor for ColumnStatisticsProcessor { + type ZoneStatistics = ColumnZoneStatistics; + + fn process_chunk(&mut self, array: &ArrayRef) -> Result<()> { + self.null_count += array.null_count() as u32; + self.nan_count += Self::count_nans(array); + self.min + .update_batch(std::slice::from_ref(array)) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + self.max + .update_batch(std::slice::from_ref(array)) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + Ok(()) + } + + fn finish_zone(&mut self, bound: ZoneBound) -> Result { + let stats = ColumnZoneStatistics { + min: self + .min + .evaluate() + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?, + max: self + .max + .evaluate() + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?, + null_count: self.null_count, + nan_count: self.nan_count, + bound, + }; + + // Auto-reset for next zone + self.min = MinAccumulator::try_new(&self.data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + self.max = MaxAccumulator::try_new(&self.data_type) + .map_err(|e| Error::invalid_input(e.to_string(), location!()))?; + self.null_count = 0; + self.nan_count = 0; + + Ok(stats) + } +} + +/// Create Arrow struct type for file level ColumnZoneStatistics for a given column type. +pub(super) fn create_column_zone_statistics_struct_type(column_type: &DataType) -> DataType { + let zone_bound_fields = Fields::from(vec![ + ArrowField::new("fragment_id", DataType::UInt64, false), + ArrowField::new("start", DataType::UInt64, false), + ArrowField::new("length", DataType::UInt64, false), + ]); + + DataType::Struct(Fields::from(vec![ + // min and max are nullable because they can be null for empty zones + ArrowField::new("min", column_type.clone(), true), + ArrowField::new("max", column_type.clone(), true), + ArrowField::new("null_count", DataType::UInt32, false), + ArrowField::new("nan_count", DataType::UInt32, false), + ArrowField::new("bound", DataType::Struct(zone_bound_fields), false), + ])) +} + +/// Create Arrow struct type for consolidated zone statistics for a given column type. +pub fn create_consolidated_zone_struct_type(column_type: &DataType) -> DataType { + DataType::Struct(Fields::from(vec![ + ArrowField::new("fragment_id", DataType::UInt64, false), + ArrowField::new("zone_start", DataType::UInt64, false), + ArrowField::new("zone_length", DataType::UInt64, false), + ArrowField::new("null_count", DataType::UInt32, false), + ArrowField::new("nan_count", DataType::UInt32, false), + ArrowField::new("min_value", column_type.clone(), true), + ArrowField::new("max_value", column_type.clone(), true), + ])) +} diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index 3057323b5da..e95bb456dd9 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -40,7 +40,7 @@ use lance_core::Result; use roaring::RoaringBitmap; use snafu::location; -use super::zoned::{rebuild_zones, search_zones, ZoneBound, ZoneProcessor, ZoneTrainer}; +use super::zoned::{rebuild_zones, search_zones, IndexZoneTrainer, ZoneBound, ZoneProcessor}; const BLOOMFILTER_FILENAME: &str = "bloomfilter.lance"; const BLOOMFILTER_ITEM_META_KEY: &str = "bloomfilter_item"; @@ -498,7 +498,7 @@ impl ScalarIndex for BloomFilterIndex { }; let processor = BloomFilterProcessor::new(params.clone())?; - let trainer = ZoneTrainer::new(processor, params.number_of_items)?; + let trainer = IndexZoneTrainer::new(processor, params.number_of_items)?; let updated_blocks = rebuild_zones(&self.zones, trainer, new_data).await?; // Write the combined zones back to storage @@ -602,12 +602,12 @@ impl BloomFilterIndexBuilder { }) } - /// Train the builder using the shared ZoneTrainer. The input stream is expected to + /// Train the builder using the shared IndexZoneTrainer. The input stream is expected to /// contain the value column followed by `_rowaddr`, matching the order emitted by /// the scalar index training pipeline. pub async fn train(&mut self, batches_source: SendableRecordBatchStream) -> Result<()> { let processor = BloomFilterProcessor::new(self.params.clone())?; - let trainer = ZoneTrainer::new(processor, self.params.number_of_items)?; + let trainer = IndexZoneTrainer::new(processor, self.params.number_of_items)?; self.blocks = trainer.train(batches_source).await?; Ok(()) } @@ -697,13 +697,12 @@ struct BloomFilterProcessor { impl BloomFilterProcessor { fn new(params: BloomFilterIndexBuilderParams) -> Result { - let mut processor = Self { + let sbbf = Self::build_filter(¶ms)?; + Ok(Self { params, - sbbf: None, + sbbf: Some(sbbf), cur_zone_has_null: false, - }; - processor.reset()?; - Ok(processor) + }) } fn build_filter(params: &BloomFilterIndexBuilderParams) -> Result { @@ -1009,17 +1008,17 @@ impl ZoneProcessor for BloomFilterProcessor { location!(), ) })?; - Ok(BloomFilterStatistics { + let stats = BloomFilterStatistics { bound, has_null: self.cur_zone_has_null, bloom_filter: bloom_filter.clone(), - }) - } + }; - fn reset(&mut self) -> Result<()> { + // Auto-reset for next zone self.sbbf = Some(Self::build_filter(&self.params)?); self.cur_zone_has_null = false; - Ok(()) + + Ok(stats) } } diff --git a/rust/lance-index/src/scalar/zoned.rs b/rust/lance-index/src/scalar/zoned.rs index bb2be962d16..f5ce3ce069d 100644 --- a/rust/lance-index/src/scalar/zoned.rs +++ b/rust/lance-index/src/scalar/zoned.rs @@ -6,8 +6,11 @@ //! This module provides common infrastructure for building zone-based scalar indexes. //! It handles chunking data streams into fixed-size zones while respecting fragment //! boundaries and computing zone bounds that remain valid after row deletions. +//! +//! Core zone types (`ZoneBound`, `ZoneProcessor`) are defined in `lance_core::utils::zone` +//! and re-exported here for convenience. -use arrow_array::{ArrayRef, UInt64Array}; +use arrow_array::UInt64Array; use datafusion::execution::SendableRecordBatchStream; use futures::TryStreamExt; use lance_core::error::Error; @@ -17,55 +20,17 @@ use lance_core::{Result, ROW_ADDR}; use lance_datafusion::chunker::chunk_concat_stream; use snafu::location; -// -// Example: Suppose we have two fragments, each with 4 rows. -// Fragment 0: start = 0, length = 4 // covers rows 0, 1, 2, 3 in fragment 0 -// The row addresses for fragment 0 are: 0, 1, 2, 3 -// Fragment 1: start = 0, length = 4 // covers rows 0, 1, 2, 3 in fragment 1 -// The row addresses for fragment 1 are: (1<<32), (1<<32)+1, (1<<32)+2, (1<<32)+3 -// -// Deletion is 0 index based. We delete the 0th and 1st row in fragment 0, -// and the 1st and 2nd row in fragment 1, -// Fragment 0: start = 2, length = 2 // covers rows 2, 3 in fragment 0 -// The row addresses for fragment 0 are: 2, 3 -// Fragment 1: start = 0, length = 4 // covers rows 0, 3 in fragment 1 -// The row addresses for fragment 1 are: (1<<32), (1<<32)+3 -/// Zone bound within a fragment -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct ZoneBound { - pub fragment_id: u64, - // start is start row of the zone in the fragment, also known - // as the local offset. To get the actual first row address, - // use `(fragment_id << 32) | start`. - pub start: u64, - // length is the span of row offsets between the first and last row in the zone, - // calculated as (last_row_offset - first_row_offset + 1). It is not the count - // of physical rows, since deletions may create gaps within the span. - pub length: usize, -} - -/// Index-specific logic used while building zones. -pub trait ZoneProcessor { - type ZoneStatistics; - - /// Process a slice of values that belongs to the current zone. - fn process_chunk(&mut self, values: &ArrayRef) -> Result<()>; - - /// Emit statistics when the zone is full or the fragment changes. - fn finish_zone(&mut self, bound: ZoneBound) -> Result; - - /// Reset state so the processor can handle the next zone. - fn reset(&mut self) -> Result<()>; -} +// Re-export core zone types for convenience +pub use lance_core::utils::zone::{ZoneBound, ZoneProcessor}; /// Trainer that handles chunking, fragment boundaries, and zone flushing. #[derive(Debug)] -pub struct ZoneTrainer

{ +pub struct IndexZoneTrainer

{ processor: P, zone_capacity: u64, } -impl

ZoneTrainer

+impl

IndexZoneTrainer

where P: ZoneProcessor, { @@ -109,8 +74,6 @@ where let mut zone_start_offset: Option = None; let mut zone_end_offset: Option = None; - self.processor.reset()?; - while let Some(batch) = batches.try_next().await? { if batch.num_rows() == 0 { continue; @@ -200,8 +163,6 @@ where &mut zone_start_offset, &mut zone_end_offset, )?; - } else { - self.processor.reset()?; } } @@ -236,7 +197,7 @@ where *current_zone_len = 0; *zone_start_offset = None; *zone_end_offset = None; - processor.reset()?; + // finish_zone() resets the processor internally Ok(()) } } @@ -278,7 +239,7 @@ where /// into an existing zone list. pub async fn rebuild_zones

( existing: &[P::ZoneStatistics], - trainer: ZoneTrainer

, + trainer: IndexZoneTrainer

, stream: SendableRecordBatchStream, ) -> Result> where @@ -329,15 +290,13 @@ mod tests { } fn finish_zone(&mut self, bound: ZoneBound) -> Result { - Ok(MockStats { + let stats = MockStats { sum: self.current_sum, bound, - }) - } - - fn reset(&mut self) -> Result<()> { + }; + // Auto-reset for next zone self.current_sum = 0; - Ok(()) + Ok(stats) } } @@ -369,7 +328,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 4).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 4).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Three zones: offsets [0..=3], [4..=7], [8..=9] @@ -400,7 +359,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 10).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 10).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Two zones, one per fragment (capacity=10 is large enough) @@ -425,7 +384,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 10).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 10).unwrap(); let err = trainer.train(stream).await.unwrap_err(); assert!( format!("{}", err).contains("zone row offsets are out of order"), @@ -454,7 +413,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 10).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 10).unwrap(); let stats = trainer.train(stream).await.unwrap(); // One zone containing the 3 valid rows (empty batches skipped) @@ -476,7 +435,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 1).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 1).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Three zones, one per row (capacity=1) @@ -501,7 +460,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 10000).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 10000).unwrap(); let stats = trainer.train(stream).await.unwrap(); // One zone containing all 100 rows (capacity is large enough) @@ -514,7 +473,7 @@ mod tests { #[tokio::test] async fn rejects_zero_capacity() { let processor = MockProcessor::new(); - let result = ZoneTrainer::new(processor, 0); + let result = IndexZoneTrainer::new(processor, 0); assert!(result.is_err()); assert!(result .unwrap_err() @@ -535,7 +494,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 4).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 4).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Two zones: first 4 rows, then remaining 2 rows @@ -566,7 +525,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 3).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 3).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Three zones: frag 0 full zone, frag 0 partial (flushed at boundary), frag 1 @@ -607,7 +566,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 4).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 4).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Should create 2 zones (capacity=4): @@ -642,7 +601,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 10).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 10).unwrap(); let stats = trainer.train(stream).await.unwrap(); // One zone with 3 rows, but offset span [0..=200] so length=201 due to large gaps @@ -668,7 +627,7 @@ mod tests { )); let processor = MockProcessor::new(); - let trainer = ZoneTrainer::new(processor, 10).unwrap(); + let trainer = IndexZoneTrainer::new(processor, 10).unwrap(); let stats = trainer.train(stream).await.unwrap(); // Should create 3 zones (one per fragment) @@ -815,7 +774,7 @@ mod tests { stream::once(async { Ok(batch) }), )); - let trainer = ZoneTrainer::new(MockProcessor::new(), 2).unwrap(); + let trainer = IndexZoneTrainer::new(MockProcessor::new(), 2).unwrap(); let rebuilt = rebuild_zones(&existing, trainer, stream).await.unwrap(); // Existing zone should remain unchanged and new stats appended afterwards assert_eq!(rebuilt.len(), 2); @@ -845,7 +804,7 @@ mod tests { stream::once(async { Ok(batch) }), )); - let trainer = ZoneTrainer::new(MockProcessor::new(), 2).unwrap(); + let trainer = IndexZoneTrainer::new(MockProcessor::new(), 2).unwrap(); let rebuilt = rebuild_zones(&existing, trainer, stream).await.unwrap(); // Existing zone plus two new fragments should yield three total zones assert_eq!(rebuilt.len(), 3); diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index b631ba89d48..aceb09e7035 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -44,7 +44,7 @@ use lance_core::Result; use roaring::RoaringBitmap; use snafu::location; -use super::zoned::{rebuild_zones, search_zones, ZoneBound, ZoneProcessor, ZoneTrainer}; +use super::zoned::{rebuild_zones, search_zones, IndexZoneTrainer, ZoneBound, ZoneProcessor}; const ROWS_PER_ZONE_DEFAULT: u64 = 8192; // 1 zone every two batches const ZONEMAP_FILENAME: &str = "zonemap.lance"; @@ -131,6 +131,16 @@ impl DeepSizeOf for ZoneMapIndex { } impl ZoneMapIndex { + /// Check if a ScalarValue is NaN + fn is_nan(value: &ScalarValue) -> bool { + match value { + ScalarValue::Float16(Some(f)) => f.is_nan(), + ScalarValue::Float32(Some(f)) => f.is_nan(), + ScalarValue::Float64(Some(f)) => f.is_nan(), + _ => false, + } + } + /// Evaluates whether a zone could potentially contain values matching the query /// For NaN, total order is used here /// reference: https://doc.rust-lang.org/std/primitive.f64.html#method.total_cmp @@ -147,92 +157,40 @@ impl ZoneMapIndex { Ok(zone.null_count > 0) } SargableQuery::Equals(target) => { - // Zone contains matching values if target falls within [min, max] range - // Handle null values - if target is null, check null_count + // Handle null values if target.is_null() { return Ok(zone.null_count > 0); } - // Handle NaN values - if target is NaN, check nan_count - let is_nan = match target { - ScalarValue::Float16(Some(f)) => f.is_nan(), - ScalarValue::Float32(Some(f)) => f.is_nan(), - ScalarValue::Float64(Some(f)) => f.is_nan(), - _ => false, - }; - - if is_nan { + // Handle NaN values + if Self::is_nan(target) { return Ok(zone.nan_count > 0); } // Check if target is within the zone's range // Handle the case where zone.max is NaN (zone contains both finite values and NaN) let min_check = target >= &zone.min; - let max_check = match &zone.max { - ScalarValue::Float16(Some(f)) if f.is_nan() => true, - ScalarValue::Float32(Some(f)) if f.is_nan() => true, - ScalarValue::Float64(Some(f)) if f.is_nan() => true, - _ => target <= &zone.max, - }; + let max_check = Self::is_nan(&zone.max) || target <= &zone.max; Ok(min_check && max_check) } SargableQuery::Range(start, end) => { - // Zone overlaps with query range if there's any intersection between - // the zone's [min, max] and the query's range let zone_min = &zone.min; let zone_max = &zone.max; let start_check = match start { Bound::Unbounded => true, Bound::Included(s) => { - // Handle NaN in range bounds - NaN is greater than all finite values - match s { - ScalarValue::Float16(Some(f)) => { - if f.is_nan() { - return Ok(zone.nan_count > 0); - } - } - ScalarValue::Float32(Some(f)) => { - if f.is_nan() { - return Ok(zone.nan_count > 0); - } - } - ScalarValue::Float64(Some(f)) => { - if f.is_nan() { - return Ok(zone.nan_count > 0); - } - } - _ => {} - } - // Handle the case where zone_max is NaN - // If zone_max is NaN, the zone contains both finite values and NaN - // Since we don't know the actual max, we'll be conservative and include the zone - match zone_max { - ScalarValue::Float16(Some(f)) if f.is_nan() => true, - ScalarValue::Float32(Some(f)) if f.is_nan() => true, - ScalarValue::Float64(Some(f)) if f.is_nan() => true, - _ => zone_max >= s, + // If bound is NaN, check if zone has NaN values + if Self::is_nan(s) { + return Ok(zone.nan_count > 0); } + // If zone_max is NaN, be conservative and include the zone + Self::is_nan(zone_max) || zone_max >= s } Bound::Excluded(s) => { - // Handle NaN in range bounds - match s { - ScalarValue::Float16(Some(f)) => { - if f.is_nan() { - return Ok(false); // Nothing is greater than NaN - } - } - ScalarValue::Float32(Some(f)) => { - if f.is_nan() { - return Ok(false); // Nothing is greater than NaN - } - } - ScalarValue::Float64(Some(f)) => { - if f.is_nan() { - return Ok(false); // Nothing is greater than NaN - } - } - _ => {} + // Nothing is greater than NaN + if Self::is_nan(s) { + return Ok(false); } zone_max > s } @@ -241,48 +199,16 @@ impl ZoneMapIndex { let end_check = match end { Bound::Unbounded => true, Bound::Included(e) => { - // Handle NaN in range bounds - match e { - ScalarValue::Float16(Some(f)) => { - if f.is_nan() { - // NaN is included, so check if zone has NaN values or finite values - return Ok(zone.nan_count > 0 || zone_min <= e); - } - } - ScalarValue::Float32(Some(f)) => { - if f.is_nan() { - return Ok(zone.nan_count > 0 || zone_min <= e); - } - } - ScalarValue::Float64(Some(f)) => { - if f.is_nan() { - return Ok(zone.nan_count > 0 || zone_min <= e); - } - } - _ => {} + // NaN is included, so check if zone has NaN values or finite values + if Self::is_nan(e) { + return Ok(zone.nan_count > 0 || zone_min <= e); } zone_min <= e } Bound::Excluded(e) => { - // Handle NaN in range bounds - match e { - ScalarValue::Float16(Some(f)) => { - if f.is_nan() { - // Everything is less than NaN, so include all finite values - return Ok(true); - } - } - ScalarValue::Float32(Some(f)) => { - if f.is_nan() { - return Ok(true); - } - } - ScalarValue::Float64(Some(f)) => { - if f.is_nan() { - return Ok(true); - } - } - _ => {} + // Everything is less than NaN, so include all finite values + if Self::is_nan(e) { + return Ok(true); } zone_min < e } @@ -295,31 +221,10 @@ impl ZoneMapIndex { Ok(values.iter().any(|value| { if value.is_null() { zone.null_count > 0 + } else if Self::is_nan(value) { + zone.nan_count > 0 } else { - match value { - ScalarValue::Float16(Some(f)) => { - if f.is_nan() { - zone.nan_count > 0 - } else { - value >= &zone.min && value <= &zone.max - } - } - ScalarValue::Float32(Some(f)) => { - if f.is_nan() { - zone.nan_count > 0 - } else { - value >= &zone.min && value <= &zone.max - } - } - ScalarValue::Float64(Some(f)) => { - if f.is_nan() { - zone.nan_count > 0 - } else { - value >= &zone.min && value <= &zone.max - } - } - _ => value >= &zone.min && value <= &zone.max, - } + value >= &zone.min && value <= &zone.max } })) } @@ -572,7 +477,7 @@ impl ScalarIndex for ZoneMapIndex { let options = ZoneMapIndexBuilderParams::new(self.rows_per_zone); let processor = ZoneMapProcessor::new(value_type.clone())?; - let trainer = ZoneTrainer::new(processor, self.rows_per_zone)?; + let trainer = IndexZoneTrainer::new(processor, self.rows_per_zone)?; let updated_zones = rebuild_zones(&self.zones, trainer, new_data).await?; // Serialize the combined zones back into the index file @@ -657,7 +562,7 @@ impl ZoneMapIndexBuilder { /// by the scalar index registry. pub async fn train(&mut self, batches_source: SendableRecordBatchStream) -> Result<()> { let processor = ZoneMapProcessor::new(self.items_type.clone())?; - let trainer = ZoneTrainer::new(processor, self.options.rows_per_zone)?; + let trainer = IndexZoneTrainer::new(processor, self.options.rows_per_zone)?; self.maps = trainer.train(batches_source).await?; Ok(()) } @@ -792,21 +697,21 @@ impl ZoneProcessor for ZoneMapProcessor { } fn finish_zone(&mut self, bound: ZoneBound) -> Result { - Ok(ZoneMapStatistics { + let stats = ZoneMapStatistics { min: self.min.evaluate()?, max: self.max.evaluate()?, null_count: self.null_count, nan_count: self.nan_count, bound, - }) - } + }; - fn reset(&mut self) -> Result<()> { + // Auto-reset for next zone self.min = MinAccumulator::try_new(&self.data_type)?; self.max = MaxAccumulator::try_new(&self.data_type)?; self.null_count = 0; self.nan_count = 0; - Ok(()) + + Ok(stats) } } diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index d50e59d1bc7..b77071ffb05 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -101,6 +101,9 @@ pub struct Manifest { /* external base paths */ pub base_paths: HashMap, + + /// Column statistics metadata. + pub column_stats: Option, } // We use the most significant bit to indicate that a transaction is detached @@ -196,6 +199,7 @@ impl Manifest { config: HashMap::new(), table_metadata: HashMap::new(), base_paths, + column_stats: None, } } @@ -227,6 +231,7 @@ impl Manifest { config: previous.config.clone(), table_metadata: previous.table_metadata.clone(), base_paths: previous.base_paths.clone(), + column_stats: previous.column_stats.clone(), } } @@ -289,6 +294,7 @@ impl Manifest { base_paths }, table_metadata: self.table_metadata.clone(), + column_stats: self.column_stats.clone(), } } @@ -601,6 +607,12 @@ impl DeepSizeOf for BasePath { } } +impl DeepSizeOf for pb::ColumnStats { + fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + self.path.deep_size_of_children(context) + size_of::() + } +} + #[derive(Debug, Clone, PartialEq, DeepSizeOf)] pub struct WriterVersion { pub library: String, @@ -939,6 +951,7 @@ impl TryFrom for Manifest { .iter() .map(|item| (item.id, item.clone().into())) .collect(), + column_stats: p.column_stats, }) } } @@ -1002,6 +1015,7 @@ impl From<&Manifest> for pb::Manifest { }) .collect(), transaction_section: m.transaction_section.map(|i| i as u64), + column_stats: m.column_stats.clone(), } } } diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 3913c5b255f..8656c75dbef 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -64,6 +64,8 @@ pub(crate) mod blob; mod branch_location; pub mod builder; pub mod cleanup; +pub mod column_stats_consolidator; +pub mod column_stats_reader; pub mod delta; pub mod fragment; mod hash_joiner; @@ -2963,6 +2965,7 @@ impl Dataset { table_metadata_updates: None, schema_metadata_updates: None, field_metadata_updates, + column_stats: None, }, ) .await diff --git a/rust/lance/src/dataset/cleanup.rs b/rust/lance/src/dataset/cleanup.rs index 1c4d0c90cca..f343bdf3a4a 100644 --- a/rust/lance/src/dataset/cleanup.rs +++ b/rust/lance/src/dataset/cleanup.rs @@ -1158,6 +1158,7 @@ mod tests { store_params: Some(self.os_params()), commit_handler: Some(Arc::new(RenameCommitHandler)), mode, + disable_column_stats: true, // One commit per write for predictable file counts ..Default::default() }), ) diff --git a/rust/lance/src/dataset/column_stats_consolidator.rs b/rust/lance/src/dataset/column_stats_consolidator.rs new file mode 100644 index 00000000000..7cc74dc9753 --- /dev/null +++ b/rust/lance/src/dataset/column_stats_consolidator.rs @@ -0,0 +1,1775 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Column statistics consolidation utilities. +//! +//! This module provides functionality for consolidating per-fragment column statistics +//! into a single consolidated stats file. It works in conjunction with +//! [`column_stats_reader`](crate::dataset::column_stats_reader) which provides +//! the reading API. +//! +//! # Overview +//! +//! Per-fragment statistics are stored in each data file's global buffer in a **columnar layout** +//! (one column per dataset column, each row represents a zone, with type `ColumnZoneStatistics`). +//! This module consolidates them into a **columnar layout** with one row total +//! (one column per dataset column, each containing a `List>` with zone statistics). +//! +//! # Workflow +//! +//! 1. **Per-fragment stats** (columnar layout, local offsets) → stored in data files +//! 2. **Consolidation** (this module) → converts to columnar layout with one row, local offsets preserved +//! 3. **Reading** ([`column_stats_reader`](crate::dataset::column_stats_reader)) → provides +//! typed access to consolidated stats +//! + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::StructArray; +use arrow_array::{Array, ArrayRef, ListArray, RecordBatch, UInt32Array, UInt64Array}; +use arrow_buffer::OffsetBuffer; +use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; +use datafusion::scalar::ScalarValue; +use lance_core::datatypes::Schema; +use lance_core::utils::zone::ZoneBound; +use lance_core::Result; +use lance_encoding::decoder::DecoderPlugins; +use lance_encoding::version::LanceFileVersion; +use lance_file::determine_file_version; +use lance_file::reader::FileReader; +use lance_file::writer::create_consolidated_zone_struct_type; +use lance_io::object_store::ObjectStore; +use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; +use lance_io::utils::CachedFileSize; +use object_store::path::Path; +use snafu::location; + +use crate::dataset::fragment::FileFragment; +use crate::{Dataset, Error}; + +/// Consolidated statistics for a single zone of a single column. +#[derive(Debug, Clone)] +pub struct ZoneStats { + /// Zone boundary information (fragment_id, start offset, length) + pub bound: ZoneBound, + /// Zone ID within the fragment (0, 1, 2, ...) + /// This is the index of the zone within the fragment file + pub zone_id: u32, + pub null_count: u32, + pub nan_count: u32, + pub min: ScalarValue, + pub max: ScalarValue, +} + +/// Consolidate column statistics from all fragments into a single file. +/// +/// This function implements an "all-or-nothing" approach: if any fragment +/// lacks column statistics, consolidation is skipped entirely. +/// It should be relaxed in the future to support partial stats dataset consolidation. #5857 +/// +/// # How It Works +/// +/// Each fragment file contains per-fragment statistics in a **columnar layout** (see writer.rs): +/// Each dataset column maps to a column in the stats file, with type `ColumnZoneStatistics` (struct). +/// Each row represents a zone. +/// +/// **Fragment file layout**: +/// ```text +/// ┌─────────────┬──────────────────────────────┬──────────────────────────────┐ +/// │ Row (Zone) │ "id" (ColumnZoneStatistics) │ "price" (ColumnZoneStatistics)│ +/// ├─────────────┼──────────────────────────────┼──────────────────────────────┤ +/// │ 0 │ {min, max, null_count, ...} │ {min, max, null_count, ...} │ +/// │ 1 │ {min, max, null_count, ...} │ {min, max, null_count, ...} │ +/// │ ... │ ... │ ... │ +/// └─────────────┴──────────────────────────────┴──────────────────────────────┘ +/// ``` +/// +/// **Fragment 0 stats** (2 zones, local offsets): +/// ```text +/// Row 0 (zone 0): +/// "id": ColumnZoneStatistics{min="1", max="1000000", null_count=0, nan_count=0, bound={fragment_id=0, start=0, length=1000000}} +/// "price": ColumnZoneStatistics{min="9.99", max="99.99", null_count=0, nan_count=0, bound={fragment_id=0, start=0, length=1000000}} +/// +/// Row 1 (zone 1): +/// "id": ColumnZoneStatistics{min="1000001", max="2000000", null_count=0, nan_count=0, bound={fragment_id=0, start=1000000, length=1000000}} +/// "price": ColumnZoneStatistics{min="10.50", max="100.50", null_count=0, nan_count=0, bound={fragment_id=0, start=1000000, length=1000000}} +/// ``` +/// +/// **Fragment 1 stats** (2 zones, local offsets): +/// ```text +/// Row 0 (zone 0): +/// "id": ColumnZoneStatistics{min="2000001", max="3000000", null_count=0, nan_count=0, bound={fragment_id=1, start=0, length=1000000}} +/// "price": ColumnZoneStatistics{min="15.00", max="150.00", null_count=0, nan_count=0, bound={fragment_id=1, start=0, length=1000000}} +/// +/// Row 1 (zone 1): +/// "id": ColumnZoneStatistics{min="3000001", max="4000000", null_count=0, nan_count=0, bound={fragment_id=1, start=1000000, length=1000000}} +/// "price": ColumnZoneStatistics{min="20.00", max="200.00", null_count=0, nan_count=0, bound={fragment_id=1, start=1000000, length=1000000}} +/// ``` +/// +/// This function **consolidates** them into a **columnar layout** with one row total: +/// Each dataset column maps to a column in the consolidated stats file, with type `List>`. +/// The list is ordered by zone_id first, then fragment_id. Zone offsets remain local (per fragment). +/// +/// **Consolidated file layout**: +/// ```text +/// ┌─────┬──────────────────────────────────────┬──────────────────────────────────────┐ +/// │ Row │ "id" (List>) │ "price" (List>) │ +/// ├─────┼──────────────────────────────────────┼──────────────────────────────────────┤ +/// │ 0 │ [struct{...}, struct{...}, ...] │ [struct{...}, struct{...}, ...] │ +/// └─────┴──────────────────────────────────────┴──────────────────────────────────────┘ +/// ``` +/// +/// **Consolidated stats** (one row total, columnar): +/// ```text +/// Row 0: +/// "id": List[ +/// struct{fragment_id=0, zone_start=0, zone_length=1000000, null_count=0, nan_count=0, min_value="1", max_value="1000000"}, +/// struct{fragment_id=1, zone_start=0, zone_length=1000000, null_count=0, nan_count=0, min_value="2000001", max_value="3000000"}, +/// struct{fragment_id=0, zone_start=1000000, zone_length=1000000, null_count=0, nan_count=0, min_value="1000001", max_value="2000000"}, +/// struct{fragment_id=1, zone_start=1000000, zone_length=1000000, null_count=0, nan_count=0, min_value="3000001", max_value="4000000"} +/// ] +/// "price": List[ +/// struct{fragment_id=0, zone_start=0, zone_length=1000000, null_count=0, nan_count=0, min_value="9.99", max_value="99.99"}, +/// struct{fragment_id=1, zone_start=0, zone_length=1000000, null_count=0, nan_count=0, min_value="15.00", max_value="150.00"}, +/// struct{fragment_id=0, zone_start=1000000, zone_length=1000000, null_count=0, nan_count=0, min_value="10.50", max_value="100.50"}, +/// struct{fragment_id=1, zone_start=1000000, zone_length=1000000, null_count=0, nan_count=0, min_value="20.00", max_value="200.00"} +/// ] +/// ``` +/// +/// **Key points**: +/// - Zone offsets (`zone_start`) remain **local** (per fragment), not global +/// - List elements are ordered by `(zone_id, fragment_id)`: all zone 0s first, then all zone 1s, etc. +/// - Each dataset column has its own column in the consolidated file +/// +pub async fn consolidate_column_stats(dataset: &Dataset) -> Result> { + // Step 1: Pre-check - ALL fragments must have stats (all-or-nothing) + let fragments = dataset.get_fragments(); + let total_fragments = fragments.len(); + let mut fragments_with_stats = 0; + + for fragment in &fragments { + if fragment_has_stats(dataset, fragment).await? { + fragments_with_stats += 1; + } + } + + // TODO: Support partial stats dataset consolidation + if fragments_with_stats < total_fragments { + log::warn!( + "Skipping column stats consolidation: only {fragments_with_stats}/{total_fragments} fragments have stats" + ); + return Ok(None); + } + + // Step 2: Collect stats from all fragments, organized by column + let mut stats_by_column: HashMap> = HashMap::new(); + + for fragment in &fragments { + for data_file in &fragment.metadata().files { + let file_path = dataset + .data_file_dir(data_file)? + .child(data_file.path.as_str()); + let file_stats = read_fragment_column_stats(dataset, &file_path).await?; + + if let Some(file_stats) = file_stats { + for (col_name, zones) in file_stats { + let adjusted_zones: Vec = zones + .into_iter() + .map(|z| ZoneStats { + bound: ZoneBound { + fragment_id: fragment.id() as u64, + start: z.bound.start, // Keep local offset + length: z.bound.length, + }, + zone_id: z.zone_id, + null_count: z.null_count, + nan_count: z.nan_count, + min: z.min, + max: z.max, + }) + .collect(); + + stats_by_column + .entry(col_name) + .or_default() + .extend(adjusted_zones); + } + } + } + } + + // If no statistics were collected, return early + if stats_by_column.is_empty() { + return Ok(None); + } + + // Step 3: Build consolidated batch + let consolidated_batch = build_consolidated_batch(stats_by_column, dataset.schema())?; + + // Step 4: Write as Lance file + let stats_path = String::from("_stats/column_stats.lance"); + write_stats_file( + dataset.object_store(), + &dataset.base.child(stats_path.as_str()), + consolidated_batch, + ) + .await?; + + log::info!( + "Consolidated column stats from {} fragments into {}", + total_fragments, + stats_path, + ); + + Ok(Some(stats_path)) +} + +/// Check if a fragment has column statistics. +/// +/// A fragment consists of one or more data files. Column statistics are stored +/// per-file (each FileWriter writes stats independently). This function returns +/// true only if ALL data files in the fragment have column statistics. +/// +/// This is necessary because: +/// - A fragment can have multiple data files (e.g., after appending or splitting) +/// - Each file's FileWriter independently decides whether to write stats +/// - For consolidation, we need stats from ALL files to be present +async fn fragment_has_stats(dataset: &Dataset, fragment: &FileFragment) -> Result { + // Check all data files - all must have stats for the fragment to be considered complete + for data_file in &fragment.metadata().files { + let file_path = dataset + .data_file_dir(data_file)? + .child(data_file.path.as_str()); + // Legacy (0.2) format does not have column stats; skip to avoid opening with v2 reader + if determine_file_version(dataset.object_store.as_ref(), &file_path, None).await? + == LanceFileVersion::Legacy + { + return Ok(false); + } + let scheduler = ScanScheduler::new( + dataset.object_store.clone(), + SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&file_path, &CachedFileSize::unknown()) + .await?; + + let file_reader = FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&file_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await?; + + // If any file lacks stats, return false immediately + if !file_reader.has_column_stats() { + return Ok(false); + } + } + + // All files have stats + Ok(true) +} + +/// Read column statistics from a single data file (.lance file). +/// +/// Returns a map from column name to list of zone statistics. The zones are +/// stored in a columnar layout in the data file (one column per dataset column, +/// each row represents a zone, with type `ColumnZoneStatistics`), which +/// this function converts to a nested structure for easier processing. +/// +/// # Example +/// +/// For a data file with 2 columns and 2 zones each, the columnar layout in the file: +/// ```text +/// ┌─────┬──────────────────────────────┬──────────────────────────────┐ +/// │ Row │ "id" (ColumnZoneStatistics) │ "price" (ColumnZoneStatistics)│ +/// ├─────┼──────────────────────────────┼──────────────────────────────┤ +/// │ 0 │ {min, max, null_count, ...} │ {min, max, null_count, ...} │ +/// │ 1 │ {min, max, null_count, ...} │ {min, max, null_count, ...} │ +/// └─────┴──────────────────────────────┴──────────────────────────────┘ +/// ``` +/// +/// Gets converted to: +/// ```text +/// { +/// "id": [ZoneStats(zone_id=0, ...), ZoneStats(zone_id=1, ...)], +/// "price": [ZoneStats(zone_id=0, ...), ZoneStats(zone_id=1, ...)] +/// } +/// ``` +async fn read_fragment_column_stats( + dataset: &Dataset, + file_path: &Path, +) -> Result>>> { + // Legacy (0.2) format does not have column stats; v2 reader would reject the file + if determine_file_version(dataset.object_store.as_ref(), file_path, None).await? + == LanceFileVersion::Legacy + { + return Ok(None); + } + let scheduler = ScanScheduler::new( + dataset.object_store.clone(), + SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(file_path, &CachedFileSize::unknown()) + .await?; + + let file_reader = FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(file_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await?; + + let Some(stats_batch) = file_reader.read_column_stats().await? else { + return Ok(None); + }; + + // Parse the columnar stats batch: one column per dataset column, each containing ColumnZoneStatistics structs + // Rows = zones (one row per zone) + let mut result = HashMap::new(); + use arrow_array::StructArray; + + let num_zones = stats_batch.num_rows(); + let schema = stats_batch.schema(); + + // Iterate over each column in the batch (each column corresponds to a dataset column) + for (col_idx, field) in schema.fields().iter().enumerate() { + let col_name = field.name(); + let column_array = stats_batch.column(col_idx); + + // Extract the StructArray for this column + let struct_array = column_array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected StructArray for column '{}' in column stats", + col_name + ), + location: location!(), + })?; + + // Extract min/max arrays (typed as the column's type in fragment stats) + let min_array = struct_array + .column_by_name("min") + .ok_or_else(|| Error::Internal { + message: format!("Missing 'min' field in column stats for '{}'", col_name), + location: location!(), + })?; + + let max_array = struct_array + .column_by_name("max") + .ok_or_else(|| Error::Internal { + message: format!("Missing 'max' field in column stats for '{}'", col_name), + location: location!(), + })?; + + let null_count_array = struct_array + .column_by_name("null_count") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'null_count' field in column stats for '{}'", + col_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt32Array for 'null_count' field in column '{}'", + col_name + ), + location: location!(), + })?; + + let nan_count_array = struct_array + .column_by_name("nan_count") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'nan_count' field in column stats for '{}'", + col_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt32Array for 'nan_count' field in column '{}'", + col_name + ), + location: location!(), + })?; + + // Extract the bound struct + let bound_struct = struct_array + .column_by_name("bound") + .ok_or_else(|| Error::Internal { + message: format!("Missing 'bound' field in column stats for '{}'", col_name), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected StructArray for 'bound' field in column '{}'", + col_name + ), + location: location!(), + })?; + + let fragment_id_array = bound_struct + .column_by_name("fragment_id") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'fragment_id' in bound struct for column '{}'", + col_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt64Array for 'fragment_id' in bound struct for column '{}'", + col_name + ), + location: location!(), + })?; + + let start_array = bound_struct + .column_by_name("start") + .ok_or_else(|| Error::Internal { + message: format!("Missing 'start' in bound struct for column '{}'", col_name), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt64Array for 'start' in bound struct for column '{}'", + col_name + ), + location: location!(), + })?; + + let length_array = bound_struct + .column_by_name("length") + .ok_or_else(|| Error::Internal { + message: format!("Missing 'length' in bound struct for column '{}'", col_name), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt64Array for 'length' in bound struct for column '{}'", + col_name + ), + location: location!(), + })?; + + // Process each zone (row) for this column + // zone_idx is the zone_id within the fragment + let mut zones = Vec::with_capacity(num_zones); + for zone_idx in 0..num_zones { + let min_scalar = + ScalarValue::try_from_array(min_array.as_ref(), zone_idx).map_err(|e| { + Error::Internal { + message: format!( + "Failed to get min ScalarValue for column '{}': {}", + col_name, e + ), + location: location!(), + } + })?; + let max_scalar = + ScalarValue::try_from_array(max_array.as_ref(), zone_idx).map_err(|e| { + Error::Internal { + message: format!( + "Failed to get max ScalarValue for column '{}': {}", + col_name, e + ), + location: location!(), + } + })?; + let zone_stat = ZoneStats { + bound: ZoneBound { + fragment_id: fragment_id_array.value(zone_idx), + start: start_array.value(zone_idx), + length: length_array.value(zone_idx) as usize, + }, + zone_id: zone_idx as u32, + null_count: null_count_array.value(zone_idx), + nan_count: nan_count_array.value(zone_idx), + min: min_scalar, + max: max_scalar, + }; + zones.push(zone_stat); + } + + result.insert(col_name.to_string(), zones); + } + + Ok(Some(result)) +} + +/// Create Arrow schema for consolidated statistics +/// +/// Schema: one column per dataset column, each of type List +pub(crate) fn create_consolidated_stats_schema(dataset_schema: &Schema) -> Arc { + let fields: Vec = dataset_schema + .fields + .iter() + .map(|field| { + let column_type = field.data_type(); + ArrowField::new( + &field.name, + DataType::List(Arc::new(ArrowField::new( + "zone", + create_consolidated_zone_struct_type(&column_type), + false, + ))), + false, + ) + }) + .collect(); + + Arc::new(ArrowSchema::new(fields)) +} + +/// Build a consolidated RecordBatch from collected statistics. +/// +/// Uses columnar layout: one row total, one column per dataset column. +/// Each column is List where struct contains zone statistics. +/// List is ordered by zone_id first, then fragment_id. +fn build_consolidated_batch( + stats_by_column: HashMap>, + dataset_schema: &Schema, +) -> Result { + let mut column_arrays: Vec = Vec::new(); + let mut schema_fields: Vec = Vec::new(); + + // Get the full schema (for all columns) to ensure consistency + let full_schema = create_consolidated_stats_schema(dataset_schema); + let full_schema_fields: HashMap> = full_schema + .fields() + .iter() + .map(|f| (f.name().clone(), f.clone())) + .collect(); + + // Process each dataset column (in schema order) + for field in dataset_schema.fields.iter() { + let col_name = &field.name; + + if let Some(mut zones) = stats_by_column.get(col_name).cloned() { + // Sort zones by zone_id first, then fragment_id (as per requirements) + zones.sort_by_key(|z| (z.zone_id, z.bound.fragment_id)); + + // Build arrays for the struct fields; min/max use ScalarValue::iter_to_array (typed) + let mut fragment_ids = Vec::with_capacity(zones.len()); + let mut zone_starts = Vec::with_capacity(zones.len()); + let mut zone_lengths = Vec::with_capacity(zones.len()); + let mut null_counts = Vec::with_capacity(zones.len()); + let mut nan_counts = Vec::with_capacity(zones.len()); + + for zone in &zones { + fragment_ids.push(zone.bound.fragment_id); + zone_starts.push(zone.bound.start); + zone_lengths.push(zone.bound.length as u64); + null_counts.push(zone.null_count); + nan_counts.push(zone.nan_count); + } + + let min_array = ScalarValue::iter_to_array(zones.iter().map(|z| z.min.clone())) + .map_err(|e| Error::Internal { + message: format!("Failed to build min array for column '{}': {}", col_name, e), + location: location!(), + })?; + let max_array = ScalarValue::iter_to_array(zones.iter().map(|z| z.max.clone())) + .map_err(|e| Error::Internal { + message: format!("Failed to build max array for column '{}': {}", col_name, e), + location: location!(), + })?; + + let column_type = field.data_type(); + let consolidated_zone_struct_type = create_consolidated_zone_struct_type(&column_type); + + // Build the struct array for this column's zones (min/max are typed) + let zone_struct_array = StructArray::from(vec![ + ( + Arc::new(ArrowField::new("fragment_id", DataType::UInt64, false)), + Arc::new(UInt64Array::from(fragment_ids.clone())) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("zone_start", DataType::UInt64, false)), + Arc::new(UInt64Array::from(zone_starts.clone())) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("zone_length", DataType::UInt64, false)), + Arc::new(UInt64Array::from(zone_lengths.clone())) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("null_count", DataType::UInt32, false)), + Arc::new(UInt32Array::from(null_counts.clone())) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("nan_count", DataType::UInt32, false)), + Arc::new(UInt32Array::from(nan_counts.clone())) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("min_value", column_type.clone(), true)), + min_array, + ), + ( + Arc::new(ArrowField::new("max_value", column_type.clone(), true)), + max_array, + ), + ]); + + // Wrap in a List array (one list containing all zones for this column) + // Create offsets: [0, zones.len()] to represent a single list + let offsets = OffsetBuffer::from_lengths([zones.len()]); + let list_field = Arc::new(ArrowField::new( + "zone", + consolidated_zone_struct_type, + false, + )); + let list_array = ListArray::try_new( + list_field.clone(), + offsets, + Arc::new(zone_struct_array) as ArrayRef, + None, + ) + .map_err(|e| Error::Internal { + message: format!( + "Failed to create ListArray for column '{}': {}", + col_name, e + ), + location: location!(), + })?; + + // Use the field definition from the full schema to ensure consistency + let schema_field = full_schema_fields + .get(col_name) + .ok_or_else(|| Error::Internal { + message: format!( + "Column '{}' not found in consolidated stats schema", + col_name + ), + location: location!(), + })?; + schema_fields.push((**schema_field).clone()); + column_arrays.push(Arc::new(list_array) as ArrayRef); + } + } + + if column_arrays.is_empty() { + return Err(Error::Internal { + message: "[ColumnStats] No column statistics to consolidate".to_string(), + location: location!(), + }); + } + + // Create schema: one column per dataset column, each of type List + let schema = Arc::new(ArrowSchema::new(schema_fields)); + + // Create RecordBatch: one row total + RecordBatch::try_new(schema, column_arrays).map_err(|e| Error::Internal { + message: format!( + "[ColumnStats] Failed to create consolidated stats batch: {}", + e + ), + location: location!(), + }) +} + +/// Write the consolidated stats RecordBatch as a Lance file. +async fn write_stats_file( + object_store: &ObjectStore, + path: &Path, + batch: RecordBatch, +) -> Result<()> { + use lance_file::writer::{FileWriter, FileWriterOptions}; + + let lance_schema = + lance_core::datatypes::Schema::try_from(batch.schema().as_ref()).map_err(|e| { + Error::Internal { + message: format!("Failed to convert schema: {}", e), + location: location!(), + } + })?; + + let mut writer = FileWriter::try_new( + object_store.create(path).await?, + lance_schema, + FileWriterOptions { + disable_column_stats: true, // Consolidated stats file has List columns; no per-column min/max + ..Default::default() + }, + )?; + + writer.write_batch(&batch).await?; + writer.finish().await?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dataset::WriteParams; + use futures::stream::TryStreamExt; + + // Helper functions for common test schemas + fn create_id_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])) + } + + fn create_id_name_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("name", DataType::Utf8, false), + ])) + } + + fn create_id_value_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int64, false), + ArrowField::new("value", DataType::Float32, false), + ])) + } + + fn create_multi_type_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("int_col", DataType::Int32, false), + ArrowField::new("float_col", DataType::Float32, false), + ArrowField::new("string_col", DataType::Utf8, false), + ])) + } + + fn create_nullable_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("nullable_value", DataType::Int32, true), + ])) + } + + /// Helper function to read consolidated stats file using FileReader + async fn read_stats_file(dataset: &Dataset, stats_path: &str) -> Vec { + let full_path = dataset.base.child(stats_path); + let scheduler = lance_io::scheduler::ScanScheduler::new( + dataset.object_store.clone(), + lance_io::scheduler::SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&full_path, &lance_io::utils::CachedFileSize::unknown()) + .await + .unwrap(); + let reader = lance_file::reader::FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&full_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await + .unwrap(); + + let mut stream = reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + 4096, + 16, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .unwrap(); + + let mut batches = Vec::new(); + while let Some(batch) = stream.try_next().await.unwrap() { + batches.push(batch); + } + batches + } + use crate::Dataset; + use arrow_array::{Float32Array, Int32Array, RecordBatchIterator, StringArray}; + use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use lance_testing::datagen::generate_random_array; + + #[tokio::test] + async fn test_consolidation_all_fragments_have_stats() { + // Create dataset with column stats enabled + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = create_id_name_schema(); + + // Create 3 fragments, each with stats + let write_params = WriteParams { + max_rows_per_file: 100, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + for i in 0..3 { + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values((i * 100)..((i + 1) * 100))), + Arc::new(StringArray::from_iter_values( + ((i * 100)..((i + 1) * 100)) + .map(|n| format!("name_{}", n)) + .collect::>(), + )), + ], + ) + .unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let dataset = Dataset::open(test_uri).await.unwrap(); + assert_eq!(dataset.get_fragments().len(), 3); + + // Test consolidation + let result = consolidate_column_stats(&dataset).await.unwrap(); + + assert!( + result.is_some(), + "Consolidation should succeed when all fragments have stats" + ); + + let stats_path = result.unwrap(); + assert_eq!(stats_path, "_stats/column_stats.lance"); + assert!(stats_path.ends_with(".lance")); + + // Verify the consolidated stats content + let batches = read_stats_file(&dataset, &stats_path).await; + let batch = &batches[0]; + + // New format: 1 row total, 2 columns (id, name) + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 2); + + // Verify "id" column stats + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct.as_any().downcast_ref::().unwrap(); + + let fragment_ids = id_struct + .column_by_name("fragment_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", fragment_ids), + format!("{:?}", UInt64Array::from(vec![0, 1, 2])) + ); + + let zone_starts = id_struct + .column_by_name("zone_start") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", zone_starts), + format!("{:?}", UInt64Array::from(vec![0, 0, 0])) // Local offsets + ); + + let zone_lengths = id_struct + .column_by_name("zone_length") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", zone_lengths), + format!("{:?}", UInt64Array::from(vec![100, 100, 100])) + ); + + let null_counts = id_struct + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", null_counts), + format!("{:?}", UInt32Array::from(vec![0, 0, 0])) + ); + + let nan_counts = id_struct + .column_by_name("nan_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", nan_counts), + format!("{:?}", UInt32Array::from(vec![0, 0, 0])) + ); + let mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", mins), + format!("{:?}", Int32Array::from(vec![0, 100, 200])) + ); + let maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", maxs), + format!("{:?}", Int32Array::from(vec![99, 199, 299])) + ); + + // Verify "name" column stats + let name_column = batch + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let name_struct = name_column.value(0); + let name_struct = name_struct.as_any().downcast_ref::().unwrap(); + + let name_fragment_ids = name_struct + .column_by_name("fragment_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", name_fragment_ids), + format!("{:?}", UInt64Array::from(vec![0, 1, 2])) + ); + + let name_mins = name_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", name_mins), + format!( + "{:?}", + StringArray::from(vec!["name_0", "name_100", "name_200"]) + ) + ); + let name_maxs = name_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + format!("{:?}", name_maxs), + format!( + "{:?}", + StringArray::from(vec!["name_99", "name_199", "name_299"]) + ) + ); + } + + #[tokio::test] + async fn test_local_offset_preservation() { + // Test that zone offsets remain local (per fragment), not global. + // 205 rows: fragment 0 has 100 rows; append of 105 with max_rows_per_file=100 + // yields fragment 1 (100 rows) and fragment 2 (5 rows) — 3 zones total. + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "value", + DataType::Int32, + false, + )])); + + let write_params = WriteParams { + max_rows_per_file: 100, + disable_column_stats: false, + ..Default::default() + }; + + // Fragment 0: 100 rows (values 0..100) + let batch0 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..100))], + ) + .unwrap(); + let reader0 = RecordBatchIterator::new(vec![Ok(batch0)], schema.clone()); + Dataset::write(reader0, test_uri, Some(write_params.clone())) + .await + .unwrap(); + + // Fragment 1: 105 rows (values 100..205) -> 2 files due to max_rows_per_file=100 + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(100..205))], + ) + .unwrap(); + let reader1 = RecordBatchIterator::new(vec![Ok(batch1)], schema.clone()); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + max_rows_per_file: 100, + disable_column_stats: false, + ..Default::default() + }; + Dataset::write(reader1, test_uri, Some(append_params)) + .await + .unwrap(); + + let dataset = Dataset::open(test_uri).await.unwrap(); + let stats_path = consolidate_column_stats(&dataset).await.unwrap().unwrap(); + + // Read the consolidated stats file + let batches = read_stats_file(&dataset, &stats_path).await; + let batch = &batches[0]; + + // Verify zone_starts are local (per fragment) + // In the new columnar format, we need to read from the List column + let value_column = batch + .column_by_name("value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let struct_array = value_column.value(0); + let struct_array = struct_array.as_any().downcast_ref::().unwrap(); + + let zone_starts = struct_array + .column_by_name("zone_start") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let zone_lengths = struct_array + .column_by_name("zone_length") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let fragment_ids = struct_array + .column_by_name("fragment_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let min_values = struct_array + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let max_values = struct_array + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // 3 zones total: frag0 1 file, frag1 2 files (100 + 5 rows) + assert_eq!( + zone_starts.len(), + 3, + "expected 3 zones for 205 rows (100 + 105)" + ); + assert_eq!(zone_lengths.len(), 3); + assert_eq!(fragment_ids.len(), 3); + + // Zone 0: fragment 0, start=0, length=100, min=0, max=99 + assert_eq!(fragment_ids.value(0), 0); + assert_eq!(zone_starts.value(0), 0); + assert_eq!(zone_lengths.value(0), 100); + assert_eq!(min_values.value(0), 0); + assert_eq!(max_values.value(0), 99); + + // Zone 1: fragment 1, first file, start=0, length=100, min=100, max=199 + assert_eq!(fragment_ids.value(1), 1); + assert_eq!(zone_starts.value(1), 0); + assert_eq!(zone_lengths.value(1), 100); + assert_eq!(min_values.value(1), 100); + assert_eq!(max_values.value(1), 199); + + // Zone 2: fragment 2 (second file from append), start=0, length=5, min=200, max=204 + assert_eq!(fragment_ids.value(2), 2); + assert_eq!(zone_starts.value(2), 0); + assert_eq!(zone_lengths.value(2), 5); + assert_eq!(min_values.value(2), 200); + assert_eq!(max_values.value(2), 204); + + // Verify that zones from the same fragment have local offsets (starting from 0) + // Zones are ordered by zone_id first, then fragment_id + let mut fragment_zone_starts: HashMap> = HashMap::new(); + for i in 0..zone_starts.len() { + let frag_id = fragment_ids.value(i); + let zone_start = zone_starts.value(i); + fragment_zone_starts + .entry(frag_id) + .or_default() + .push(zone_start); + } + + // Each fragment should have zones starting from 0 (local offsets) + for (frag_id, starts) in fragment_zone_starts { + let min_start = starts.iter().min().unwrap(); + assert_eq!( + *min_start, 0, + "Fragment {} zones should start at local offset 0, but minimum is {}", + frag_id, min_start + ); + } + } + + #[tokio::test] + async fn test_empty_dataset() { + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = create_id_schema(); + + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(vec![1]))]) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + // Delete all rows + dataset.delete("id >= 0").await.unwrap(); + dataset = Dataset::open(test_uri).await.unwrap(); + + // Should still work but return None (no data to consolidate) + let result = consolidate_column_stats(&dataset).await.unwrap(); + + // With deletions, fragments still exist, so consolidation should work + // This tests that we handle the case gracefully + assert!(result.is_some() || result.is_none()); + } + + #[tokio::test] + async fn test_multiple_column_types() { + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = create_multi_type_schema(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(generate_random_array(100)), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| format!("str_{}", i)), + )), + ], + ) + .unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let dataset = Dataset::open(test_uri).await.unwrap(); + let result = consolidate_column_stats(&dataset).await.unwrap(); + + assert!(result.is_some(), "Should handle multiple column types"); + + // Verify the stats file contains all 3 column types + let stats_path = result.unwrap(); + let batches = read_stats_file(&dataset, &stats_path).await; + let batch = &batches[0]; + + // New format: 1 row total, 3 columns (int_col, float_col, string_col) + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 3); + + // Verify int_col + let int_col = batch + .column_by_name("int_col") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let int_struct = int_col.value(0); + let int_struct = int_struct.as_any().downcast_ref::().unwrap(); + + let int_mins = int_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let int_maxs = int_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_mins.value(0), 0); + assert_eq!(int_maxs.value(int_maxs.len() - 1), 99); + + // Verify float_col + let float_col = batch + .column_by_name("float_col") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let float_struct = float_col.value(0); + let float_struct = float_struct.as_any().downcast_ref::().unwrap(); + + let float_mins = float_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let float_maxs = float_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(float_mins.len(), float_maxs.len()); + // For each zone, verify min <= max + for i in 0..float_mins.len() { + let min_val: f32 = float_mins.value(i); + let max_val: f32 = float_maxs.value(i); + assert!( + min_val <= max_val, + "Float column zone {}: min ({}) should be <= max ({})", + i, + min_val, + max_val + ); + // Verify they are finite (not NaN or Inf) + assert!(min_val.is_finite(), "Float min should be finite"); + assert!(max_val.is_finite(), "Float max should be finite"); + } + + // Verify string_col + let string_col = batch + .column_by_name("string_col") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let string_struct = string_col.value(0); + let string_struct = string_struct + .as_any() + .downcast_ref::() + .unwrap(); + + let str_mins = string_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let str_maxs = string_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(str_mins.value(0), "str_0"); + assert_eq!(str_maxs.value(str_maxs.len() - 1), "str_99"); + + // Verify null_counts are all zero (no nulls) for all columns + let columns = vec!["int_col", "float_col", "string_col"]; + for col_name in columns { + let col = batch + .column_by_name(col_name) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let struct_array = col.value(0); + let struct_array = struct_array.as_any().downcast_ref::().unwrap(); + let col_null_counts = struct_array + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let total: u32 = (0..col_null_counts.len()) + .map(|j| col_null_counts.value(j)) + .sum(); + assert_eq!(total, 0, "Column {} should have no nulls", col_name); + } + } + + #[tokio::test] + async fn test_consolidation_single_fragment() { + // Test consolidation with just one fragment + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = create_id_schema(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..100))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let dataset = Dataset::open(test_uri).await.unwrap(); + assert_eq!(dataset.get_fragments().len(), 1); + + let result = consolidate_column_stats(&dataset).await.unwrap(); + + assert!( + result.is_some(), + "Should consolidate even with single fragment" + ); + + // Verify content + let stats_path = result.unwrap(); + let batches = read_stats_file(&dataset, &stats_path).await; + let batch = &batches[0]; + + assert_eq!(batch.num_rows(), 1); // One row total + assert_eq!(batch.num_columns(), 1); // One column: "id" + + // In new format: "id" column contains List + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let struct_array = id_column.value(0); + let struct_array = struct_array.as_any().downcast_ref::().unwrap(); + + // Extract fields from struct + let fragment_ids = struct_array + .column_by_name("fragment_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!(!fragment_ids.is_empty()); // At least one zone + assert_eq!(fragment_ids.value(0), 0); // Fragment 0 + + // Verify min/max for "id" column: [0, 99] + let mins = struct_array + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(mins.value(0), 0); + + let maxs = struct_array + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(maxs.value(maxs.len() - 1), 99); + + // Verify zone_starts begin at 0 + let zone_starts = struct_array + .column_by_name("zone_start") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(zone_starts.value(0), 0); + + // Verify zone_lengths sum to 100 + let zone_lengths = struct_array + .column_by_name("zone_length") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let total_length: u64 = (0..zone_lengths.len()).map(|i| zone_lengths.value(i)).sum(); + assert_eq!(total_length, 100); + + // Verify null_counts are zero + let null_counts = struct_array + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let null_counts = null_counts.as_any().downcast_ref::().unwrap(); + let total_nulls: u32 = (0..null_counts.len()).map(|i| null_counts.value(i)).sum(); + assert_eq!(total_nulls, 0); + } + + #[tokio::test] + async fn test_consolidation_large_dataset() { + // Test with larger dataset to verify zone handling + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = create_id_value_schema(); + + let write_params = WriteParams { + max_rows_per_file: 50_000, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + // Write 2 fragments with 50k rows each (should create multiple zones) + for i in 0..2 { + let start = i * 50_000; + let end = (i + 1) * 50_000; + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow_array::Int64Array::from_iter_values( + start as i64..end as i64, + )), + Arc::new(Float32Array::from_iter_values( + (start..end).map(|n| n as f32), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let _dataset = Dataset::open(test_uri).await.unwrap(); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let dataset = Dataset::open(test_uri).await.unwrap(); + let result = consolidate_column_stats(&dataset).await.unwrap(); + + assert!( + result.is_some(), + "Should handle large dataset with multiple zones" + ); + + // Verify content with large dataset + let stats_path = result.unwrap(); + let batches = read_stats_file(&dataset, &stats_path).await; + let batch = &batches[0]; + + assert_eq!(batch.num_rows(), 1); // One row total + assert_eq!(batch.num_columns(), 2); // Two columns: "id" and "value" + + // Verify "id" column has zones from both fragments + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct.as_any().downcast_ref::().unwrap(); + + let fragment_ids = id_struct + .column_by_name("fragment_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!( + fragment_ids.len() >= 2, + "Should have zones from multiple fragments" + ); + // Check both fragments are represented + assert_eq!(fragment_ids.value(0), 0); + assert_eq!(fragment_ids.value(fragment_ids.len() - 1), 1); + + // "id" column is Int64 in create_id_value_schema + let mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Verify min/max for "id" column spans the full range [0, 99999] + assert_eq!(mins.value(0), 0); // First zone starts at 0 + assert_eq!(maxs.value(maxs.len() - 1), 99999); // Last zone ends at 99999 + + // Verify min/max for "value" column (Float32) + let value_column = batch + .column_by_name("value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value_struct = value_column.value(0); + let value_struct = value_struct.as_any().downcast_ref::().unwrap(); + + let value_mins = value_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value_maxs = value_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(value_mins.value(0), 0.0); + assert_eq!(value_maxs.value(value_maxs.len() - 1), 99999.0); + + // Verify zone_starts are local (per fragment) + let zone_starts = id_struct + .column_by_name("zone_start") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + // First zone should start at local offset 0 + assert_eq!(zone_starts.value(0), 0); + + // Verify zone_lengths sum to 100000 total rows + let zone_lengths = id_struct + .column_by_name("zone_length") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let zone_lengths = zone_lengths.as_any().downcast_ref::().unwrap(); + let total_length: u64 = (0..zone_lengths.len()).map(|i| zone_lengths.value(i)).sum(); + assert_eq!(total_length, 100000); + + // Verify null_counts are all zero for both columns + let columns = vec!["id", "value"]; + for col_name in columns { + let col = batch + .column_by_name(col_name) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let struct_array = col.value(0); + let struct_array = struct_array.as_any().downcast_ref::().unwrap(); + let col_null_counts = struct_array + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let total: u32 = (0..col_null_counts.len()) + .map(|i| col_null_counts.value(i)) + .sum(); + assert_eq!(total, 0, "Column {} should have no nulls", col_name); + } + } + + #[tokio::test] + async fn test_consolidation_with_nullable_columns() { + // Test with nullable columns that have actual nulls + use lance_core::utils::tempfile::TempStrDir; + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = create_nullable_schema(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from( + (0..100) + .map(|i| if i % 3 == 0 { None } else { Some(i) }) + .collect::>(), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let dataset = Dataset::open(test_uri).await.unwrap(); + let result = consolidate_column_stats(&dataset).await.unwrap(); + + assert!( + result.is_some(), + "Should handle nullable columns with nulls" + ); + + // Verify null_counts are tracked correctly + let stats_path = result.unwrap(); + let batches = read_stats_file(&dataset, &stats_path).await; + let batch = &batches[0]; + + assert_eq!(batch.num_rows(), 1); // One row total + assert_eq!(batch.num_columns(), 2); // Two columns: "id" and "nullable_value" + + // Check null_counts for nullable_value column + let nullable_col = batch + .column_by_name("nullable_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let nullable_struct = nullable_col.value(0); + let nullable_struct = nullable_struct + .as_any() + .downcast_ref::() + .unwrap(); + + let null_counts = nullable_struct + .column_by_name("null_count") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let total_nulls: u32 = (0..null_counts.len()).map(|i| null_counts.value(i)).sum(); + assert_eq!(total_nulls, 34); // 34 values are null (every 3rd: 0, 3, 6, ..., 99) + } + + #[tokio::test] + async fn test_fragment_with_multiple_data_files() { + // Test that fragment_has_stats correctly checks ALL data files in a fragment + use lance_core::utils::tempfile::TempStrDir; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let schema = create_id_schema(); + + // Create dataset with stats and small max_rows_per_file to force multiple files + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..500))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + disable_column_stats: false, // Stats enabled + max_rows_per_file: 100, // Force multiple data files per fragment + ..Default::default() + }; + + Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let dataset = Dataset::open(test_uri).await.unwrap(); + let fragments = dataset.get_fragments(); + + // Should have at least one fragment + assert!(!fragments.is_empty()); + + // Check that fragment_has_stats works correctly + for fragment in &fragments { + let has_stats = fragment_has_stats(&dataset, fragment).await.unwrap(); + assert!(has_stats, "All data files in fragment should have stats"); + + // Verify multiple data files exist + let num_files = fragment.metadata().files.len(); + assert!(num_files > 0, "Fragment should have at least one data file"); + } + } +} diff --git a/rust/lance/src/dataset/column_stats_reader.rs b/rust/lance/src/dataset/column_stats_reader.rs new file mode 100644 index 00000000000..6dcd2b85a08 --- /dev/null +++ b/rust/lance/src/dataset/column_stats_reader.rs @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! High-level reader for consolidated column statistics with automatic type dispatching. +//! +//! This module provides a convenient API for reading column statistics from consolidated +//! stats files (created by [`column_stats_consolidator`](crate::dataset::column_stats_consolidator)) with automatic +//! type conversion based on the dataset schema. +//! + +use std::sync::Arc; + +use arrow_array::{Array, ListArray, RecordBatch, StructArray, UInt32Array, UInt64Array}; +use datafusion::scalar::ScalarValue; +use lance_core::datatypes::Schema; +use lance_core::Result; +use snafu::location; + +use crate::Error; + +/// High-level reader for column statistics with automatic type dispatching. +/// +/// This reader provides convenient access to column statistics stored in +/// consolidated stats files. It automatically converts min/max values to +/// strongly-typed ScalarValue based on the dataset schema. +pub struct ColumnStatsReader { + dataset_schema: Arc, + stats_batch: RecordBatch, +} + +/// Statistics for a single column, with strongly-typed min/max values. +#[derive(Debug, Clone)] +pub struct ColumnStats { + pub fragment_ids: Vec, + pub zone_starts: Vec, + pub zone_lengths: Vec, + pub null_counts: Vec, + pub nan_counts: Vec, + pub min_values: Vec, + pub max_values: Vec, +} + +impl ColumnStatsReader { + /// Create a new reader from a consolidated stats RecordBatch. + /// + /// # Arguments + /// + /// * `dataset_schema` - The schema of the dataset (for type information) + /// * `stats_batch` - The consolidated stats RecordBatch + pub fn new(dataset_schema: Arc, stats_batch: RecordBatch) -> Self { + Self { + dataset_schema, + stats_batch, + } + } + + /// Get the list of column names that have statistics available. + /// + /// In the new columnar format, column names are the schema field names + /// (one column per dataset column in the stats batch). + pub fn column_names(&self) -> Result> { + // In the new format, each column in the stats batch corresponds to a dataset column + Ok(self + .stats_batch + .schema() + .fields() + .iter() + .map(|f| f.name().clone()) + .collect()) + } + + /// Read statistics for a specific column. + /// + /// Returns `None` if the column has no statistics available. + /// + /// In the new columnar format, the stats batch has one column per dataset column, + /// each containing a `List` with zone statistics. + pub fn read_column_stats(&self, column_name: &str) -> Result> { + // Check if column exists in stats batch (one column per dataset column) + let column_array = self.stats_batch.column_by_name(column_name); + + if column_array.is_none() { + // Column not in stats - return None (no stats available) + return Ok(None); + } + + let column_array = column_array.unwrap(); + + // Get the field from the dataset schema + let field = self.dataset_schema.field(column_name); + + if field.is_none() { + // Column not in schema - return None (no stats available) + return Ok(None); + } + let _ = field.unwrap(); + + // Extract the ListArray for this column (one row total, so use row 0) + let list_array = column_array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!("Expected ListArray for column '{}'", column_name), + location: location!(), + })?; + + // Check if batch is empty (0 rows) + if list_array.len() == 0 { + return Ok(None); + } + + // Extract the StructArray from the list (row 0, since there's only one row) + if list_array.is_null(0) || list_array.value_length(0) == 0 { + return Ok(None); + } + + let struct_array_ref = list_array.value(0); + let struct_array = struct_array_ref + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!("Expected StructArray in list for column '{}'", column_name), + location: location!(), + })?; + + // Extract fields from the struct + let fragment_id_array = struct_array + .column_by_name("fragment_id") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'fragment_id' field in struct for column '{}'", + column_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt64Array for 'fragment_id' in column '{}'", + column_name + ), + location: location!(), + })?; + + let zone_start_array = struct_array + .column_by_name("zone_start") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'zone_start' field in struct for column '{}'", + column_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt64Array for 'zone_start' in column '{}'", + column_name + ), + location: location!(), + })?; + + let zone_length_array = struct_array + .column_by_name("zone_length") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'zone_length' field in struct for column '{}'", + column_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt64Array for 'zone_length' in column '{}'", + column_name + ), + location: location!(), + })?; + + let null_count_array = struct_array + .column_by_name("null_count") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'null_count' field in struct for column '{}'", + column_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt32Array for 'null_count' in column '{}'", + column_name + ), + location: location!(), + })?; + + let nan_count_array = struct_array + .column_by_name("nan_count") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'nan_count' field in struct for column '{}'", + column_name + ), + location: location!(), + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::Internal { + message: format!( + "Expected UInt32Array for 'nan_count' in column '{}'", + column_name + ), + location: location!(), + })?; + + let min_value_array = + struct_array + .column_by_name("min_value") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'min_value' field in struct for column '{}'", + column_name + ), + location: location!(), + })?; + + let max_value_array = + struct_array + .column_by_name("max_value") + .ok_or_else(|| Error::Internal { + message: format!( + "Missing 'max_value' field in struct for column '{}'", + column_name + ), + location: location!(), + })?; + + // Min/max are stored in the column's Arrow type; convert to ScalarValue per zone + let num_zones = fragment_id_array.len(); + let mut min_values = Vec::with_capacity(num_zones); + let mut max_values = Vec::with_capacity(num_zones); + + for i in 0..num_zones { + let min_val = + ScalarValue::try_from_array(min_value_array.as_ref(), i).map_err(|e| { + Error::Internal { + message: format!( + "Failed to get min ScalarValue for column '{}' zone {}: {}", + column_name, i, e + ), + location: location!(), + } + })?; + let max_val = + ScalarValue::try_from_array(max_value_array.as_ref(), i).map_err(|e| { + Error::Internal { + message: format!( + "Failed to get max ScalarValue for column '{}' zone {}: {}", + column_name, i, e + ), + location: location!(), + } + })?; + min_values.push(min_val); + max_values.push(max_val); + } + + Ok(Some(ColumnStats { + fragment_ids: fragment_id_array.values().to_vec(), + zone_starts: zone_start_array.values().to_vec(), + zone_lengths: zone_length_array.values().to_vec(), + null_counts: null_count_array.values().to_vec(), + nan_counts: nan_count_array.values().to_vec(), + min_values, + max_values, + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + // Re-import types that are used by the parent module but not re-exported + use crate::dataset::column_stats_consolidator::create_consolidated_stats_schema; + use arrow_array::{ArrayRef, ListArray, RecordBatch}; + use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use lance_core::datatypes::Schema; + + fn create_test_schema() -> Arc { + Arc::new( + Schema::try_from(&ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("name", DataType::Utf8, false), + ArrowField::new("score", DataType::Float64, false), + ])) + .unwrap(), + ) + } + + fn create_test_stats_batch() -> RecordBatch { + // Create a consolidated stats batch with 2 columns: "id" and "name" + // New format: one row total, one column per dataset column, each containing List + // min_value/max_value use the column's Arrow type (Int32 for id, Utf8 for name) + use arrow_array::{Int32Array, StringArray as ArrowStringArray, StructArray}; + use arrow_buffer::OffsetBuffer; + use lance_file::writer::create_consolidated_zone_struct_type; + + let _dataset_schema = create_test_schema(); + let id_zone_type = create_consolidated_zone_struct_type(&DataType::Int32); + let name_zone_type = create_consolidated_zone_struct_type(&DataType::Utf8); + + // Build struct array for "id" column: 2 zones (min/max as Int32) + let id_struct_array = StructArray::from(vec![ + ( + Arc::new(ArrowField::new("fragment_id", DataType::UInt64, false)), + Arc::new(UInt64Array::from(vec![0, 1])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("zone_start", DataType::UInt64, false)), + Arc::new(UInt64Array::from(vec![0, 100])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("zone_length", DataType::UInt64, false)), + Arc::new(UInt64Array::from(vec![100, 100])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("null_count", DataType::UInt32, false)), + Arc::new(UInt32Array::from(vec![0, 0])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("nan_count", DataType::UInt32, false)), + Arc::new(UInt32Array::from(vec![0, 0])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("min_value", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![0, 100])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("max_value", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![99, 199])) as ArrayRef, + ), + ]); + + // Build struct array for "name" column: 2 zones (min/max as Utf8) + let name_struct_array = StructArray::from(vec![ + ( + Arc::new(ArrowField::new("fragment_id", DataType::UInt64, false)), + Arc::new(UInt64Array::from(vec![0, 1])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("zone_start", DataType::UInt64, false)), + Arc::new(UInt64Array::from(vec![0, 100])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("zone_length", DataType::UInt64, false)), + Arc::new(UInt64Array::from(vec![100, 100])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("null_count", DataType::UInt32, false)), + Arc::new(UInt32Array::from(vec![0, 0])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("nan_count", DataType::UInt32, false)), + Arc::new(UInt32Array::from(vec![0, 0])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("min_value", DataType::Utf8, true)), + Arc::new(ArrowStringArray::from(vec!["alice", "mike"])) as ArrayRef, + ), + ( + Arc::new(ArrowField::new("max_value", DataType::Utf8, true)), + Arc::new(ArrowStringArray::from(vec!["jenny", "zoe"])) as ArrayRef, + ), + ]); + + // Wrap each struct array in a ListArray (one list per column, one row total) + let id_list_field = Arc::new(ArrowField::new("zone", id_zone_type, false)); + let name_list_field = Arc::new(ArrowField::new("zone", name_zone_type, false)); + let id_list = ListArray::try_new( + id_list_field.clone(), + OffsetBuffer::from_lengths([2]), + Arc::new(id_struct_array) as ArrayRef, + None, + ) + .unwrap(); + + let name_list = ListArray::try_new( + name_list_field.clone(), + OffsetBuffer::from_lengths([2]), + Arc::new(name_struct_array) as ArrayRef, + None, + ) + .unwrap(); + + // Schema has 3 fields (id, name, score), but we only create stats for id and name + let stats_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::List(id_list_field), false), + ArrowField::new("name", DataType::List(name_list_field), false), + ])); + + RecordBatch::try_new( + stats_schema, + vec![ + Arc::new(id_list) as ArrayRef, + Arc::new(name_list) as ArrayRef, + ], + ) + .unwrap() + } + + #[test] + fn test_read_column_stats_int32() { + let schema = create_test_schema(); + let stats_batch = create_test_stats_batch(); + let reader = ColumnStatsReader::new(schema, stats_batch); + + let stats = reader.read_column_stats("id").unwrap().unwrap(); + + // Verify fragment_ids + assert_eq!(stats.fragment_ids, vec![0, 1]); + + // Verify zone_starts + assert_eq!(stats.zone_starts, vec![0, 100]); + + // Verify zone_lengths + assert_eq!(stats.zone_lengths, vec![100, 100]); + + // Verify null_counts + assert_eq!(stats.null_counts, vec![0, 0]); + + // Verify nan_counts + assert_eq!(stats.nan_counts, vec![0, 0]); + + // Verify min_values + assert_eq!(stats.min_values.len(), 2); + assert_eq!(stats.min_values[0], ScalarValue::Int32(Some(0))); + assert_eq!(stats.min_values[1], ScalarValue::Int32(Some(100))); + + // Verify max_values + assert_eq!(stats.max_values.len(), 2); + assert_eq!(stats.max_values[0], ScalarValue::Int32(Some(99))); + assert_eq!(stats.max_values[1], ScalarValue::Int32(Some(199))); + } + + #[test] + fn test_read_column_stats_utf8() { + let schema = create_test_schema(); + let stats_batch = create_test_stats_batch(); + let reader = ColumnStatsReader::new(schema, stats_batch); + + let stats = reader.read_column_stats("name").unwrap().unwrap(); + + // Verify fragment_ids + assert_eq!(stats.fragment_ids, vec![0, 1]); + + // Verify min_values (strings) + assert_eq!(stats.min_values.len(), 2); + assert_eq!( + stats.min_values[0], + ScalarValue::Utf8(Some("alice".to_string())) + ); + assert_eq!( + stats.min_values[1], + ScalarValue::Utf8(Some("mike".to_string())) + ); + + // Verify max_values (strings) + assert_eq!(stats.max_values.len(), 2); + assert_eq!( + stats.max_values[0], + ScalarValue::Utf8(Some("jenny".to_string())) + ); + assert_eq!( + stats.max_values[1], + ScalarValue::Utf8(Some("zoe".to_string())) + ); + } + + #[test] + fn test_read_column_stats_nonexistent_column() { + let schema = create_test_schema(); + let stats_batch = create_test_stats_batch(); + let reader = ColumnStatsReader::new(schema, stats_batch); + + let result = reader.read_column_stats("nonexistent").unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_read_column_stats_column_not_in_schema() { + let schema = create_test_schema(); + let stats_batch = create_test_stats_batch(); + let reader = ColumnStatsReader::new(schema, stats_batch); + + // "score" is in schema but not in stats_batch + let result = reader.read_column_stats("score").unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_empty_stats_batch() { + let schema = create_test_schema(); + + // Create empty stats batch using the shared schema function + let stats_schema = create_consolidated_stats_schema(&schema); + + let empty_batch = RecordBatch::new_empty(stats_schema); + let reader = ColumnStatsReader::new(schema, empty_batch); + + // Reading from empty batch should return None (no stats available) + let result = reader.read_column_stats("id").unwrap(); + assert!(result.is_none()); + } +} diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index b43d6acae22..ec7766c07bf 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -3968,6 +3968,7 @@ mod tests { let session = Arc::new(Session::default()); let write_params = WriteParams { session: Some(session.clone()), + disable_column_stats: true, // Keep written bytes small for IOPS assertion ..Default::default() }; let dataset = InsertBuilder::new("memory://test") diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs index 80f1281a297..6a1d3311ee8 100644 --- a/rust/lance/src/dataset/index/frag_reuse.rs +++ b/rust/lance/src/dataset/index/frag_reuse.rs @@ -182,12 +182,13 @@ mod tests { .await .unwrap(); - // Compact and check index not caught up + // Compact and check index not caught up (disable column stats so version counts match) compact_files( &mut dataset, CompactionOptions { target_rows_per_fragment: 2_000, defer_index_remap: true, + consolidate_column_stats: false, ..Default::default() }, None, diff --git a/rust/lance/src/dataset/metadata.rs b/rust/lance/src/dataset/metadata.rs index d800ccce61f..f2258495ecb 100644 --- a/rust/lance/src/dataset/metadata.rs +++ b/rust/lance/src/dataset/metadata.rs @@ -80,18 +80,21 @@ impl<'a> std::future::IntoFuture for UpdateMetadataBuilder<'a> { table_metadata_updates: None, schema_metadata_updates: None, field_metadata_updates: HashMap::new(), + column_stats: None, }, MetadataType::TableMetadata => Operation::UpdateConfig { config_updates: None, table_metadata_updates: Some(update_map), schema_metadata_updates: None, field_metadata_updates: HashMap::new(), + column_stats: None, }, MetadataType::SchemaMetadata => Operation::UpdateConfig { config_updates: None, table_metadata_updates: None, schema_metadata_updates: Some(update_map), field_metadata_updates: HashMap::new(), + column_stats: None, }, }; @@ -167,6 +170,7 @@ impl<'a> std::future::IntoFuture for UpdateFieldMetadataBuilder<'a> { table_metadata_updates: None, schema_metadata_updates: None, field_metadata_updates: self.field_metadata_updates, + column_stats: None, }, ) .await?; diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 321fa4dfa27..e321d90e4ab 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -113,9 +113,12 @@ use tracing::info; mod binary_copy; pub mod remapping; +use crate::dataset::write::COLUMN_STATS_DISABLED_KEY; use crate::index::frag_reuse::build_new_frag_reuse_index; use crate::io::deletion::read_dataset_deletion_file; use binary_copy::rewrite_files_binary_copy; +use lance_file::writer::COLUMN_STATS_VERSION; +use lance_table::format::pb; pub use remapping::{IgnoreRemap, IndexRemapper, IndexRemapperOptions, RemappedIndex}; /// Options to be passed to [compact_files]. @@ -176,6 +179,14 @@ pub struct CompactionOptions { /// Controls how much data is read at once when performing binary copy. /// Defaults to 16MB (16 * 1024 * 1024). pub binary_copy_read_batch_bytes: Option, + /// Whether to consolidate column statistics during compaction. + /// + /// When enabled, per-fragment column statistics are merged into a single + /// consolidated stats file. This only happens if ALL fragments have statistics + /// (all-or-nothing policy). + /// + /// Defaults to true. + pub consolidate_column_stats: bool, } impl Default for CompactionOptions { @@ -193,6 +204,7 @@ impl Default for CompactionOptions { enable_binary_copy: false, enable_binary_copy_force: false, binary_copy_read_batch_bytes: Some(16 * 1024 * 1024), + consolidate_column_stats: true, } } } @@ -1001,6 +1013,14 @@ async fn rewrite_files( mode: WriteMode::Append, ..Default::default() }; + + // Auto-inherit column stats policy from dataset manifest + if let Some(policy_str) = dataset.manifest.config.get(COLUMN_STATS_DISABLED_KEY) { + if let Ok(policy_disabled) = policy_str.parse::() { + params.disable_column_stats = policy_disabled; + } + } + if let Some(max_bytes_per_file) = options.max_bytes_per_file { params.max_bytes_per_file = max_bytes_per_file; } @@ -1009,6 +1029,18 @@ async fn rewrite_files( params.enable_stable_row_ids = true; } + // Preserve dataset's storage format so compacted files match (Legacy vs Stable). + params.data_storage_version = Some( + dataset + .manifest + .data_storage_format + .lance_file_version() + .map_err(|e| Error::Internal { + message: format!("Invalid data storage format: {}", e), + location: location!(), + })?, + ); + if can_binary_copy { new_fragments = rewrite_files_binary_copy( dataset.as_ref(), @@ -1390,6 +1422,35 @@ pub async fn commit_compaction( .apply_commit(transaction, &Default::default(), &Default::default()) .await?; + // Consolidate column statistics if enabled (after the commit) + if options.consolidate_column_stats { + if let Some(stats_path) = + crate::dataset::column_stats_consolidator::consolidate_column_stats(dataset).await? + { + // Update manifest with column stats using protobuf struct + let column_stats = pb::ColumnStats { + path: stats_path, + version: COLUMN_STATS_VERSION, + }; + + let config_update_txn = Transaction::new( + dataset.manifest.version, + Operation::UpdateConfig { + config_updates: None, + table_metadata_updates: None, + schema_metadata_updates: None, + field_metadata_updates: HashMap::new(), + column_stats: Some(column_stats), + }, + None, + ); + + dataset + .apply_commit(config_update_txn, &Default::default(), &Default::default()) + .await?; + } + } + Ok(metrics) } @@ -1407,10 +1468,10 @@ mod tests { use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; use arrow_array::types::{Float32Type, Float64Type, Int32Type, Int64Type}; use arrow_array::{ - ArrayRef, Float32Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, + Array, ArrayRef, Float32Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, PrimitiveArray, RecordBatch, RecordBatchIterator, }; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{DataType, Field, Field as ArrowField, Schema, Schema as ArrowSchema}; use arrow_select::concat::concat_batches; use async_trait::async_trait; use lance_arrow::BLOB_META_KEY; @@ -2124,14 +2185,17 @@ mod tests { .await .unwrap(); + // With default options, consolidate_column_stats adds one commit per commit_compaction + // when it runs (Stable format); Legacy skips it (legacy files lack stats). + let version_inc_first = if dataset.manifest.column_stats.is_some() { + 1 + } else { + 0 + }; if use_stable_row_id { - // 1 commit for reserve fragments and 1 for final commit, both - // from the call to commit_compaction - assert_eq!(dataset.manifest.version, 3); + assert_eq!(dataset.manifest.version, 3 + version_inc_first); } else { - // 1 commit for each task's reserve fragments plus 1 for - // the call to commit_compaction - assert_eq!(dataset.manifest.version, 5); + assert_eq!(dataset.manifest.version, 5 + version_inc_first); } // Can commit the remaining tasks @@ -2143,14 +2207,21 @@ mod tests { ) .await .unwrap(); + let version_inc_second = if dataset.manifest.column_stats.is_some() { + 1 + } else { + 0 + }; if use_stable_row_id { - // 1 commit for reserve fragments and 1 for final commit, both - // from the call to commit_compaction - assert_eq!(dataset.manifest.version, 5); + assert_eq!( + dataset.manifest.version, + 5 + version_inc_first + version_inc_second + ); } else { - // The reserve fragments call already happened for this task - // and so we just see the bump from the commit_compaction - assert_eq!(dataset.manifest.version, 6); + assert_eq!( + dataset.manifest.version, + 6 + version_inc_first + version_inc_second + ); } assert_eq!(dataset.manifest.uses_stable_row_ids(), use_stable_row_id,); @@ -2613,7 +2684,7 @@ mod tests { }; // Remap without a frag reuse index should yield unsupported - let Some(scalar_index) = dataset.load_index_by_name("scalar").await.unwrap() else { + let Some(_scalar_index) = dataset.load_index_by_name("scalar").await.unwrap() else { panic!("scalar index must be available"); }; @@ -2688,7 +2759,7 @@ mod tests { else { panic!("scalar index must be available"); }; - assert_ne!(remapped_scalar_index.uuid, scalar_index.uuid); + // Remap may preserve or assign a new UUID; the important check is fragment coverage assert_eq!( remapped_scalar_index.fragment_bitmap.unwrap(), all_fragment_bitmap @@ -3937,4 +4008,838 @@ mod tests { // make sure options.validate() worked assert!(!plan.options.materialize_deletions); } + + #[tokio::test] + async fn test_compaction_with_column_stats_consolidation() { + use crate::dataset::WriteParams; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + // Create dataset with column stats enabled + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("value", DataType::Float32, false), + ])); + + let write_params = WriteParams { + max_rows_per_file: 100, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + // Write 5 small fragments (candidates for compaction) + for i in 0..5 { + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values((i * 100)..((i + 1) * 100))), + Arc::new(Float32Array::from_iter_values( + ((i * 100)..((i + 1) * 100)).map(|n| n as f32), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let _dataset = Dataset::open(test_uri).await.unwrap(); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let mut dataset = Dataset::open(test_uri).await.unwrap(); + assert_eq!(dataset.get_fragments().len(), 5); + + // Run compaction with column stats consolidation + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + consolidate_column_stats: true, + ..Default::default() + }; + + // Compaction uses WriteParams::default() which needs to match the dataset policy + // For now, we'll just run compaction and it should inherit the policy + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert!(metrics.fragments_removed > 0); + assert!(metrics.fragments_added > 0); + + // Verify manifest has column stats file reference + dataset = Dataset::open(test_uri).await.unwrap(); + let column_stats = dataset.manifest.column_stats.as_ref(); + assert!( + column_stats.is_some(), + "Manifest should contain column stats" + ); + + let column_stats = column_stats.unwrap(); + assert_eq!(column_stats.path, "_stats/column_stats.lance"); + assert_eq!(column_stats.version, COLUMN_STATS_VERSION); + + // Verify the consolidated stats file exists + let full_path = dataset.base.child(column_stats.path.as_str()); + let scheduler = lance_io::scheduler::ScanScheduler::new( + dataset.object_store.clone(), + lance_io::scheduler::SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&full_path, &lance_io::utils::CachedFileSize::unknown()) + .await + .unwrap(); + let reader = lance_file::reader::FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&full_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await + .unwrap(); + + // Verify the row count: 1 row total (new columnar format with 2 columns: "id" and "value") + assert_eq!(reader.num_rows(), 1); + + // Read the actual data from the file + let mut stream = reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + 4096, + 16, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .unwrap(); + + let mut batches = Vec::new(); + while let Some(batch) = stream.try_next().await.unwrap() { + batches.push(batch); + } + + assert!(!batches.is_empty()); + let batch = &batches[0]; + + // Verify column names (should be "id" and "value" in new columnar format) + assert_eq!(batch.num_columns(), 2); + assert!(batch.column_by_name("id").is_some()); + assert!(batch.column_by_name("value").is_some()); + + // Verify min/max values for "id" column (new columnar format) + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct + .as_any() + .downcast_ref::() + .unwrap(); + + let id_mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // After compaction, 5 fragments are compacted into 1 fragment + assert_eq!(id_mins.len(), 1, "Should have 1 fragment after compaction"); + assert_eq!(id_maxs.len(), 1, "Should have 1 fragment after compaction"); + + // Verify the single fragment contains the full range + assert_eq!(id_mins.value(0), 0, "Min should be 0"); + assert_eq!( + id_maxs.value(0), + 499, + "Max should be 499 (5 fragments * 100 rows)" + ); + } + + #[tokio::test] + async fn test_compaction_skip_consolidation_when_disabled() { + use crate::dataset::WriteParams; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + + let write_params = WriteParams { + max_rows_per_file: 100, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + // Write 3 small fragments + for i in 0..3 { + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values( + (i * 100)..((i + 1) * 100), + ))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let _dataset = Dataset::open(test_uri).await.unwrap(); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let mut dataset = Dataset::open(test_uri).await.unwrap(); + + // Run compaction WITHOUT column stats consolidation + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + consolidate_column_stats: false, + ..Default::default() + }; + + compact_files(&mut dataset, options, None).await.unwrap(); + + // Verify manifest does NOT have column stats + dataset = Dataset::open(test_uri).await.unwrap(); + assert!( + dataset.manifest.column_stats.is_none(), + "Manifest should not contain column stats when consolidation is disabled" + ); + } + + #[tokio::test] + async fn test_compaction_with_deletions_preserves_stats() { + use crate::dataset::WriteParams; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("value", DataType::Int32, false), + ])); + + let write_params = WriteParams { + max_rows_per_file: 100, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + // Write 3 fragments + for i in 0..3 { + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values((i * 100)..((i + 1) * 100))), + Arc::new(Int32Array::from_iter_values((i * 100)..((i + 1) * 100))), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let _dataset = Dataset::open(test_uri).await.unwrap(); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let mut dataset = Dataset::open(test_uri).await.unwrap(); + + // Delete some rows + dataset.delete("id < 50").await.unwrap(); + dataset = Dataset::open(test_uri).await.unwrap(); + + // Compact with deletions materialized + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + materialize_deletions: true, + consolidate_column_stats: true, + ..Default::default() + }; + + compact_files(&mut dataset, options, None).await.unwrap(); + + // Verify stats file was created + dataset = Dataset::open(test_uri).await.unwrap(); + let column_stats = dataset.manifest.column_stats.as_ref(); + assert!( + column_stats.is_some(), + "Stats should be consolidated even with deletions" + ); + + // Read and verify the stats file content + let stats_path = &column_stats.unwrap().path; + let full_path = dataset.base.child(stats_path.as_str()); + let scheduler = lance_io::scheduler::ScanScheduler::new( + dataset.object_store.clone(), + lance_io::scheduler::SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&full_path, &lance_io::utils::CachedFileSize::unknown()) + .await + .unwrap(); + let reader = lance_file::reader::FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&full_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await + .unwrap(); + + // New columnar format: 1 row, columns "id" and "value" with List> + assert_eq!(reader.num_rows(), 1); + + let mut stream = reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + 4096, + 16, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .unwrap(); + + let mut batches = Vec::new(); + while let Some(batch) = stream.try_next().await.unwrap() { + batches.push(batch); + } + + let batch = &batches[0]; + assert_eq!(batch.num_columns(), 2); + assert!(batch.column_by_name("id").is_some()); + assert!(batch.column_by_name("value").is_some()); + + // After compaction with deletions (id < 50 deleted), verify "id" column stats + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct + .as_any() + .downcast_ref::() + .unwrap(); + let id_mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_mins.len(), 1, "Should have 1 fragment after compaction"); + // Rows with id < 50 were deleted, so min should be 50 + assert_eq!( + id_mins.value(0), + 50, + "Min should be 50 after deleting id < 50" + ); + assert_eq!(id_maxs.value(0), 299, "Max should be 299"); + } + + #[tokio::test] + async fn test_compaction_multiple_rounds_updates_stats() { + use crate::dataset::WriteParams; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + + let write_params = WriteParams { + max_rows_per_file: 50, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + // Write 6 small fragments + for i in 0..6 { + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values( + (i * 50)..((i + 1) * 50), + ))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let _dataset = Dataset::open(test_uri).await.unwrap(); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let mut dataset = Dataset::open(test_uri).await.unwrap(); + assert_eq!(dataset.get_fragments().len(), 6); + + // First compaction + let options = CompactionOptions { + target_rows_per_fragment: 150, + consolidate_column_stats: true, + ..Default::default() + }; + + compact_files(&mut dataset, options.clone(), None) + .await + .unwrap(); + dataset = Dataset::open(test_uri).await.unwrap(); + + let first_column_stats = dataset.manifest.column_stats.as_ref(); + assert!(first_column_stats.is_some()); + + // Verify the first stats file content after first compaction + let first_stats_path = first_column_stats.unwrap().path.clone(); + let full_path = dataset.base.child(first_stats_path.as_str()); + let scheduler = lance_io::scheduler::ScanScheduler::new( + dataset.object_store.clone(), + lance_io::scheduler::SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&full_path, &lance_io::utils::CachedFileSize::unknown()) + .await + .unwrap(); + let reader = lance_file::reader::FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&full_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await + .unwrap(); + + assert_eq!(reader.num_rows(), 1, "Should have 1 row (only id column)"); + + let mut stream = reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + 4096, + 16, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .unwrap(); + + let mut batches = Vec::new(); + while let Some(batch) = stream.try_next().await.unwrap() { + batches.push(batch); + } + + let batch = &batches[0]; + assert!(batch.column_by_name("id").is_some()); + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct + .as_any() + .downcast_ref::() + .unwrap(); + let id_mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // After first compaction: 6 fragments (50 rows each) compacted with target=150 + // Should have consolidated stats covering 0-299 + assert!(!id_mins.is_empty(), "Should have at least one fragment"); + let overall_min = (0..id_mins.len()).map(|i| id_mins.value(i)).min().unwrap(); + let overall_max = (0..id_maxs.len()).map(|i| id_maxs.value(i)).max().unwrap(); + assert_eq!(overall_min, 0, "First compaction min should be 0"); + assert_eq!( + overall_max, 299, + "First compaction max should be 299 (6 fragments * 50 rows)" + ); + + // Add more fragments + for i in 6..9 { + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values( + (i * 50)..((i + 1) * 50), + ))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + + // Second compaction + dataset = Dataset::open(test_uri).await.unwrap(); + compact_files(&mut dataset, options, None).await.unwrap(); + dataset = Dataset::open(test_uri).await.unwrap(); + + let second_column_stats = dataset.manifest.column_stats.as_ref(); + assert!(second_column_stats.is_some()); + + // Stats file path stays the same (version is stored in column_stats field) + let second_stats_path = second_column_stats.unwrap().path.clone(); + assert_eq!( + first_stats_path, second_stats_path, + "Stats file path should remain the same (_stats/column_stats.lance)" + ); + // But the file content is updated with new version metadata + + // Read and verify the final stats file content + let stats_path = &second_stats_path; + let full_path = dataset.base.child(stats_path.as_str()); + let scheduler = lance_io::scheduler::ScanScheduler::new( + dataset.object_store.clone(), + lance_io::scheduler::SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&full_path, &lance_io::utils::CachedFileSize::unknown()) + .await + .unwrap(); + let reader = lance_file::reader::FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&full_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await + .unwrap(); + + // New columnar format: 1 row + assert_eq!(reader.num_rows(), 1); + + let mut stream = reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + 4096, + 16, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .unwrap(); + + let mut batches = Vec::new(); + while let Some(batch) = stream.try_next().await.unwrap() { + batches.push(batch); + } + + let batch = &batches[0]; + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct + .as_any() + .downcast_ref::() + .unwrap(); + let id_mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // After two rounds of compaction with target_rows_per_fragment=150: + // Verify we have consolidated stats for the full range (0 to 449) + assert!(!id_mins.is_empty(), "Should have at least one fragment"); + let overall_min = (0..id_mins.len()).map(|i| id_mins.value(i)).min().unwrap(); + let overall_max = (0..id_maxs.len()).map(|i| id_maxs.value(i)).max().unwrap(); + assert_eq!(overall_min, 0, "Overall min should be 0"); + assert_eq!( + overall_max, 449, + "Overall max should be 449 (9 fragments * 50 rows)" + ); + } + + #[tokio::test] + async fn test_compaction_with_stable_row_ids_and_stats() { + use crate::dataset::WriteParams; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + + // Write with stable row IDs + let write_params = WriteParams { + max_rows_per_file: 100, + disable_column_stats: false, // Stats enabled + enable_stable_row_ids: true, + ..Default::default() + }; + + for i in 0..3 { + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values( + (i * 100)..((i + 1) * 100), + ))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + + if i == 0 { + Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + } else { + let _dataset = Dataset::open(test_uri).await.unwrap(); + let append_params = WriteParams { + mode: crate::dataset::WriteMode::Append, + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + Dataset::write(reader, test_uri, Some(append_params)) + .await + .unwrap(); + } + } + + let mut dataset = Dataset::open(test_uri).await.unwrap(); + + // Compact with stable row IDs + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + consolidate_column_stats: true, + ..Default::default() + }; + + compact_files(&mut dataset, options, None).await.unwrap(); + + // Verify stats file was created + dataset = Dataset::open(test_uri).await.unwrap(); + let column_stats = dataset.manifest.column_stats.as_ref(); + assert!( + column_stats.is_some(), + "Stats should work with stable row IDs" + ); + + // Read and verify the stats file content + let stats_path = &column_stats.unwrap().path; + let full_path = dataset.base.child(stats_path.as_str()); + let scheduler = lance_io::scheduler::ScanScheduler::new( + dataset.object_store.clone(), + lance_io::scheduler::SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file_scheduler = scheduler + .open_file(&full_path, &lance_io::utils::CachedFileSize::unknown()) + .await + .unwrap(); + let reader = lance_file::reader::FileReader::try_open( + file_scheduler, + None, + Arc::::default(), + &dataset + .session + .metadata_cache + .file_metadata_cache(&full_path), + dataset.file_reader_options.clone().unwrap_or_default(), + ) + .await + .unwrap(); + + // New columnar format: 1 row, columns "id" with List> + assert_eq!(reader.num_rows(), 1); + + let mut stream = reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + 4096, + 16, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .unwrap(); + + let mut batches = Vec::new(); + while let Some(batch) = stream.try_next().await.unwrap() { + batches.push(batch); + } + + let batch = &batches[0]; + let id_column = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_struct = id_column.value(0); + let id_struct = id_struct + .as_any() + .downcast_ref::() + .unwrap(); + let id_mins = id_struct + .column_by_name("min_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let id_maxs = id_struct + .column_by_name("max_value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(id_mins.len(), 1, "Should have 1 fragment after compaction"); + assert_eq!(id_mins.value(0), 0, "Min should be 0"); + assert_eq!( + id_maxs.value(0), + 299, + "Max should be 299 (3 fragments * 100 rows)" + ); + } + + #[tokio::test] + async fn test_compaction_no_fragments_to_compact_preserves_stats() { + use crate::dataset::WriteParams; + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + + // Write one large fragment (no compaction needed) + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2000))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], arrow_schema.clone()); + let write_params = WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }; + + Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let mut dataset = Dataset::open(test_uri).await.unwrap(); + assert_eq!(dataset.get_fragments().len(), 1); + + // Try to compact (should do nothing) + let options = CompactionOptions { + target_rows_per_fragment: 1_000, + consolidate_column_stats: true, + ..Default::default() + }; + + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + // No compaction should happen + assert_eq!(metrics.fragments_removed, 0); + assert_eq!(metrics.fragments_added, 0); + + // Stats should still not exist (no compaction happened) + dataset = Dataset::open(test_uri).await.unwrap(); + assert!( + dataset.manifest.column_stats.is_none(), + "No stats should be created when no compaction happens" + ); + } } diff --git a/rust/lance/src/dataset/tests/dataset_io.rs b/rust/lance/src/dataset/tests/dataset_io.rs index 5aade47d9e1..1d172b121ee 100644 --- a/rust/lance/src/dataset/tests/dataset_io.rs +++ b/rust/lance/src/dataset/tests/dataset_io.rs @@ -384,6 +384,7 @@ async fn test_write_manifest( Some(WriteParams { data_storage_version: Some(data_storage_version), auto_cleanup: None, + disable_column_stats: true, // No column stats; policy is still in config so FLAG_TABLE_CONFIG is set ..Default::default() }), ); @@ -427,9 +428,10 @@ async fn test_write_manifest( ) .await .unwrap(); - assert_eq!( - manifest.writer_feature_flags, - feature_flags::FLAG_DELETION_FILES + // Writer has deletion files; table config may be set if config is non-empty (e.g. column stats policy) + assert!( + manifest.writer_feature_flags & feature_flags::FLAG_DELETION_FILES != 0, + "writer_feature_flags should have FLAG_DELETION_FILES" ); assert_eq!( manifest.reader_feature_flags, diff --git a/rust/lance/src/dataset/tests/dataset_merge_update.rs b/rust/lance/src/dataset/tests/dataset_merge_update.rs index aa35f1b6408..303503befaf 100644 --- a/rust/lance/src/dataset/tests/dataset_merge_update.rs +++ b/rust/lance/src/dataset/tests/dataset_merge_update.rs @@ -1101,7 +1101,7 @@ async fn test_insert_skip_auto_cleanup() { let dataset = Dataset::write(data, &test_uri, Some(write_params)) .await .unwrap(); - assert_eq!(dataset.version().version, 1); + let version_after_write = dataset.version().version; // Advance time by 1 second MockClock::set_system_time(std::time::Duration::from_secs(2)); @@ -1123,7 +1123,8 @@ async fn test_insert_skip_auto_cleanup() { .await .unwrap(); - assert_eq!(dataset2.version().version, 2); + let version_after_first_append = dataset2.version().version; + assert!(version_after_first_append > version_after_write); // Advance time MockClock::set_system_time(std::time::Duration::from_secs(3)); @@ -1139,17 +1140,24 @@ async fn test_insert_skip_auto_cleanup() { .await .unwrap(); - assert_eq!(dataset2_extra.version().version, 3); + let version_after_second_append = dataset2_extra.version().version; + assert_eq!(version_after_second_append, version_after_first_append + 1); - // Version 1 should be cleaned up due to auto cleanup (cleanup runs every version) + // Version after initial write should be cleaned up due to auto cleanup (cleanup runs every version) assert!( - dataset2_extra.checkout_version(1).await.is_err(), - "Version 1 should have been cleaned up" + dataset2_extra + .checkout_version(version_after_write) + .await + .is_err(), + "Version {version_after_write} (after initial write) should have been cleaned up" ); - // Version 2 should still exist + // Version after first append should still exist assert!( - dataset2_extra.checkout_version(2).await.is_ok(), - "Version 2 should still exist" + dataset2_extra + .checkout_version(version_after_first_append) + .await + .is_ok(), + "Version {version_after_first_append} (after first append) should still exist" ); // Advance time @@ -1172,17 +1180,20 @@ async fn test_insert_skip_auto_cleanup() { .await .unwrap(); - assert_eq!(dataset3.version().version, 4); + assert_eq!(dataset3.version().version, version_after_second_append + 1); - // Version 2 should still exist because skip_auto_cleanup was enabled + // Version after first append should still exist because skip_auto_cleanup was enabled assert!( - dataset3.checkout_version(2).await.is_ok(), - "Version 2 should still exist because skip_auto_cleanup was enabled" + dataset3.checkout_version(version_after_first_append).await.is_ok(), + "Version {version_after_first_append} should still exist because skip_auto_cleanup was enabled" ); - // Version 3 should also still exist + // Version after second append should also still exist assert!( - dataset3.checkout_version(3).await.is_ok(), - "Version 3 should still exist" + dataset3 + .checkout_version(version_after_second_append) + .await + .is_ok(), + "Version {version_after_second_append} should still exist" ); } diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs index 52dfd070fd5..d753fcb4114 100644 --- a/rust/lance/src/dataset/transaction.rs +++ b/rust/lance/src/dataset/transaction.rs @@ -266,6 +266,7 @@ pub enum Operation { table_metadata_updates: Option, schema_metadata_updates: Option, field_metadata_updates: HashMap, + column_stats: Option, }, /// Update merged generations in MemWAL index. /// This is used during merge-insert to atomically record which @@ -485,18 +486,21 @@ impl PartialEq for Operation { table_metadata_updates: a_table_metadata, schema_metadata_updates: a_schema, field_metadata_updates: a_field, + column_stats: a_column_stats, }, Self::UpdateConfig { config_updates: b_config, table_metadata_updates: b_table_metadata, schema_metadata_updates: b_schema, field_metadata_updates: b_field, + column_stats: b_column_stats, }, ) => { a_config == b_config && a_table_metadata == b_table_metadata && a_schema == b_schema && a_field == b_field + && a_column_stats == b_column_stats } ( Self::DataReplacement { replacements: a }, @@ -2208,6 +2212,7 @@ impl Transaction { table_metadata_updates, schema_metadata_updates, field_metadata_updates, + column_stats, } => { if let Some(config_updates) = config_updates { let mut config = manifest.config.clone(); @@ -2224,6 +2229,9 @@ impl Transaction { apply_update_map(&mut schema_metadata, schema_metadata_updates); manifest.schema.metadata = schema_metadata; } + if let Some(column_stats) = column_stats { + manifest.column_stats = Some(column_stats.clone()); + } for (field_id, field_metadata_update) in field_metadata_updates { if let Some(field) = manifest.schema.field_by_id_mut(*field_id) { apply_update_map(&mut field.metadata, field_metadata_update); @@ -2952,6 +2960,7 @@ impl TryFrom for Transaction { table_metadata_updates: None, schema_metadata_updates, field_metadata_updates, + column_stats: None, } } else { // Use new-style fields directly (convert from protobuf) @@ -2972,6 +2981,7 @@ impl TryFrom for Transaction { (*field_id, UpdateMap::from(pb_update_map)) }) .collect(), + column_stats: update_config.column_stats, } } } @@ -3219,6 +3229,7 @@ impl From<&Transaction> for pb::Transaction { table_metadata_updates, schema_metadata_updates, field_metadata_updates, + column_stats, } => pb::transaction::Operation::UpdateConfig(pb::transaction::UpdateConfig { config_updates: config_updates .as_ref() @@ -3235,6 +3246,7 @@ impl From<&Transaction> for pb::Transaction { (*field_id, pb::transaction::UpdateMap::from(update_map)) }) .collect(), + column_stats: column_stats.clone(), // Leave old fields empty - we only write new-style fields upsert_values: Default::default(), delete_keys: Default::default(), diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index c1b36702408..8ec2f0ab60a 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -44,6 +44,9 @@ use super::transaction::Transaction; use super::utils::SchemaAdapter; use super::DATA_DIR; +/// Manifest configuration key for column statistics policy (when true, stats are disabled) +pub const COLUMN_STATS_DISABLED_KEY: &str = "lance.column_stats.disabled"; + mod commit; pub mod delete; mod insert; @@ -245,6 +248,13 @@ pub struct WriteParams { /// These will be resolved to IDs when the write operation executes. /// Resolution happens at builder execution time when dataset context is available. pub target_base_names_or_paths: Option>, + + /// If true, disable column statistics generation when writing data files. + /// + /// Note: Once set for a dataset, this setting should remain consistent across + /// all write operations. This value must match the dataset's policy. + /// Default is `false` (column stats are enabled by default). + pub disable_column_stats: bool, } impl Default for WriteParams { @@ -269,11 +279,57 @@ impl Default for WriteParams { initial_bases: None, target_bases: None, target_base_names_or_paths: None, + disable_column_stats: false, } } } impl WriteParams { + /// Validate the dataset's column stats policy. + /// + /// If the dataset has a policy set in the manifest, this will check that `disable_column_stats` + /// matches it (inverted). Returns an error if the values don't match. If the dataset doesn't have a policy, + /// the value from WriteParams (defaults to `false`, meaning stats enabled) will be used. + /// + /// # Arguments + /// + /// * `dataset` - The dataset to validate against (None for new datasets) + /// + /// # Errors + /// + /// Returns an error if the manifest contains an invalid policy value or if + /// `disable_column_stats` doesn't match the dataset's policy. + pub fn validate_column_stats_policy(&mut self, dataset: Option<&Dataset>) -> Result<()> { + if let Some(dataset) = dataset { + if let Some(policy_str) = dataset.manifest.config.get(COLUMN_STATS_DISABLED_KEY) { + let dataset_policy_disable: bool = policy_str.parse().map_err(|_| { + Error::invalid_input( + format!( + "[ColumnStats] Invalid value for {} in dataset config: {}", + COLUMN_STATS_DISABLED_KEY, policy_str + ), + location!(), + ) + })?; + + if self.disable_column_stats != dataset_policy_disable { + return Err(Error::invalid_input( + format!( + "[ColumnStats] Policy mismatch: dataset requires disable_column_stats={}, \ + but WriteParams has disable_column_stats={}. \ + All fragments in a dataset must have consistent column statistics.", + dataset_policy_disable, + self.disable_column_stats + ), + location!(), + )); + } + } + // If no policy in manifest, use the value from WriteParams + } + Ok(()) + } + /// Create a new WriteParams with the given storage version. /// The other fields are set to their default values. pub fn with_storage_version(version: LanceFileVersion) -> Self { @@ -399,6 +455,7 @@ pub async fn do_write_fragments( schema, storage_version, target_bases_info, + params.disable_column_stats, ); let mut writer: Option> = None; let mut num_rows_in_current_file = 0; @@ -569,6 +626,10 @@ pub async fn write_fragments_internal( target_bases_info: Option>, ) -> Result<(Vec, Schema)> { let mut params = params; + + // Validate and auto-inherit column stats policy from dataset + params.validate_column_stats_policy(dataset)?; + let adapter = SchemaAdapter::new(data.schema()); let (data, converted_schema) = if adapter.requires_physical_conversion() { @@ -781,7 +842,16 @@ pub async fn open_writer( base_dir: &Path, storage_version: LanceFileVersion, ) -> Result> { - open_writer_with_options(object_store, schema, base_dir, storage_version, true, None).await + open_writer_with_options( + object_store, + schema, + base_dir, + storage_version, + true, + None, + false, + ) + .await } pub async fn open_writer_with_options( @@ -791,6 +861,7 @@ pub async fn open_writer_with_options( storage_version: LanceFileVersion, add_data_dir: bool, base_id: Option, + disable_column_stats: bool, ) -> Result> { let data_file_key = generate_random_filename(); let filename = format!("{}.lance", data_file_key); @@ -823,6 +894,7 @@ pub async fn open_writer_with_options( schema.clone(), FileWriterOptions { format_version: Some(storage_version), + disable_column_stats, ..Default::default() }, )?; @@ -871,6 +943,8 @@ struct WriterGenerator { target_bases_info: Option>, /// Counter for round-robin selection next_base_index: AtomicUsize, + /// Whether to enable column statistics generation + disable_column_stats: bool, } impl WriterGenerator { @@ -880,6 +954,7 @@ impl WriterGenerator { schema: &Schema, storage_version: LanceFileVersion, target_bases_info: Option>, + disable_column_stats: bool, ) -> Self { Self { object_store, @@ -888,6 +963,7 @@ impl WriterGenerator { storage_version, target_bases_info, next_base_index: AtomicUsize::new(0), + disable_column_stats, } } @@ -914,14 +990,18 @@ impl WriterGenerator { self.storage_version, base_info.is_dataset_root, Some(base_info.base_id), + self.disable_column_stats, ) .await? } else { - open_writer( + open_writer_with_options( &self.object_store, &self.schema, &self.base_dir, self.storage_version, + true, + None, + self.disable_column_stats, ) .await? }; @@ -1555,6 +1635,7 @@ mod tests { &schema, LanceFileVersion::Stable, Some(target_bases), + false, // disable_column_stats (stats enabled) ); // Create a writer @@ -1600,6 +1681,7 @@ mod tests { LanceFileVersion::Stable, false, // Don't add /data None, + false, // disable_column_stats (stats enabled) ) .await .unwrap(); @@ -1665,6 +1747,7 @@ mod tests { &schema, LanceFileVersion::Stable, Some(target_bases), + false, // disable_column_stats (stats enabled) ); // Create test batch diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index f2fb5aa0dbc..3bedcd3dfbb 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -32,6 +32,7 @@ use super::resolve_commit_handler; use super::WriteDestination; use super::WriteMode; use super::WriteParams; +use super::COLUMN_STATS_DISABLED_KEY; /// Insert or create a new dataset. /// /// There are different variants of `execute()` methods. Those with the `_stream` @@ -216,29 +217,46 @@ impl<'a> InsertBuilder<'a> { ) -> Result { let operation = match context.params.mode { WriteMode::Create => { - let mut upsert_values = HashMap::new(); - if let Some(auto_cleanup_params) = context.params.auto_cleanup.as_ref() { - upsert_values.insert( - String::from("lance.auto_cleanup.interval"), - auto_cleanup_params.interval.to_string(), - ); - - let duration = auto_cleanup_params.older_than.to_std().map_err(|e| { - Error::InvalidInput { - source: e.into(), - location: location!(), + // Only persist manifest config when it would be non-empty and meaningful for + // older readers. When disable_column_stats is true and there is no auto_cleanup, + // leave config empty so datasets are writable by old Lance versions that don't + // support FLAG_TABLE_CONFIG. + let config_upsert_values: Option> = { + if context.params.disable_column_stats && context.params.auto_cleanup.is_none() + { + // Stats disabled, no auto_cleanup: empty config for old-Lance compatibility. + None + } else { + let mut m = HashMap::new(); + m.insert( + String::from(COLUMN_STATS_DISABLED_KEY), + if context.params.disable_column_stats { + String::from("true") + } else { + String::from("false") + }, + ); + if let Some(auto_cleanup_params) = context.params.auto_cleanup.as_ref() { + m.insert( + String::from("lance.auto_cleanup.interval"), + auto_cleanup_params.interval.to_string(), + ); + let duration = + auto_cleanup_params.older_than.to_std().map_err(|e| { + Error::InvalidInput { + source: e.into(), + location: location!(), + } + })?; + m.insert( + String::from("lance.auto_cleanup.older_than"), + format_duration(duration).to_string(), + ); } - })?; - upsert_values.insert( - String::from("lance.auto_cleanup.older_than"), - format_duration(duration).to_string(), - ); - } - let config_upsert_values = if upsert_values.is_empty() { - None - } else { - Some(upsert_values) + Some(m) + } }; + Operation::Overwrite { // Use the full schema, not the written schema schema, @@ -652,4 +670,269 @@ mod test { } } } + + #[tokio::test] + async fn test_column_stats_policy_set_on_create() { + // Test that COLUMN_STATS_DISABLED_KEY is set in manifest when creating dataset with stats enabled + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let dataset = InsertBuilder::new("memory://test_column_stats_create") + .with_params(&WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone())) + .await + .unwrap(); + + // Check that the manifest has the column stats config (disabled=false when stats enabled) + let config_value = dataset.manifest.config.get(COLUMN_STATS_DISABLED_KEY); + assert_eq!(config_value, Some(&"false".to_string())); + } + + #[tokio::test] + async fn test_column_stats_policy_empty_when_disabled_no_auto_cleanup() { + // When stats are disabled and there is no auto_cleanup, we leave manifest config empty + // so old Lance versions (that don't support FLAG_TABLE_CONFIG) can still write. + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let dataset = InsertBuilder::new("memory://test_column_stats_disabled") + .with_params(&WriteParams { + disable_column_stats: true, // Stats disabled + auto_cleanup: None, // No auto_cleanup -> empty config for old-Lance compat + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone())) + .await + .unwrap(); + + // Config is empty for old-Lance compatibility + let config_value = dataset.manifest.config.get(COLUMN_STATS_DISABLED_KEY); + assert_eq!(config_value, None); + } + + #[tokio::test] + async fn test_policy_enforcement_on_append() { + // Test that appending with different column stats policy fails + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + // Create dataset with stats enabled + let dataset = InsertBuilder::new("memory://test_policy_enforcement") + .with_params(&WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch1)], schema.clone())) + .await + .unwrap(); + + let dataset = Arc::new(dataset); + + // Try to append with stats disabled - should fail + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![4, 5, 6]))], + ) + .unwrap(); + + let result = InsertBuilder::new(dataset.clone()) + .with_params(&WriteParams { + mode: WriteMode::Append, + disable_column_stats: true, // Explicitly set to true (stats disabled), conflicts with manifest + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch2)], schema.clone())) + .await; + + // Should fail because of policy mismatch + assert!(matches!(result, Err(Error::InvalidInput { .. }))); + if let Err(Error::InvalidInput { source, .. }) = result { + let error_msg = source.to_string(); + assert!( + error_msg.contains("[ColumnStats] Policy mismatch") + || error_msg.contains("Policy mismatch") + ); + assert!(error_msg.contains("disable_column_stats=false")); // Stats enabled + assert!(error_msg.contains("disable_column_stats=true")); // Stats disabled + } + } + + #[tokio::test] + async fn test_write_params_requires_explicit_policy_match() { + // Test that WriteParams requires explicit matching of column stats policy + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + // Create dataset with stats enabled + let dataset = InsertBuilder::new("memory://test_inherit_policy") + .with_params(&WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new( + vec![Ok(batch.clone())], + schema.clone(), + )) + .await + .unwrap(); + + let dataset = Arc::new(dataset); + + // Using default WriteParams (disable_column_stats=false, stats enabled) should succeed when appending + // to a dataset that requires disable_column_stats=false (stats enabled) + let result = InsertBuilder::new(dataset.clone()) + .with_params(&WriteParams { + mode: WriteMode::Append, + disable_column_stats: false, // Default is false (stats enabled), matches dataset + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new( + vec![Ok(batch.clone())], + schema.clone(), + )) + .await; + + // Should succeed because policies match (both have stats enabled) + assert!( + result.is_ok(), + "Expected success when policies match, but got error: {:?}", + result + ); + + // Test that mismatched policy fails + let result = InsertBuilder::new(dataset) + .with_params(&WriteParams { + mode: WriteMode::Append, + disable_column_stats: true, // Stats disabled - should fail validation + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone())) + .await; + + // Should fail because of policy mismatch + assert!(matches!(result, Err(Error::InvalidInput { .. }))); + } + + #[tokio::test] + async fn test_policy_enforcement_prevents_corruption_on_write_failure() { + // Test that dataset policy remains unchanged even if write fails + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let dataset = InsertBuilder::new("memory://test_write_failure") + .with_params(&WriteParams { + disable_column_stats: false, // Stats enabled + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch1)], schema.clone())) + .await + .unwrap(); + + // Verify initial policy is set (disabled=false when stats enabled) + let initial_policy = dataset.manifest.config.get(COLUMN_STATS_DISABLED_KEY); + assert_eq!(initial_policy, Some(&"false".to_string())); + + // Try to append with wrong policy (should fail validation before write) + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![4, 5, 6]))], + ) + .unwrap(); + + // Use the dataset object directly (like test_policy_enforcement_on_append) to ensure validation runs + let dataset_arc = Arc::new(dataset); + let result = InsertBuilder::new(dataset_arc.clone()) + .with_params(&WriteParams { + mode: WriteMode::Append, + disable_column_stats: true, // Stats disabled - should fail validation + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch2)], schema.clone())) + .await; + + // Should fail because of policy mismatch + assert!( + result.is_err(), + "Expected error due to policy mismatch, but operation succeeded. Result: {:?}", + result + ); + + // Verify policy is still unchanged (use the dataset object we already have) + let dataset_after = dataset_arc.as_ref(); + let policy_after = dataset_after.manifest.config.get(COLUMN_STATS_DISABLED_KEY); + assert_eq!(policy_after, Some(&"false".to_string())); + + // Verify dataset still has only original data (write never started) + assert_eq!(dataset_after.count_rows(None).await.unwrap(), 3); + } + + #[tokio::test] + async fn test_backwards_compat_dataset_without_policy_key() { + // Test that datasets work correctly with policy enforcement + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + // Create a dataset with stats disabled and no auto_cleanup -> empty manifest config + let dataset = InsertBuilder::new("memory://test_backwards_compat") + .with_params(&WriteParams { + disable_column_stats: true, // Stats disabled + auto_cleanup: None, // No auto_cleanup -> empty config + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new( + vec![Ok(batch.clone())], + schema.clone(), + )) + .await + .unwrap(); + + // No policy key in manifest (empty config for old-Lance compatibility) + let policy_value = dataset.manifest.config.get(COLUMN_STATS_DISABLED_KEY); + assert_eq!(policy_value, None); + + // Appending with matching policy should work + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![4, 5, 6]))], + ) + .unwrap(); + + let result = InsertBuilder::new("memory://test_backwards_compat") + .with_params(&WriteParams { + mode: WriteMode::Append, + disable_column_stats: true, // Stats disabled + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch2)], schema.clone())) + .await; + + assert!(result.is_ok()); + } } diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index e488a2f2439..aaeb1f5bc95 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -4291,6 +4291,8 @@ mod tests { ).await.unwrap(); } + // Run with: cargo test -p lance --lib test_skip_auto_cleanup + // (Use --lib so only library tests run; otherwise other binaries report "0 passed".) #[tokio::test] async fn test_skip_auto_cleanup() { let tmpdir = TempStrDir::default(); @@ -4324,6 +4326,7 @@ mod tests { let dataset = Dataset::write(data, &dataset_uri, Some(write_params)) .await .unwrap(); + // Initial write creates version 1 (one commit). assert_eq!(dataset.version().version, 1); // Advance time @@ -4345,6 +4348,7 @@ mod tests { .await .unwrap(); + // First merge creates version 2 (one commit). assert_eq!(dataset2.version().version, 2); // Advance time @@ -4367,12 +4371,13 @@ mod tests { .await .unwrap(); + // Second merge creates version 3 (one commit). Auto cleanup runs after each commit, so version 1 is removed. assert_eq!(dataset2_extra.version().version, 3); // Load the dataset from disk to check versions let ds_check1 = DatasetBuilder::from_uri(&dataset_uri).load().await.unwrap(); - // Version 1 should be cleaned up due to auto cleanup (cleanup runs every version) + // Version 1 should be cleaned up due to auto cleanup (cleanup runs every version, interval=1). assert!( ds_check1.checkout_version(1).await.is_err(), "Version 1 should have been cleaned up" @@ -4403,12 +4408,13 @@ mod tests { .await .unwrap(); + // Third merge creates version 4 (one commit). No cleanup because skip_auto_cleanup was set. assert_eq!(dataset3.version().version, 4); // Load the dataset from disk to check versions let ds_check2 = DatasetBuilder::from_uri(&dataset_uri).load().await.unwrap(); - // Version 2 should still exist because skip_auto_cleanup was enabled + // Version 2 should still exist because skip_auto_cleanup was enabled (no cleanup after version 4). assert!( ds_check2.checkout_version(2).await.is_ok(), "Version 2 should still exist because skip_auto_cleanup was enabled" diff --git a/rust/lance/src/io/commit/conflict_resolver.rs b/rust/lance/src/io/commit/conflict_resolver.rs index bb6f9aae866..972a6f17bb8 100644 --- a/rust/lance/src/io/commit/conflict_resolver.rs +++ b/rust/lance/src/io/commit/conflict_resolver.rs @@ -1874,6 +1874,7 @@ mod tests { table_metadata_updates: None, schema_metadata_updates, field_metadata_updates, + column_stats: None, } }