From 4dedab810b6bd0d8803c055c18709b453d4e3834 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 12:44:37 +0100 Subject: [PATCH 01/29] upgrade arrow-rs (temp to github url until 57.1.0 is released) --- Cargo.toml | 20 +- .../src/expr/visitors/page_index_evaluator.rs | 191 ++++++------------ .../src/writer/file_writer/parquet_writer.rs | 36 +--- 3 files changed, 71 insertions(+), 176 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c10c01d94a..5ac52bcd2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [workspace] -exclude = ["bindings/python"] +exclude = ["bindings/python", "crates/integrations/datafusion"] members = [ "crates/catalog/*", "crates/examples", @@ -42,14 +42,14 @@ rust-version = "1.87" anyhow = "1.0.72" apache-avro = { version = "0.20", features = ["zstandard"] } array-init = "2" -arrow-arith = "56.2" -arrow-array = "56.2" -arrow-buffer = "56.2" -arrow-cast = "56.2" -arrow-ord = "56.2" -arrow-schema = "56.2" -arrow-select = "56.2" -arrow-string = "56.2" +arrow-arith = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-array = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-cast = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-ord = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-schema = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-select = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-string = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } as-any = "0.3.2" async-trait = "0.1.89" aws-config = "1.8.7" @@ -101,7 +101,7 @@ num-bigint = "0.4.6" once_cell = "1.20" opendal = "0.54.0" ordered-float = "4" -parquet = "56.2" +parquet = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } pilota = "0.11.10" port_scanner = "0.1.5" pretty_assertions = "1.4" diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index 3745d94d18..f008c8a86b 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -23,7 +23,7 @@ use fnv::FnvHashSet; use ordered_float::OrderedFloat; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::file::metadata::RowGroupMetaData; -use parquet::file::page_index::index::Index; +use parquet::file::page_index::column_index::ColumnIndexMetaData as Index; use parquet::file::page_index::offset_index::OffsetIndexMetaData; use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit}; @@ -250,117 +250,105 @@ impl<'a> PageIndexEvaluator<'a> { Index::NONE => { return Ok(None); } - Index::BOOLEAN(idx) => idx - .indexes - .iter() + Index::BOOLEAN(idx) => (0..idx.num_pages() as usize) .zip(row_counts.iter()) - .map(|(item, &row_count)| { + .map(|(page_idx, &row_count)| { predicate( - item.min.map(|val| { + idx.min_value(page_idx).copied().map(|val| { Datum::new(field_type.clone(), PrimitiveLiteral::Boolean(val)) }), - item.max.map(|val| { + idx.max_value(page_idx).copied().map(|val| { Datum::new(field_type.clone(), PrimitiveLiteral::Boolean(val)) }), - PageNullCount::from_row_and_null_counts(row_count, item.null_count), + PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), ) }) .collect(), - Index::INT32(idx) => idx - .indexes - .iter() + Index::INT32(idx) => (0..idx.num_pages() as usize) .zip(row_counts.iter()) - .map(|(item, &row_count)| { + .map(|(page_idx, &row_count)| { predicate( - item.min + idx.min_value(page_idx).copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Int(val))), - item.max + idx.max_value(page_idx).copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Int(val))), - PageNullCount::from_row_and_null_counts(row_count, item.null_count), + PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), ) }) .collect(), - Index::INT64(idx) => idx - .indexes - .iter() + Index::INT64(idx) => (0..idx.num_pages() as usize) .zip(row_counts.iter()) - .map(|(item, &row_count)| { + .map(|(page_idx, &row_count)| { predicate( - item.min + idx.min_value(page_idx).copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Long(val))), - item.max + idx.max_value(page_idx).copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Long(val))), - PageNullCount::from_row_and_null_counts(row_count, item.null_count), + PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), ) }) .collect(), - Index::FLOAT(idx) => idx - .indexes - .iter() + Index::FLOAT(idx) => (0..idx.num_pages() as usize) .zip(row_counts.iter()) - .map(|(item, &row_count)| { + .map(|(page_idx, &row_count)| { predicate( - item.min.map(|val| { + idx.min_value(page_idx).copied().map(|val| { Datum::new( field_type.clone(), PrimitiveLiteral::Float(OrderedFloat::from(val)), ) }), - item.max.map(|val| { + idx.max_value(page_idx).copied().map(|val| { Datum::new( field_type.clone(), PrimitiveLiteral::Float(OrderedFloat::from(val)), ) }), - PageNullCount::from_row_and_null_counts(row_count, item.null_count), + PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), ) }) .collect(), - Index::DOUBLE(idx) => idx - .indexes - .iter() + Index::DOUBLE(idx) => (0..idx.num_pages() as usize) .zip(row_counts.iter()) - .map(|(item, &row_count)| { + .map(|(page_idx, &row_count)| { predicate( - item.min.map(|val| { + idx.min_value(page_idx).copied().map(|val| { Datum::new( field_type.clone(), PrimitiveLiteral::Double(OrderedFloat::from(val)), ) }), - item.max.map(|val| { + idx.max_value(page_idx).copied().map(|val| { Datum::new( field_type.clone(), PrimitiveLiteral::Double(OrderedFloat::from(val)), ) }), - PageNullCount::from_row_and_null_counts(row_count, item.null_count), + PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), ) }) .collect(), - Index::BYTE_ARRAY(idx) => idx - .indexes - .iter() + Index::BYTE_ARRAY(idx) => (0..idx.num_pages() as usize) .zip(row_counts.iter()) - .map(|(item, &row_count)| { + .map(|(page_idx, &row_count)| { predicate( - item.min.clone().map(|val| { + idx.min_value(page_idx).map(|val| { Datum::new( field_type.clone(), PrimitiveLiteral::String( - String::from_utf8(val.data().to_vec()).unwrap(), + String::from_utf8(val.to_vec()).unwrap(), ), ) }), - item.max.clone().map(|val| { + idx.max_value(page_idx).map(|val| { Datum::new( field_type.clone(), PrimitiveLiteral::String( - String::from_utf8(val.data().to_vec()).unwrap(), + String::from_utf8(val.to_vec()).unwrap(), ), ) }), - PageNullCount::from_row_and_null_counts(row_count, item.null_count), + PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), ) }) .collect(), @@ -788,13 +776,12 @@ mod tests { use std::sync::Arc; use parquet::arrow::arrow_reader::RowSelector; - use parquet::basic::{LogicalType as ParquetLogicalType, Type as ParquetPhysicalType}; + use parquet::basic::{BoundaryOrder, LogicalType as ParquetLogicalType, Type as ParquetPhysicalType}; use parquet::data_type::ByteArray; use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; - use parquet::file::page_index::index::{Index, NativeIndex, PageIndex}; - use parquet::file::page_index::offset_index::OffsetIndexMetaData; + use parquet::file::page_index::column_index::{ColumnIndexBuilder, ColumnIndexMetaData as Index}; + use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use parquet::file::statistics::Statistics; - use parquet::format::{BoundaryOrder, PageLocation}; use parquet::schema::types::{ ColumnDescriptor, ColumnPath, SchemaDescriptor, Type as parquetSchemaType, }; @@ -1313,94 +1300,34 @@ mod tests { } fn create_page_index() -> Result<(Vec, Vec)> { - let idx_float = Index::FLOAT(NativeIndex:: { - indexes: vec![ - PageIndex { - min: None, - max: None, - null_count: Some(1024), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: Some(0.0), - max: Some(10.0), - null_count: Some(0), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: Some(10.0), - max: Some(20.0), - null_count: Some(1), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: None, - max: None, - null_count: None, - repetition_level_histogram: None, - definition_level_histogram: None, - }, - ], - boundary_order: BoundaryOrder(0), // UNORDERED - }); - - let idx_string = Index::BYTE_ARRAY(NativeIndex:: { - indexes: vec![ - PageIndex { - min: Some("AA".into()), - max: Some("DD".into()), - null_count: Some(0), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: Some("DE".into()), - max: Some("DE".into()), - null_count: Some(0), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: Some("DF".into()), - max: Some("UJ".into()), - null_count: Some(1), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: None, - max: None, - null_count: Some(48), - repetition_level_histogram: None, - definition_level_histogram: None, - }, - PageIndex { - min: None, - max: None, - null_count: None, - repetition_level_histogram: None, - definition_level_histogram: None, - }, - ], - boundary_order: BoundaryOrder(0), // UNORDERED - }); + let mut idx_float_builder = ColumnIndexBuilder::new(); + idx_float_builder.append(None, None, Some(1024)); + idx_float_builder.append(Some(&0.0f32), Some(&10.0f32), Some(0)); + idx_float_builder.append(Some(&10.0f32), Some(&20.0f32), Some(1)); + idx_float_builder.append(None, None, None); + let idx_float = idx_float_builder.build_to_thrift(); + + let mut idx_string_builder = ColumnIndexBuilder::new(); + idx_string_builder.append(Some(&ByteArray::from("AA")), Some(&ByteArray::from("DD")), Some(0)); + idx_string_builder.append(Some(&ByteArray::from("DE")), Some(&ByteArray::from("DE")), Some(0)); + idx_string_builder.append(Some(&ByteArray::from("DF")), Some(&ByteArray::from("UJ")), Some(1)); + idx_string_builder.append(None, None, Some(48)); + idx_string_builder.append(None, None, None); + let idx_string = idx_string_builder.build_to_thrift(); let page_locs_float = vec![ - PageLocation::new(0, 1024, 0), - PageLocation::new(1024, 1024, 1024), - PageLocation::new(2048, 1024, 2048), - PageLocation::new(3072, 1024, 3072), + PageLocation { offset: 0, compressed_page_size: 1024, first_row_index: 0 }, + PageLocation { offset: 1024, compressed_page_size: 1024, first_row_index: 1024 }, + PageLocation { offset: 2048, compressed_page_size: 1024, first_row_index: 2048 }, + PageLocation { offset: 3072, compressed_page_size: 1024, first_row_index: 3072 }, ]; let page_locs_string = vec![ - PageLocation::new(0, 512, 0), - PageLocation::new(512, 512, 512), - PageLocation::new(1024, 2976, 1024), - PageLocation::new(4000, 48, 4000), - PageLocation::new(4048, 48, 4048), + PageLocation { offset: 0, compressed_page_size: 512, first_row_index: 0 }, + PageLocation { offset: 512, compressed_page_size: 512, first_row_index: 512 }, + PageLocation { offset: 1024, compressed_page_size: 2976, first_row_index: 1024 }, + PageLocation { offset: 4000, compressed_page_size: 48, first_row_index: 4000 }, + PageLocation { offset: 4048, compressed_page_size: 48, first_row_index: 4048 }, ]; Ok((vec![idx_float, idx_string], vec![ diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs index 3e9d1715c9..8edffa2ab8 100644 --- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs +++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs @@ -27,12 +27,9 @@ use itertools::Itertools; use parquet::arrow::AsyncArrowWriter; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::async_writer::AsyncFileWriter as ArrowAsyncFileWriter; -use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; +use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::WriterProperties; use parquet::file::statistics::Statistics; -use parquet::format::FileMetaData; -use parquet::thrift::{TCompactOutputProtocol, TSerializable}; -use thrift::protocol::TOutputProtocol; use super::{FileWriter, FileWriterBuilder}; use crate::arrow::{ @@ -349,28 +346,6 @@ impl ParquetWriter { Ok(data_files) } - fn thrift_to_parquet_metadata(&self, file_metadata: FileMetaData) -> Result { - let mut buffer = Vec::new(); - { - let mut protocol = TCompactOutputProtocol::new(&mut buffer); - file_metadata - .write_to_out_protocol(&mut protocol) - .map_err(|err| { - Error::new(ErrorKind::Unexpected, "Failed to write parquet metadata") - .with_source(err) - })?; - - protocol.flush().map_err(|err| { - Error::new(ErrorKind::Unexpected, "Failed to flush protocol").with_source(err) - })?; - } - - let parquet_metadata = ParquetMetaDataReader::decode_metadata(&buffer).map_err(|err| { - Error::new(ErrorKind::Unexpected, "Failed to decode parquet metadata").with_source(err) - })?; - - Ok(parquet_metadata) - } /// `ParquetMetadata` to data file builder pub(crate) fn parquet_to_data_file_builder( @@ -564,14 +539,7 @@ impl FileWriter for ParquetWriter { })?; Ok(vec![]) } else { - let parquet_metadata = - Arc::new(self.thrift_to_parquet_metadata(metadata).map_err(|err| { - Error::new( - ErrorKind::Unexpected, - "Failed to convert metadata from thrift to parquet.", - ) - .with_source(err) - })?); + let parquet_metadata = Arc::new(metadata); Ok(vec![Self::parquet_to_data_file_builder( self.schema, From c1b3f8591e64dc9dc28e57764408729ee11d4016 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 12:45:58 +0100 Subject: [PATCH 02/29] Add cargo.lock; Disable datafusion since of dependency mismatch --- Cargo.lock | 2262 +---------------- Cargo.toml | 8 +- crates/integration_tests/Cargo.toml | 4 +- .../tests/shared_tests/mod.rs | 2 +- 4 files changed, 69 insertions(+), 2207 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 62478f32a0..33fde8721d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,56 +84,12 @@ dependencies = [ "libc", ] -[[package]] -name = "anstream" -version = "0.6.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is_terminal_polyfill", - "utf8parse", -] - [[package]] name = "anstyle" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" -[[package]] -name = "anstyle-parse" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" -dependencies = [ - "windows-sys 0.60.2", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" -dependencies = [ - "anstyle", - "once_cell_polyfill", - "windows-sys 0.60.2", -] - [[package]] name = "anyhow" version = "1.0.100" @@ -148,8 +104,6 @@ checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" dependencies = [ "bigdecimal", "bon", - "bzip2 0.6.1", - "crc32fast", "digest", "log", "miniz_oxide", @@ -160,158 +114,102 @@ dependencies = [ "serde", "serde_bytes", "serde_json", - "snap", - "strum 0.27.2", - "strum_macros 0.27.2", + "strum", + "strum_macros", "thiserror 2.0.17", "uuid", - "xz2", "zstd", ] -[[package]] -name = "ar_archive_writer" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" -dependencies = [ - "object", -] - [[package]] name = "array-init" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc" -[[package]] -name = "arrayref" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" - [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "arrow" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" -dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] - [[package]] name = "arrow-arith" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "ahash 0.8.12", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "chrono-tz", "half", "hashbrown 0.16.0", - "num", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "atoi", "base64 0.22.1", "chrono", - "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] -[[package]] -name = "arrow-csv" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" -dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "regex", -] - [[package]] name = "arrow-data" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-ipc" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "arrow-array", "arrow-buffer", @@ -319,37 +217,12 @@ dependencies = [ "arrow-schema", "arrow-select", "flatbuffers", - "lz4_flex", - "zstd", -] - -[[package]] -name = "arrow-json" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "indexmap 2.12.0", - "lexical-core", - "memchr", - "num", - "serde", - "serde_json", - "simdutf8", ] [[package]] name = "arrow-ord" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "arrow-array", "arrow-buffer", @@ -358,48 +231,28 @@ dependencies = [ "arrow-select", ] -[[package]] -name = "arrow-row" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half", -] - [[package]] name = "arrow-schema" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" -dependencies = [ - "serde", - "serde_json", -] +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" [[package]] name = "arrow-select" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "arrow-array", "arrow-buffer", @@ -407,7 +260,7 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] @@ -452,23 +305,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "async-compression" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" -dependencies = [ - "bzip2 0.5.2", - "flate2", - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "xz2", - "zstd", - "zstd-safe", -] - [[package]] name = "async-executor" version = "1.13.3" @@ -1088,7 +924,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.13.0", + "itertools", "log", "prettyplease", "proc-macro2", @@ -1120,28 +956,6 @@ dependencies = [ "wyz", ] -[[package]] -name = "blake2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" -dependencies = [ - "digest", -] - -[[package]] -name = "blake3" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" -dependencies = [ - "arrayref", - "arrayvec", - "cc", - "cfg-if", - "constant_time_eq", -] - [[package]] name = "block-buffer" version = "0.10.4" @@ -1301,34 +1115,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" -dependencies = [ - "libbz2-rs-sys", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cbc" version = "0.1.2" @@ -1385,16 +1171,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "chrono-tz" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" -dependencies = [ - "chrono", - "phf", -] - [[package]] name = "cipher" version = "0.4.4" @@ -1416,55 +1192,6 @@ dependencies = [ "libloading", ] -[[package]] -name = "clap" -version = "4.5.50" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" -dependencies = [ - "clap_builder", - "clap_derive", -] - -[[package]] -name = "clap_builder" -version = "4.5.50" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" -dependencies = [ - "anstream", - "anstyle", - "clap_lex", - "strsim", -] - -[[package]] -name = "clap_derive" -version = "4.5.49" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "clap_lex" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" - -[[package]] -name = "clipboard-win" -version = "5.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bde03770d3df201d4fb868f2c9c59e66a3e4e2bd06692a0fe701e7103c7e84d4" -dependencies = [ - "error-code", -] - [[package]] name = "cmake" version = "0.1.54" @@ -1474,12 +1201,6 @@ dependencies = [ "cc", ] -[[package]] -name = "colorchoice" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" - [[package]] name = "colored" version = "3.0.0" @@ -1489,17 +1210,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "comfy-table" -version = "7.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" -dependencies = [ - "strum 0.26.3", - "strum_macros 0.26.4", - "unicode-width 0.2.2", -] - [[package]] name = "concurrent-queue" version = "2.5.0" @@ -1509,19 +1219,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "console" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" -dependencies = [ - "encode_unicode", - "libc", - "once_cell", - "unicode-width 0.2.2", - "windows-sys 0.61.2", -] - [[package]] name = "const-oid" version = "0.9.6" @@ -1548,12 +1245,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "constant_time_eq" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" - [[package]] name = "core-foundation" version = "0.9.4" @@ -1671,27 +1362,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "csv" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde_core", -] - -[[package]] -name = "csv-core" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" -dependencies = [ - "memchr", -] - [[package]] name = "ctor" version = "0.2.9" @@ -1787,778 +1457,10 @@ dependencies = [ ] [[package]] -name = "datafusion" -version = "50.3.0" +name = "der" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af15bb3c6ffa33011ef579f6b0bcbe7c26584688bd6c994f548e44df67f011a" -dependencies = [ - "arrow", - "arrow-ipc", - "arrow-schema", - "async-trait", - "bytes", - "bzip2 0.6.1", - "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-datasource-avro", - "datafusion-datasource-csv", - "datafusion-datasource-json", - "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-nested", - "datafusion-functions-table", - "datafusion-functions-window", - "datafusion-optimizer", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", - "flate2", - "futures", - "hex", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "rand 0.9.2", - "regex", - "sqlparser", - "tempfile", - "tokio", - "url", - "uuid", - "xz2", - "zstd", -] - -[[package]] -name = "datafusion-catalog" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "187622262ad8f7d16d3be9202b4c1e0116f1c9aa387e5074245538b755261621" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "tokio", -] - -[[package]] -name = "datafusion-catalog-listing" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9657314f0a32efd0382b9a46fdeb2d233273ece64baa68a7c45f5a192daf0f83" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "log", - "object_store", - "tokio", -] - -[[package]] -name = "datafusion-cli" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a0b9c821d14e79070f42ea3a6d6618ced04d94277f0a32301918d7a022c250f" -dependencies = [ - "arrow", - "async-trait", - "aws-config", - "aws-credential-types", - "clap", - "datafusion", - "dirs", - "env_logger", - "futures", - "log", - "mimalloc", - "object_store", - "parking_lot", - "parquet", - "regex", - "rustyline", - "tokio", - "url", -] - -[[package]] -name = "datafusion-common" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a83760d9a13122d025fbdb1d5d5aaf93dd9ada5e90ea229add92aa30898b2d1" -dependencies = [ - "ahash 0.8.12", - "apache-avro", - "arrow", - "arrow-ipc", - "base64 0.22.1", - "chrono", - "half", - "hashbrown 0.14.5", - "hex", - "indexmap 2.12.0", - "libc", - "log", - "object_store", - "parquet", - "paste", - "recursive", - "sqlparser", - "tokio", - "web-time", -] - -[[package]] -name = "datafusion-common-runtime" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b6234a6c7173fe5db1c6c35c01a12b2aa0f803a3007feee53483218817f8b1e" -dependencies = [ - "futures", - "log", - "tokio", -] - -[[package]] -name = "datafusion-datasource" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7256c9cb27a78709dd42d0c80f0178494637209cac6e29d5c93edd09b6721b86" -dependencies = [ - "arrow", - "async-compression", - "async-trait", - "bytes", - "bzip2 0.6.1", - "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "flate2", - "futures", - "glob", - "itertools 0.14.0", - "log", - "object_store", - "parquet", - "rand 0.9.2", - "tempfile", - "tokio", - "tokio-util", - "url", - "xz2", - "zstd", -] - -[[package]] -name = "datafusion-datasource-avro" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10d40b6953ebc9099b37adfd12fde97eb73ff0cee44355c6dea64b8a4537d561" -dependencies = [ - "apache-avro", - "arrow", - "async-trait", - "bytes", - "chrono", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "num-traits", - "object_store", - "tokio", -] - -[[package]] -name = "datafusion-datasource-csv" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64533a90f78e1684bfb113d200b540f18f268134622d7c96bbebc91354d04825" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "regex", - "tokio", -] - -[[package]] -name = "datafusion-datasource-json" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d7ebeb12c77df0aacad26f21b0d033aeede423a64b2b352f53048a75bf1d6e6" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "serde_json", - "tokio", -] - -[[package]] -name = "datafusion-datasource-parquet" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09e783c4c7d7faa1199af2df4761c68530634521b176a8d1331ddbc5a5c75133" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "hex", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "rand 0.9.2", - "tokio", -] - -[[package]] -name = "datafusion-doc" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99ee6b1d9a80d13f9deb2291f45c07044b8e62fb540dbde2453a18be17a36429" - -[[package]] -name = "datafusion-execution" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4cec0a57653bec7b933fb248d3ffa3fa3ab3bd33bd140dc917f714ac036f531" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-expr", - "futures", - "log", - "object_store", - "parking_lot", - "parquet", - "rand 0.9.2", - "tempfile", - "url", -] - -[[package]] -name = "datafusion-expr" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef76910bdca909722586389156d0aa4da4020e1631994d50fadd8ad4b1aa05fe" -dependencies = [ - "arrow", - "async-trait", - "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr-common", - "indexmap 2.12.0", - "paste", - "recursive", - "serde_json", - "sqlparser", -] - -[[package]] -name = "datafusion-expr-common" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d155ccbda29591ca71a1344dd6bed26c65a4438072b400df9db59447f590bb6" -dependencies = [ - "arrow", - "datafusion-common", - "indexmap 2.12.0", - "itertools 0.14.0", - "paste", -] - -[[package]] -name = "datafusion-functions" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de2782136bd6014670fd84fe3b0ca3b3e4106c96403c3ae05c0598577139977" -dependencies = [ - "arrow", - "arrow-buffer", - "base64 0.22.1", - "blake2", - "blake3", - "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-macros", - "hex", - "itertools 0.14.0", - "log", - "md-5", - "rand 0.9.2", - "regex", - "sha2", - "unicode-segmentation", - "uuid", -] - -[[package]] -name = "datafusion-functions-aggregate" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07331fc13603a9da97b74fd8a273f4238222943dffdbbed1c4c6f862a30105bf" -dependencies = [ - "ahash 0.8.12", - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "half", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-aggregate-common" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5951e572a8610b89968a09b5420515a121fbc305c0258651f318dc07c97ab17" -dependencies = [ - "ahash 0.8.12", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-functions-nested" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdacca9302c3d8fc03f3e94f338767e786a88a33f5ebad6ffc0e7b50364b9ea3" -dependencies = [ - "arrow", - "arrow-ord", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr-common", - "itertools 0.14.0", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-table" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c37ff8a99434fbbad604a7e0669717c58c7c4f14c472d45067c4b016621d981" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-plan", - "parking_lot", - "paste", -] - -[[package]] -name = "datafusion-functions-window" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e2aea7c79c926cffabb13dc27309d4eaeb130f4a21c8ba91cdd241c813652b" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-window-common" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fead257ab5fd2ffc3b40fda64da307e20de0040fe43d49197241d9de82a487f" -dependencies = [ - "datafusion-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-macros" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec6f637bce95efac05cdfb9b6c19579ed4aa5f6b94d951cfa5bb054b7bb4f730" -dependencies = [ - "datafusion-expr", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "datafusion-optimizer" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6583ef666ae000a613a837e69e456681a9faa96347bf3877661e9e89e141d8a" -dependencies = [ - "arrow", - "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "indexmap 2.12.0", - "itertools 0.14.0", - "log", - "recursive", - "regex", - "regex-syntax", -] - -[[package]] -name = "datafusion-physical-expr" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8668103361a272cbbe3a61f72eca60c9b7c706e87cc3565bcf21e2b277b84f6" -dependencies = [ - "ahash 0.8.12", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", - "half", - "hashbrown 0.14.5", - "indexmap 2.12.0", - "itertools 0.14.0", - "log", - "parking_lot", - "paste", - "petgraph 0.8.3", -] - -[[package]] -name = "datafusion-physical-expr-adapter" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "815acced725d30601b397e39958e0e55630e0a10d66ef7769c14ae6597298bb0" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-functions", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "itertools 0.14.0", -] - -[[package]] -name = "datafusion-physical-expr-common" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6652fe7b5bf87e85ed175f571745305565da2c0b599d98e697bcbedc7baa47c3" -dependencies = [ - "ahash 0.8.12", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "hashbrown 0.14.5", - "itertools 0.14.0", -] - -[[package]] -name = "datafusion-physical-optimizer" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49b7d623eb6162a3332b564a0907ba00895c505d101b99af78345f1acf929b5c" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "itertools 0.14.0", - "log", - "recursive", -] - -[[package]] -name = "datafusion-physical-plan" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2f7f778a1a838dec124efb96eae6144237d546945587557c9e6936b3414558c" -dependencies = [ - "ahash 0.8.12", - "arrow", - "arrow-ord", - "arrow-schema", - "async-trait", - "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "futures", - "half", - "hashbrown 0.14.5", - "indexmap 2.12.0", - "itertools 0.14.0", - "log", - "parking_lot", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "datafusion-pruning" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd1e59e2ca14fe3c30f141600b10ad8815e2856caa59ebbd0e3e07cd3d127a65" -dependencies = [ - "arrow", - "arrow-schema", - "datafusion-common", - "datafusion-datasource", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "itertools 0.14.0", - "log", -] - -[[package]] -name = "datafusion-session" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21ef8e2745583619bd7a49474e8f45fbe98ebb31a133f27802217125a7b3d58d" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-sql", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "tokio", -] - -[[package]] -name = "datafusion-spark" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613efb6666a7d42fcb922b90cd0daa2b25ea486d141350e5d3e86e46df28309a" -dependencies = [ - "arrow", - "chrono", - "crc32fast", - "datafusion-catalog", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions", - "datafusion-macros", - "log", - "sha1", - "url", - "xxhash-rust", -] - -[[package]] -name = "datafusion-sql" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89abd9868770386fede29e5a4b14f49c0bf48d652c3b9d7a8a0332329b87d50b" -dependencies = [ - "arrow", - "bigdecimal", - "datafusion-common", - "datafusion-expr", - "indexmap 2.12.0", - "log", - "recursive", - "regex", - "sqlparser", -] - -[[package]] -name = "datafusion-sqllogictest" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17598193dd875ca895400c51ccab1c30fceb1855220dc60aa415a4db7c95a2d7" -dependencies = [ - "arrow", - "async-trait", - "bigdecimal", - "clap", - "datafusion", - "datafusion-spark", - "datafusion-substrait", - "futures", - "half", - "indicatif", - "itertools 0.14.0", - "log", - "object_store", - "rust_decimal", - "sqllogictest", - "sqlparser", - "tempfile", - "thiserror 2.0.17", - "tokio", -] - -[[package]] -name = "datafusion-substrait" -version = "50.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaa011a3814d91a03ab655ad41bbe5e57b203b2859281af8fe2c30aebbbcc5d9" -dependencies = [ - "async-recursion", - "async-trait", - "chrono", - "datafusion", - "itertools 0.14.0", - "object_store", - "pbjson-types", - "prost", - "substrait", - "tokio", - "url", - "uuid", -] - -[[package]] -name = "der" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" dependencies = [ "const-oid", "pem-rfc7468", @@ -2624,27 +1526,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "dirs" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" -dependencies = [ - "libc", - "option-ext", - "redox_users", - "windows-sys 0.59.0", -] - [[package]] name = "displaydoc" version = "0.2.5" @@ -2695,18 +1576,6 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" -[[package]] -name = "educe" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7bc049e1bd8cdeb31b68bbd586a9464ecf9f3944af3958a7a9d0f8b9799417" -dependencies = [ - "enum-ordinalize", - "proc-macro2", - "quote", - "syn 2.0.108", -] - [[package]] name = "either" version = "1.15.0" @@ -2716,61 +1585,6 @@ dependencies = [ "serde", ] -[[package]] -name = "encode_unicode" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" - -[[package]] -name = "endian-type" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" - -[[package]] -name = "enum-ordinalize" -version = "4.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0" -dependencies = [ - "enum-ordinalize-derive", -] - -[[package]] -name = "enum-ordinalize-derive" -version = "4.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "env_filter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "jiff", - "log", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -2787,18 +1601,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "error-code" -version = "3.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59" - -[[package]] -name = "escape8259" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5692dd7b5a1978a5aeb0ce83b7655c58ca8efdcb79d21036ea249da95afec2c6" - [[package]] name = "etcetera" version = "0.8.0" @@ -2841,12 +1643,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "fallible-iterator" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" - [[package]] name = "fastrand" version = "2.3.0" @@ -2865,29 +1661,12 @@ dependencies = [ "simdutf8", ] -[[package]] -name = "fd-lock" -version = "4.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" -dependencies = [ - "cfg-if", - "rustix", - "windows-sys 0.59.0", -] - [[package]] name = "find-msvc-tools" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" -[[package]] -name = "fixedbitset" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" - [[package]] name = "flatbuffers" version = "25.9.23" @@ -2947,15 +1726,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619" -[[package]] -name = "fs-err" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62d91fd049c123429b018c47887d3f75a265540dd3c30ba9cb7bae9197edb03a" -dependencies = [ - "autocfg", -] - [[package]] name = "fs_extra" version = "1.3.0" @@ -3200,10 +1970,6 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash 0.8.12", - "allocator-api2", -] [[package]] name = "hashbrown" @@ -3356,12 +2122,6 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" -[[package]] -name = "humantime" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" - [[package]] name = "hyper" version = "0.14.32" @@ -3520,7 +2280,7 @@ dependencies = [ "fnv", "futures", "iceberg_test_utils", - "itertools 0.13.0", + "itertools", "minijinja", "mockall", "moka", @@ -3544,7 +2304,7 @@ dependencies = [ "serde_repr", "serde_with", "smol", - "strum 0.27.2", + "strum", "tempfile", "thrift", "tokio", @@ -3629,7 +2389,7 @@ dependencies = [ "http 1.3.1", "iceberg", "iceberg_test_utils", - "itertools 0.13.0", + "itertools", "mockito", "port_scanner", "reqwest", @@ -3652,7 +2412,7 @@ dependencies = [ "aws-sdk-s3tables", "iceberg", "iceberg_test_utils", - "itertools 0.13.0", + "itertools", "tokio", ] @@ -3662,28 +2422,12 @@ version = "0.7.0" dependencies = [ "async-trait", "iceberg", - "itertools 0.13.0", + "itertools", "regex", "sqlx", - "strum 0.27.2", - "tempfile", - "tokio", -] - -[[package]] -name = "iceberg-datafusion" -version = "0.7.0" -dependencies = [ - "anyhow", - "async-trait", - "datafusion", - "expect-test", - "futures", - "iceberg", - "parquet", + "strum", "tempfile", "tokio", - "uuid", ] [[package]] @@ -3703,11 +2447,9 @@ dependencies = [ "arrow-array", "arrow-schema", "ctor", - "datafusion", "futures", "iceberg", "iceberg-catalog-rest", - "iceberg-datafusion", "iceberg_test_utils", "ordered-float 2.10.1", "parquet", @@ -3715,50 +2457,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "iceberg-playground" -version = "0.7.0" -dependencies = [ - "anyhow", - "clap", - "datafusion", - "datafusion-cli", - "dirs", - "fs-err", - "home", - "iceberg", - "iceberg-catalog-rest", - "iceberg-datafusion", - "mimalloc", - "stacker", - "tokio", - "toml", - "tracing", - "tracing-subscriber", -] - -[[package]] -name = "iceberg-sqllogictest" -version = "0.7.0" -dependencies = [ - "anyhow", - "async-trait", - "datafusion", - "datafusion-sqllogictest", - "enum-ordinalize", - "env_logger", - "iceberg", - "iceberg-datafusion", - "indicatif", - "libtest-mimic", - "log", - "serde", - "sqllogictest", - "tokio", - "toml", - "tracing", -] - [[package]] name = "iceberg_test_utils" version = "0.7.0" @@ -3898,19 +2596,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "indicatif" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade6dfcba0dfb62ad59e59e7241ec8912af34fd29e0e743e3db992bd278e8b65" -dependencies = [ - "console", - "portable-atomic", - "unicode-width 0.2.2", - "unit-prefix", - "web-time", -] - [[package]] name = "inout" version = "0.1.4" @@ -3953,12 +2638,6 @@ dependencies = [ "serde", ] -[[package]] -name = "is_terminal_polyfill" -version = "1.70.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" - [[package]] name = "itertools" version = "0.13.0" @@ -3968,45 +2647,12 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" -dependencies = [ - "either", -] - [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" -[[package]] -name = "jiff" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" -dependencies = [ - "jiff-static", - "log", - "portable-atomic", - "portable-atomic-util", - "serde", -] - -[[package]] -name = "jiff-static" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.108", -] - [[package]] name = "jobserver" version = "0.1.34" @@ -4108,12 +2754,6 @@ dependencies = [ "lexical-util", ] -[[package]] -name = "libbz2-rs-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" - [[package]] name = "libc" version = "0.2.177" @@ -4136,16 +2776,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" -[[package]] -name = "libmimalloc-sys" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "libredox" version = "0.1.10" @@ -4168,18 +2798,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "libtest-mimic" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" -dependencies = [ - "anstream", - "anstyle", - "clap", - "escape8259", -] - [[package]] name = "libz-rs-sys" version = "0.5.2" @@ -4241,24 +2859,13 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "md-5" version = "0.10.6" @@ -4297,15 +2904,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "mimalloc" -version = "0.1.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1ee66a4b64c74f4ef288bcbb9192ad9c3feaad75193129ac8509af543894fd8" -dependencies = [ - "libmimalloc-sys", -] - [[package]] name = "minijinja" version = "2.12.0" @@ -4434,13 +3032,7 @@ dependencies = [ "proc-macro2", "quote", "syn 2.0.108", -] - -[[package]] -name = "multimap" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +] [[package]] name = "munge" @@ -4474,15 +3066,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" -[[package]] -name = "nibble_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" -dependencies = [ - "smallvec", -] - [[package]] name = "nix" version = "0.29.0" @@ -4496,18 +3079,6 @@ dependencies = [ "memoffset", ] -[[package]] -name = "nix" -version = "0.30.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" -dependencies = [ - "bitflags", - "cfg-if", - "cfg_aliases", - "libc", -] - [[package]] name = "nom" version = "7.1.3" @@ -4527,20 +3098,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -4604,17 +3161,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -4657,64 +3203,12 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "object" -version = "0.32.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" -dependencies = [ - "memchr", -] - -[[package]] -name = "object_store" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" -dependencies = [ - "async-trait", - "base64 0.22.1", - "bytes", - "chrono", - "form_urlencoded", - "futures", - "http 1.3.1", - "http-body-util", - "humantime", - "hyper 1.7.0", - "itertools 0.14.0", - "md-5", - "parking_lot", - "percent-encoding", - "quick-xml 0.38.3", - "rand 0.9.2", - "reqwest", - "ring", - "rustls-pemfile 2.2.0", - "serde", - "serde_json", - "serde_urlencoded", - "thiserror 2.0.17", - "tokio", - "tracing", - "url", - "walkdir", - "wasm-bindgen-futures", - "web-time", -] - [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" -[[package]] -name = "once_cell_polyfill" -version = "1.70.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" - [[package]] name = "opendal" version = "0.54.1" @@ -4749,12 +3243,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" -[[package]] -name = "option-ext" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" - [[package]] name = "ordered-float" version = "2.10.1" @@ -4791,12 +3279,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" -[[package]] -name = "owo-colors" -version = "4.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" - [[package]] name = "parking" version = "2.2.1" @@ -4828,9 +3310,8 @@ dependencies = [ [[package]] name = "parquet" -version = "56.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +version = "57.0.0" +source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4849,11 +3330,10 @@ dependencies = [ "half", "hashbrown 0.16.0", "lz4_flex", - "num", "num-bigint", - "object_store", + "num-integer", + "num-traits", "paste", - "ring", "seq-macro", "simdutf8", "snap", @@ -4869,43 +3349,6 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" -[[package]] -name = "pbjson" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68" -dependencies = [ - "base64 0.21.7", - "serde", -] - -[[package]] -name = "pbjson-build" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" -dependencies = [ - "heck", - "itertools 0.13.0", - "prost", - "prost-types", -] - -[[package]] -name = "pbjson-types" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887" -dependencies = [ - "bytes", - "chrono", - "pbjson", - "pbjson-build", - "prost", - "prost-build", - "serde", -] - [[package]] name = "pbkdf2" version = "0.12.2" @@ -4941,46 +3384,6 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" -[[package]] -name = "petgraph" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" -dependencies = [ - "fixedbitset", - "indexmap 2.12.0", -] - -[[package]] -name = "petgraph" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" -dependencies = [ - "fixedbitset", - "hashbrown 0.15.5", - "indexmap 2.12.0", - "serde", -] - -[[package]] -name = "phf" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" -dependencies = [ - "siphasher", -] - [[package]] name = "pilota" version = "0.11.10" @@ -5116,44 +3519,6 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" -[[package]] -name = "portable-atomic-util" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" -dependencies = [ - "portable-atomic", -] - -[[package]] -name = "postgres-protocol" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbef655056b916eb868048276cfd5d6a7dea4f81560dfd047f97c8c6fe3fcfd4" -dependencies = [ - "base64 0.22.1", - "byteorder", - "bytes", - "fallible-iterator", - "hmac", - "md-5", - "memchr", - "rand 0.9.2", - "sha2", - "stringprep", -] - -[[package]] -name = "postgres-types" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef4605b7c057056dd35baeb6ac0c0338e4975b1f2bef0f65da953285eb007095" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol", -] - [[package]] name = "potential_utf" version = "0.1.4" @@ -5230,7 +3595,7 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" dependencies = [ - "toml_edit 0.23.7", + "toml_edit", ] [[package]] @@ -5242,68 +3607,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "prost" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" -dependencies = [ - "heck", - "itertools 0.14.0", - "log", - "multimap", - "once_cell", - "petgraph 0.7.1", - "prettyplease", - "prost", - "prost-types", - "regex", - "syn 2.0.108", - "tempfile", -] - -[[package]] -name = "prost-derive" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" -dependencies = [ - "anyhow", - "itertools 0.14.0", - "proc-macro2", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "prost-types" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" -dependencies = [ - "prost", -] - -[[package]] -name = "psm" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" -dependencies = [ - "ar_archive_writer", - "cc", -] - [[package]] name = "ptr_meta" version = "0.1.4" @@ -5446,16 +3749,6 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" -[[package]] -name = "radix_trie" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" -dependencies = [ - "endian-type", - "nibble_vec", -] - [[package]] name = "rancor" version = "0.1.1" @@ -5526,26 +3819,6 @@ dependencies = [ "getrandom 0.3.4", ] -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.108", -] - [[package]] name = "redox_syscall" version = "0.5.18" @@ -5555,17 +3828,6 @@ dependencies = [ "bitflags", ] -[[package]] -name = "redox_users" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" -dependencies = [ - "getrandom 0.2.16", - "libredox", - "thiserror 2.0.17", -] - [[package]] name = "ref-cast" version = "1.0.25" @@ -5621,16 +3883,6 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" -[[package]] -name = "regress" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145bb27393fe455dd64d6cbc8d059adfa392590a45eadf079c01b11857e7b010" -dependencies = [ - "hashbrown 0.15.5", - "memchr", -] - [[package]] name = "rend" version = "0.4.2" @@ -5688,7 +3940,6 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", "http-body-util", @@ -5701,7 +3952,6 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.34", - "rustls-native-certs 0.8.2", "rustls-pki-types", "serde", "serde_json", @@ -5844,7 +4094,6 @@ dependencies = [ "borsh", "bytes", "num-traits", - "postgres-types", "rand 0.8.5", "rkyv 0.7.45", "serde", @@ -5916,7 +4165,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" dependencies = [ "openssl-probe", - "rustls-pemfile 1.0.4", + "rustls-pemfile", "schannel", "security-framework 2.11.1", ] @@ -5930,25 +4179,16 @@ dependencies = [ "openssl-probe", "rustls-pki-types", "schannel", - "security-framework 3.5.1", -] - -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", + "security-framework 3.5.1", ] [[package]] name = "rustls-pemfile" -version = "2.2.0" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "rustls-pki-types", + "base64 0.21.7", ] [[package]] @@ -5989,28 +4229,6 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" -[[package]] -name = "rustyline" -version = "17.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" -dependencies = [ - "bitflags", - "cfg-if", - "clipboard-win", - "fd-lock", - "home", - "libc", - "log", - "memchr", - "nix 0.30.1", - "radix_trie", - "unicode-segmentation", - "unicode-width 0.2.2", - "utf8parse", - "windows-sys 0.60.2", -] - [[package]] name = "ryu" version = "1.0.20" @@ -6026,15 +4244,6 @@ dependencies = [ "cipher", ] -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - [[package]] name = "schannel" version = "0.1.28" @@ -6044,18 +4253,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "schemars" -version = "0.8.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" -dependencies = [ - "dyn-clone", - "schemars_derive", - "serde", - "serde_json", -] - [[package]] name = "schemars" version = "0.9.0" @@ -6080,18 +4277,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "schemars_derive" -version = "0.8.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" -dependencies = [ - "proc-macro2", - "quote", - "serde_derive_internals", - "syn 2.0.108", -] - [[package]] name = "scopeguard" version = "1.2.0" @@ -6166,10 +4351,6 @@ name = "semver" version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" -dependencies = [ - "serde", - "serde_core", -] [[package]] name = "seq-macro" @@ -6217,17 +4398,6 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "serde_derive_internals" -version = "0.29.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.108", -] - [[package]] name = "serde_json" version = "1.0.145" @@ -6252,27 +4422,6 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "serde_spanned" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" -dependencies = [ - "serde", -] - -[[package]] -name = "serde_tokenstream" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" -dependencies = [ - "proc-macro2", - "quote", - "serde", - "syn 2.0.108", -] - [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -6316,19 +4465,6 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "serde_yaml" -version = "0.9.34+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" -dependencies = [ - "indexmap 2.12.0", - "itoa", - "ryu", - "serde", - "unsafe-libyaml", -] - [[package]] name = "sha1" version = "0.10.6" @@ -6415,12 +4551,6 @@ dependencies = [ "time", ] -[[package]] -name = "siphasher" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" - [[package]] name = "slab" version = "0.4.11" @@ -6536,53 +4666,6 @@ dependencies = [ "der", ] -[[package]] -name = "sqllogictest" -version = "0.28.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3566426f72a13e393aa34ca3d542c5b0eb86da4c0db137ee9b5cfccc6179e52d" -dependencies = [ - "async-trait", - "educe", - "fs-err", - "futures", - "glob", - "humantime", - "itertools 0.13.0", - "libtest-mimic", - "md-5", - "owo-colors", - "rand 0.8.5", - "regex", - "similar", - "subst", - "tempfile", - "thiserror 2.0.17", - "tracing", -] - -[[package]] -name = "sqlparser" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" -dependencies = [ - "log", - "recursive", - "sqlparser_derive", -] - -[[package]] -name = "sqlparser_derive" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.108", -] - [[package]] name = "sqlx" version = "0.8.6" @@ -6776,19 +4859,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "stacker" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" -dependencies = [ - "cc", - "cfg-if", - "libc", - "psm", - "windows-sys 0.59.0", -] - [[package]] name = "stringprep" version = "0.1.5" @@ -6806,32 +4876,13 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" dependencies = [ - "strum_macros 0.27.2", -] - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.108", + "strum_macros", ] [[package]] @@ -6846,41 +4897,6 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "subst" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a9a86e5144f63c2d18334698269a8bfae6eece345c70b64821ea5b35054ec99" -dependencies = [ - "memchr", - "unicode-width 0.1.14", -] - -[[package]] -name = "substrait" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6d24c270c6c672a86c183c3a8439ba46c1936f93cf7296aa692de3b0ff0228" -dependencies = [ - "heck", - "pbjson", - "pbjson-build", - "pbjson-types", - "prettyplease", - "prost", - "prost-build", - "prost-types", - "regress", - "schemars 0.8.22", - "semver", - "serde", - "serde_json", - "serde_yaml", - "syn 2.0.108", - "typify", - "walkdir", -] - [[package]] name = "subtle" version = "2.6.1" @@ -7168,27 +5184,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml" -version = "0.8.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" -dependencies = [ - "serde", - "serde_spanned", - "toml_datetime 0.6.11", - "toml_edit 0.22.27", -] - -[[package]] -name = "toml_datetime" -version = "0.6.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" -dependencies = [ - "serde", -] - [[package]] name = "toml_datetime" version = "0.7.3" @@ -7198,20 +5193,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "toml_edit" -version = "0.22.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" -dependencies = [ - "indexmap 2.12.0", - "serde", - "serde_spanned", - "toml_datetime 0.6.11", - "toml_write", - "winnow", -] - [[package]] name = "toml_edit" version = "0.23.7" @@ -7219,7 +5200,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d" dependencies = [ "indexmap 2.12.0", - "toml_datetime 0.7.3", + "toml_datetime", "toml_parser", "winnow", ] @@ -7233,12 +5214,6 @@ dependencies = [ "winnow", ] -[[package]] -name = "toml_write" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" - [[package]] name = "tower" version = "0.5.2" @@ -7380,53 +5355,6 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" -[[package]] -name = "typify" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7144144e97e987c94758a3017c920a027feac0799df325d6df4fc8f08d02068e" -dependencies = [ - "typify-impl", - "typify-macro", -] - -[[package]] -name = "typify-impl" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "062879d46aa4c9dfe0d33b035bbaf512da192131645d05deacb7033ec8581a09" -dependencies = [ - "heck", - "log", - "proc-macro2", - "quote", - "regress", - "schemars 0.8.22", - "semver", - "serde", - "serde_json", - "syn 2.0.108", - "thiserror 2.0.17", - "unicode-ident", -] - -[[package]] -name = "typify-macro" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9708a3ceb6660ba3f8d2b8f0567e7d4b8b198e2b94d093b8a6077a751425de9e" -dependencies = [ - "proc-macro2", - "quote", - "schemars 0.8.22", - "semver", - "serde", - "serde_json", - "serde_tokenstream", - "syn 2.0.108", - "typify-impl", -] - [[package]] name = "unicode-bidi" version = "0.3.18" @@ -7454,36 +5382,6 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" -[[package]] -name = "unicode-segmentation" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" - -[[package]] -name = "unicode-width" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" - -[[package]] -name = "unicode-width" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" - -[[package]] -name = "unit-prefix" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "323402cff2dd658f39ca17c789b502021b3f18707c91cdf22e3838e1b4023817" - -[[package]] -name = "unsafe-libyaml" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" - [[package]] name = "untrusted" version = "0.9.0" @@ -7514,12 +5412,6 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" -[[package]] -name = "utf8parse" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" - [[package]] name = "uuid" version = "1.18.1" @@ -7564,7 +5456,7 @@ dependencies = [ "metainfo", "motore", "mur3", - "nix 0.29.0", + "nix", "once_cell", "pin-project", "rand 0.9.2", @@ -7612,16 +5504,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - [[package]] name = "want" version = "0.3.1" @@ -7771,15 +5653,6 @@ dependencies = [ "wasite", ] -[[package]] -name = "winapi-util" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" -dependencies = [ - "windows-sys 0.48.0", -] - [[package]] name = "windows-core" version = "0.62.2" @@ -8106,21 +5979,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "xxhash-rust" -version = "0.8.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" - -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yansi" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index 5ac52bcd2e..d97a059e27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,14 +16,18 @@ # under the License. [workspace] -exclude = ["bindings/python", "crates/integrations/datafusion"] +exclude = [ + "bindings/python", + "crates/integrations/datafusion", + "crates/integrations/playground", + "crates/sqllogictest", +] members = [ "crates/catalog/*", "crates/examples", "crates/iceberg", "crates/integration_tests", "crates/integrations/*", - "crates/sqllogictest", "crates/test_utils", ] resolver = "2" diff --git a/crates/integration_tests/Cargo.toml b/crates/integration_tests/Cargo.toml index 07eea5f375..1ceb95416f 100644 --- a/crates/integration_tests/Cargo.toml +++ b/crates/integration_tests/Cargo.toml @@ -28,11 +28,11 @@ version = { workspace = true } arrow-array = { workspace = true } arrow-schema = { workspace = true } ctor = { workspace = true } -datafusion = { workspace = true } +# datafusion = { workspace = true } futures = { workspace = true } iceberg = { workspace = true } iceberg-catalog-rest = { workspace = true } -iceberg-datafusion = { workspace = true } +# iceberg-datafusion = { workspace = true } iceberg_test_utils = { path = "../test_utils", features = ["tests"] } parquet = { workspace = true } tokio = { workspace = true } diff --git a/crates/integration_tests/tests/shared_tests/mod.rs b/crates/integration_tests/tests/shared_tests/mod.rs index 065b14d5da..d26812ffff 100644 --- a/crates/integration_tests/tests/shared_tests/mod.rs +++ b/crates/integration_tests/tests/shared_tests/mod.rs @@ -26,7 +26,7 @@ use crate::get_shared_containers; mod append_data_file_test; mod append_partition_data_file_test; mod conflict_commit_test; -mod datafusion; +// mod datafusion; mod read_evolved_schema; mod read_positional_deletes; mod scan_all_type; From f430f94856f0e12c7a2ae33782f953a1bbaf4034 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 12:57:49 +0100 Subject: [PATCH 03/29] fix unit tests --- crates/iceberg/src/arrow/reader.rs | 2 +- .../src/expr/visitors/page_index_evaluator.rs | 34 +++++++++---------- crates/iceberg/src/inspect/manifests.rs | 24 ++++++------- crates/iceberg/src/inspect/snapshots.rs | 14 ++++---- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 27395c155a..4bebf9b612 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -2002,7 +2002,7 @@ message schema { assert_eq!(err.kind(), ErrorKind::DataInvalid); assert_eq!( err.to_string(), - "DataInvalid => Unsupported Arrow data type: Duration(Microsecond)".to_string() + "DataInvalid => Unsupported Arrow data type: Duration(µs)" ); // Omitting field c2, we still get an error due to c3 being selected diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index f008c8a86b..1c7338fcb8 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -776,10 +776,10 @@ mod tests { use std::sync::Arc; use parquet::arrow::arrow_reader::RowSelector; - use parquet::basic::{BoundaryOrder, LogicalType as ParquetLogicalType, Type as ParquetPhysicalType}; + use parquet::basic::{LogicalType as ParquetLogicalType, Type as ParquetPhysicalType}; use parquet::data_type::ByteArray; - use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; - use parquet::file::page_index::column_index::{ColumnIndexBuilder, ColumnIndexMetaData as Index}; + use parquet::file::metadata::{ColumnChunkMetaData, ColumnIndexBuilder, RowGroupMetaData}; + use parquet::file::page_index::column_index::ColumnIndexMetaData as Index; use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use parquet::file::statistics::Statistics; use parquet::schema::types::{ @@ -1300,20 +1300,20 @@ mod tests { } fn create_page_index() -> Result<(Vec, Vec)> { - let mut idx_float_builder = ColumnIndexBuilder::new(); - idx_float_builder.append(None, None, Some(1024)); - idx_float_builder.append(Some(&0.0f32), Some(&10.0f32), Some(0)); - idx_float_builder.append(Some(&10.0f32), Some(&20.0f32), Some(1)); - idx_float_builder.append(None, None, None); - let idx_float = idx_float_builder.build_to_thrift(); - - let mut idx_string_builder = ColumnIndexBuilder::new(); - idx_string_builder.append(Some(&ByteArray::from("AA")), Some(&ByteArray::from("DD")), Some(0)); - idx_string_builder.append(Some(&ByteArray::from("DE")), Some(&ByteArray::from("DE")), Some(0)); - idx_string_builder.append(Some(&ByteArray::from("DF")), Some(&ByteArray::from("UJ")), Some(1)); - idx_string_builder.append(None, None, Some(48)); - idx_string_builder.append(None, None, None); - let idx_string = idx_string_builder.build_to_thrift(); + let mut idx_float_builder = ColumnIndexBuilder::new(ParquetPhysicalType::FLOAT); + idx_float_builder.append(true, vec![], vec![], 1024); + idx_float_builder.append(false, 0.0f32.to_le_bytes().to_vec(), 10.0f32.to_le_bytes().to_vec(), 0); + idx_float_builder.append(false, 10.0f32.to_le_bytes().to_vec(), 20.0f32.to_le_bytes().to_vec(), 1); + idx_float_builder.append(true, vec![], vec![], -1); + let idx_float = idx_float_builder.build().unwrap(); + + let mut idx_string_builder = ColumnIndexBuilder::new(ParquetPhysicalType::BYTE_ARRAY); + idx_string_builder.append(false, b"AA".to_vec(), b"DD".to_vec(), 0); + idx_string_builder.append(false, b"DE".to_vec(), b"DE".to_vec(), 0); + idx_string_builder.append(false, b"DF".to_vec(), b"UJ".to_vec(), 1); + idx_string_builder.append(false, vec![], vec![], 48); + idx_string_builder.append(true, vec![], vec![], -1); + let idx_string = idx_string_builder.build().unwrap(); let page_locs_float = vec![ PageLocation { offset: 0, compressed_page_size: 1024, first_row_index: 0 }, diff --git a/crates/iceberg/src/inspect/manifests.rs b/crates/iceberg/src/inspect/manifests.rs index 60854b8bae..d85d9fe834 100644 --- a/crates/iceberg/src/inspect/manifests.rs +++ b/crates/iceberg/src/inspect/manifests.rs @@ -296,18 +296,18 @@ mod tests { check_record_batches( record_batch.try_collect::>().await.unwrap(), expect![[r#" - Field { name: "content", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "14"} }, - Field { name: "path", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1"} }, - Field { name: "length", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "2"} }, - Field { name: "partition_spec_id", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "3"} }, - Field { name: "added_snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "4"} }, - Field { name: "added_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "5"} }, - Field { name: "existing_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "6"} }, - Field { name: "deleted_data_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "7"} }, - Field { name: "added_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "15"} }, - Field { name: "existing_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "16"} }, - Field { name: "deleted_delete_files_count", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "17"} }, - Field { name: "partition_summaries", data_type: List(Field { name: "item", data_type: Struct([Field { name: "contains_null", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "10"} }, Field { name: "contains_nan", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "11"} }, Field { name: "lower_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "12"} }, Field { name: "upper_bound", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "13"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "9"} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "8"} }"#]], + Field { "content": Int32, metadata: {"PARQUET:field_id": "14"} }, + Field { "path": Utf8, metadata: {"PARQUET:field_id": "1"} }, + Field { "length": Int64, metadata: {"PARQUET:field_id": "2"} }, + Field { "partition_spec_id": Int32, metadata: {"PARQUET:field_id": "3"} }, + Field { "added_snapshot_id": Int64, metadata: {"PARQUET:field_id": "4"} }, + Field { "added_data_files_count": Int32, metadata: {"PARQUET:field_id": "5"} }, + Field { "existing_data_files_count": Int32, metadata: {"PARQUET:field_id": "6"} }, + Field { "deleted_data_files_count": Int32, metadata: {"PARQUET:field_id": "7"} }, + Field { "added_delete_files_count": Int32, metadata: {"PARQUET:field_id": "15"} }, + Field { "existing_delete_files_count": Int32, metadata: {"PARQUET:field_id": "16"} }, + Field { "deleted_delete_files_count": Int32, metadata: {"PARQUET:field_id": "17"} }, + Field { "partition_summaries": List(Struct("contains_null": Boolean, metadata: {"PARQUET:field_id": "10"}, "contains_nan": nullable Boolean, metadata: {"PARQUET:field_id": "11"}, "lower_bound": nullable Utf8, metadata: {"PARQUET:field_id": "12"}, "upper_bound": nullable Utf8, metadata: {"PARQUET:field_id": "13"}), metadata: {"PARQUET:field_id": "9"}), metadata: {"PARQUET:field_id": "8"} }"#]], expect![[r#" content: PrimitiveArray [ diff --git a/crates/iceberg/src/inspect/snapshots.rs b/crates/iceberg/src/inspect/snapshots.rs index 6081ec165b..479478b074 100644 --- a/crates/iceberg/src/inspect/snapshots.rs +++ b/crates/iceberg/src/inspect/snapshots.rs @@ -151,14 +151,14 @@ mod tests { check_record_batches( batch_stream.try_collect::>().await.unwrap(), expect![[r#" - Field { name: "committed_at", data_type: Timestamp(Microsecond, Some("+00:00")), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "1"} }, - Field { name: "snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "2"} }, - Field { name: "parent_id", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "3"} }, - Field { name: "operation", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "4"} }, - Field { name: "manifest_list", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "5"} }, - Field { name: "summary", data_type: Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "7"} }, Field { name: "value", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "8"} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"PARQUET:field_id": "6"} }"#]], + Field { "committed_at": Timestamp(µs, "+00:00"), metadata: {"PARQUET:field_id": "1"} }, + Field { "snapshot_id": Int64, metadata: {"PARQUET:field_id": "2"} }, + Field { "parent_id": nullable Int64, metadata: {"PARQUET:field_id": "3"} }, + Field { "operation": nullable Utf8, metadata: {"PARQUET:field_id": "4"} }, + Field { "manifest_list": nullable Utf8, metadata: {"PARQUET:field_id": "5"} }, + Field { "summary": nullable Map("key_value": Struct("key": Utf8, metadata: {"PARQUET:field_id": "7"}, "value": nullable Utf8, metadata: {"PARQUET:field_id": "8"}), unsorted), metadata: {"PARQUET:field_id": "6"} }"#]], expect![[r#" - committed_at: PrimitiveArray + committed_at: PrimitiveArray [ 2018-01-04T21:22:35.770+00:00, 2019-04-12T20:29:15.770+00:00, From 47ee5e1eb689b63cf612380d3d921e82e03ee30b Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:07:08 +0100 Subject: [PATCH 04/29] ignore datafusion crates with rustfmt too --- rustfmt.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rustfmt.toml b/rustfmt.toml index 91d924daf1..bd6ae9c048 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -24,3 +24,9 @@ imports_granularity = "Module" overflow_delimited_expr = true trailing_comma = "Vertical" where_single_line = true + +ignore = [ + "crates/integrations/datafusion", + "crates/integrations/playground", + "crates/sqllogictest", +] From e3c5d4825bc899ebc7535c5ba87b2d164d65f20c Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:14:43 +0100 Subject: [PATCH 05/29] make ignored cargos standalone --- crates/integrations/datafusion/Cargo.toml | 12 ++++++------ crates/integrations/playground/Cargo.toml | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/integrations/datafusion/Cargo.toml b/crates/integrations/datafusion/Cargo.toml index 0ee1738b4f..0d4f0e20bd 100644 --- a/crates/integrations/datafusion/Cargo.toml +++ b/crates/integrations/datafusion/Cargo.toml @@ -16,17 +16,17 @@ # under the License. [package] -edition = { workspace = true } -homepage = { workspace = true } +edition = "2024" +homepage = "https://rust.iceberg.apache.org/" name = "iceberg-datafusion" -rust-version = { workspace = true } -version = { workspace = true } +rust-version = "1.87" +version = "0.7.0" categories = ["database"] description = "Apache Iceberg DataFusion Integration" keywords = ["iceberg", "integrations", "datafusion"] -license = { workspace = true } -repository = { workspace = true } +license = "Apache-2.0" +repository = "https://github.com/apache/iceberg-rust" [dependencies] anyhow = { workspace = true } diff --git a/crates/integrations/playground/Cargo.toml b/crates/integrations/playground/Cargo.toml index 3f6774be19..0cb9985315 100644 --- a/crates/integrations/playground/Cargo.toml +++ b/crates/integrations/playground/Cargo.toml @@ -17,14 +17,14 @@ [package] description = "Apache iceberg client" -edition.workspace = true -homepage.workspace = true -license.workspace = true +edition = "2024" +homepage = "https://rust.iceberg.apache.org/" +license = "Apache-2.0" name = "iceberg-playground" readme = "README.md" -repository.workspace = true -rust-version.workspace = true -version.workspace = true +repository = "https://github.com/apache/iceberg-rust" +rust-version = "1.87" +version = "0.7.0" [dependencies] anyhow = { workspace = true } From 5536adcaf6222abb9f2b60459c40a80688d7ca30 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:15:53 +0100 Subject: [PATCH 06/29] -||- --- crates/sqllogictest/Cargo.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/sqllogictest/Cargo.toml b/crates/sqllogictest/Cargo.toml index e826ad7ae0..25b63da0e7 100644 --- a/crates/sqllogictest/Cargo.toml +++ b/crates/sqllogictest/Cargo.toml @@ -16,13 +16,13 @@ # under the License. [package] -edition = { workspace = true } -homepage = { workspace = true } -license = { workspace = true } +edition = "2024" +homepage = "https://rust.iceberg.apache.org/" +license = "Apache-2.0" name = "iceberg-sqllogictest" -repository = { workspace = true } -rust-version = { workspace = true } -version = { workspace = true } +repository = "https://github.com/apache/iceberg-rust" +rust-version = "1.87" +version = "0.7.0" [dependencies] anyhow = { workspace = true } From e634cdb8aface84aa71df848295e7c3304adb251 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:16:05 +0100 Subject: [PATCH 07/29] ignore in rustfmt requires nightly --- rustfmt.toml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/rustfmt.toml b/rustfmt.toml index bd6ae9c048..91d924daf1 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -24,9 +24,3 @@ imports_granularity = "Module" overflow_delimited_expr = true trailing_comma = "Vertical" where_single_line = true - -ignore = [ - "crates/integrations/datafusion", - "crates/integrations/playground", - "crates/sqllogictest", -] From 31f5c92c20e15030b2402031b1ffe79b06f4e92b Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:21:29 +0100 Subject: [PATCH 08/29] comment out dependencies --- crates/integrations/datafusion/Cargo.toml | 23 ++++++++-------- crates/integrations/playground/Cargo.toml | 33 ++++++++++++----------- crates/sqllogictest/Cargo.toml | 33 ++++++++++++----------- 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/crates/integrations/datafusion/Cargo.toml b/crates/integrations/datafusion/Cargo.toml index 0d4f0e20bd..adf83d7f5d 100644 --- a/crates/integrations/datafusion/Cargo.toml +++ b/crates/integrations/datafusion/Cargo.toml @@ -29,16 +29,17 @@ license = "Apache-2.0" repository = "https://github.com/apache/iceberg-rust" [dependencies] -anyhow = { workspace = true } -async-trait = { workspace = true } -datafusion = { workspace = true } -futures = { workspace = true } -iceberg = { workspace = true } -parquet = { workspace = true } -tokio = { workspace = true } -uuid = { workspace = true } +# Dependencies commented out - this crate is excluded from workspace +# anyhow = { workspace = true } +# async-trait = { workspace = true } +# datafusion = { workspace = true } +# futures = { workspace = true } +# iceberg = { workspace = true } +# parquet = { workspace = true } +# tokio = { workspace = true } +# uuid = { workspace = true } [dev-dependencies] -expect-test = { workspace = true } -parquet = { workspace = true } -tempfile = { workspace = true } +# expect-test = { workspace = true } +# parquet = { workspace = true } +# tempfile = { workspace = true } diff --git a/crates/integrations/playground/Cargo.toml b/crates/integrations/playground/Cargo.toml index 0cb9985315..37af89a896 100644 --- a/crates/integrations/playground/Cargo.toml +++ b/crates/integrations/playground/Cargo.toml @@ -27,22 +27,23 @@ rust-version = "1.87" version = "0.7.0" [dependencies] -anyhow = { workspace = true } -clap = { workspace = true } -datafusion = { workspace = true } -datafusion-cli = { workspace = true } -dirs = { workspace = true } -fs-err = { workspace = true } -home = { workspace = true } -iceberg = { workspace = true } -iceberg-catalog-rest = { workspace = true } -iceberg-datafusion = { workspace = true } -mimalloc = { workspace = true } -stacker = { workspace = true } -tokio = { workspace = true } -toml = { workspace = true } -tracing = { workspace = true } -tracing-subscriber = { workspace = true } +# Dependencies commented out - this crate is excluded from workspace +# anyhow = { workspace = true } +# clap = { workspace = true } +# datafusion = { workspace = true } +# datafusion-cli = { workspace = true } +# dirs = { workspace = true } +# fs-err = { workspace = true } +# home = { workspace = true } +# iceberg = { workspace = true } +# iceberg-catalog-rest = { workspace = true } +# iceberg-datafusion = { workspace = true } +# mimalloc = { workspace = true } +# stacker = { workspace = true } +# tokio = { workspace = true } +# toml = { workspace = true } +# tracing = { workspace = true } +# tracing-subscriber = { workspace = true } [package.metadata.cargo-machete] # These dependencies are added to ensure minimal dependency version diff --git a/crates/sqllogictest/Cargo.toml b/crates/sqllogictest/Cargo.toml index 25b63da0e7..35006dfb5b 100644 --- a/crates/sqllogictest/Cargo.toml +++ b/crates/sqllogictest/Cargo.toml @@ -25,24 +25,25 @@ rust-version = "1.87" version = "0.7.0" [dependencies] -anyhow = { workspace = true } -async-trait = { workspace = true } -datafusion = { workspace = true } -datafusion-sqllogictest = { workspace = true } -enum-ordinalize = { workspace = true } -env_logger = { workspace = true } -iceberg = { workspace = true } -iceberg-datafusion = { workspace = true } -indicatif = { workspace = true } -log = { workspace = true } -sqllogictest = { workspace = true } -toml = { workspace = true } -serde = { workspace = true } -tracing = { workspace = true } -tokio = { workspace = true } +# Dependencies commented out - this crate is excluded from workspace +# anyhow = { workspace = true } +# async-trait = { workspace = true } +# datafusion = { workspace = true } +# datafusion-sqllogictest = { workspace = true } +# enum-ordinalize = { workspace = true } +# env_logger = { workspace = true } +# iceberg = { workspace = true } +# iceberg-datafusion = { workspace = true } +# indicatif = { workspace = true } +# log = { workspace = true } +# sqllogictest = { workspace = true } +# toml = { workspace = true } +# serde = { workspace = true } +# tracing = { workspace = true } +# tokio = { workspace = true } [dev-dependencies] -libtest-mimic = { workspace = true } +# libtest-mimic = { workspace = true } [[test]] harness = false From 140c916d6272a5bc371b0ff08251cbaf81aeb5fe Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:23:59 +0100 Subject: [PATCH 09/29] cargo fmt --- .../src/expr/visitors/page_index_evaluator.rs | 141 +++++++++++++----- .../src/writer/file_writer/parquet_writer.rs | 1 - 2 files changed, 104 insertions(+), 38 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index 1c7338fcb8..efb0e9aafa 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -260,7 +260,10 @@ impl<'a> PageIndexEvaluator<'a> { idx.max_value(page_idx).copied().map(|val| { Datum::new(field_type.clone(), PrimitiveLiteral::Boolean(val)) }), - PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), + PageNullCount::from_row_and_null_counts( + row_count, + idx.null_count(page_idx), + ), ) }) .collect(), @@ -268,11 +271,16 @@ impl<'a> PageIndexEvaluator<'a> { .zip(row_counts.iter()) .map(|(page_idx, &row_count)| { predicate( - idx.min_value(page_idx).copied() + idx.min_value(page_idx) + .copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Int(val))), - idx.max_value(page_idx).copied() + idx.max_value(page_idx) + .copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Int(val))), - PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), + PageNullCount::from_row_and_null_counts( + row_count, + idx.null_count(page_idx), + ), ) }) .collect(), @@ -280,11 +288,16 @@ impl<'a> PageIndexEvaluator<'a> { .zip(row_counts.iter()) .map(|(page_idx, &row_count)| { predicate( - idx.min_value(page_idx).copied() + idx.min_value(page_idx) + .copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Long(val))), - idx.max_value(page_idx).copied() + idx.max_value(page_idx) + .copied() .map(|val| Datum::new(field_type.clone(), PrimitiveLiteral::Long(val))), - PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), + PageNullCount::from_row_and_null_counts( + row_count, + idx.null_count(page_idx), + ), ) }) .collect(), @@ -304,7 +317,10 @@ impl<'a> PageIndexEvaluator<'a> { PrimitiveLiteral::Float(OrderedFloat::from(val)), ) }), - PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), + PageNullCount::from_row_and_null_counts( + row_count, + idx.null_count(page_idx), + ), ) }) .collect(), @@ -324,7 +340,10 @@ impl<'a> PageIndexEvaluator<'a> { PrimitiveLiteral::Double(OrderedFloat::from(val)), ) }), - PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), + PageNullCount::from_row_and_null_counts( + row_count, + idx.null_count(page_idx), + ), ) }) .collect(), @@ -335,20 +354,19 @@ impl<'a> PageIndexEvaluator<'a> { idx.min_value(page_idx).map(|val| { Datum::new( field_type.clone(), - PrimitiveLiteral::String( - String::from_utf8(val.to_vec()).unwrap(), - ), + PrimitiveLiteral::String(String::from_utf8(val.to_vec()).unwrap()), ) }), idx.max_value(page_idx).map(|val| { Datum::new( field_type.clone(), - PrimitiveLiteral::String( - String::from_utf8(val.to_vec()).unwrap(), - ), + PrimitiveLiteral::String(String::from_utf8(val.to_vec()).unwrap()), ) }), - PageNullCount::from_row_and_null_counts(row_count, idx.null_count(page_idx)), + PageNullCount::from_row_and_null_counts( + row_count, + idx.null_count(page_idx), + ), ) }) .collect(), @@ -1302,8 +1320,18 @@ mod tests { fn create_page_index() -> Result<(Vec, Vec)> { let mut idx_float_builder = ColumnIndexBuilder::new(ParquetPhysicalType::FLOAT); idx_float_builder.append(true, vec![], vec![], 1024); - idx_float_builder.append(false, 0.0f32.to_le_bytes().to_vec(), 10.0f32.to_le_bytes().to_vec(), 0); - idx_float_builder.append(false, 10.0f32.to_le_bytes().to_vec(), 20.0f32.to_le_bytes().to_vec(), 1); + idx_float_builder.append( + false, + 0.0f32.to_le_bytes().to_vec(), + 10.0f32.to_le_bytes().to_vec(), + 0, + ); + idx_float_builder.append( + false, + 10.0f32.to_le_bytes().to_vec(), + 20.0f32.to_le_bytes().to_vec(), + 1, + ); idx_float_builder.append(true, vec![], vec![], -1); let idx_float = idx_float_builder.build().unwrap(); @@ -1316,29 +1344,68 @@ mod tests { let idx_string = idx_string_builder.build().unwrap(); let page_locs_float = vec![ - PageLocation { offset: 0, compressed_page_size: 1024, first_row_index: 0 }, - PageLocation { offset: 1024, compressed_page_size: 1024, first_row_index: 1024 }, - PageLocation { offset: 2048, compressed_page_size: 1024, first_row_index: 2048 }, - PageLocation { offset: 3072, compressed_page_size: 1024, first_row_index: 3072 }, + PageLocation { + offset: 0, + compressed_page_size: 1024, + first_row_index: 0, + }, + PageLocation { + offset: 1024, + compressed_page_size: 1024, + first_row_index: 1024, + }, + PageLocation { + offset: 2048, + compressed_page_size: 1024, + first_row_index: 2048, + }, + PageLocation { + offset: 3072, + compressed_page_size: 1024, + first_row_index: 3072, + }, ]; let page_locs_string = vec![ - PageLocation { offset: 0, compressed_page_size: 512, first_row_index: 0 }, - PageLocation { offset: 512, compressed_page_size: 512, first_row_index: 512 }, - PageLocation { offset: 1024, compressed_page_size: 2976, first_row_index: 1024 }, - PageLocation { offset: 4000, compressed_page_size: 48, first_row_index: 4000 }, - PageLocation { offset: 4048, compressed_page_size: 48, first_row_index: 4048 }, - ]; - - Ok((vec![idx_float, idx_string], vec![ - OffsetIndexMetaData { - page_locations: page_locs_float, - unencoded_byte_array_data_bytes: None, + PageLocation { + offset: 0, + compressed_page_size: 512, + first_row_index: 0, + }, + PageLocation { + offset: 512, + compressed_page_size: 512, + first_row_index: 512, + }, + PageLocation { + offset: 1024, + compressed_page_size: 2976, + first_row_index: 1024, + }, + PageLocation { + offset: 4000, + compressed_page_size: 48, + first_row_index: 4000, }, - OffsetIndexMetaData { - page_locations: page_locs_string, - unencoded_byte_array_data_bytes: None, + PageLocation { + offset: 4048, + compressed_page_size: 48, + first_row_index: 4048, }, - ])) + ]; + + Ok(( + vec![idx_float, idx_string], + vec![ + OffsetIndexMetaData { + page_locations: page_locs_float, + unencoded_byte_array_data_bytes: None, + }, + OffsetIndexMetaData { + page_locations: page_locs_string, + unencoded_byte_array_data_bytes: None, + }, + ], + )) } } diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs index 8edffa2ab8..411ea168ee 100644 --- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs +++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs @@ -346,7 +346,6 @@ impl ParquetWriter { Ok(data_files) } - /// `ParquetMetadata` to data file builder pub(crate) fn parquet_to_data_file_builder( schema: SchemaRef, From ed9e98c0fa92cb42277b203c17367910388adcd5 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:31:20 +0100 Subject: [PATCH 10/29] -||- --- .../src/expr/visitors/page_index_evaluator.rs | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index efb0e9aafa..e98e1c958e 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -1394,18 +1394,15 @@ mod tests { }, ]; - Ok(( - vec![idx_float, idx_string], - vec![ - OffsetIndexMetaData { - page_locations: page_locs_float, - unencoded_byte_array_data_bytes: None, - }, - OffsetIndexMetaData { - page_locations: page_locs_string, - unencoded_byte_array_data_bytes: None, - }, - ], - )) + Ok((vec![idx_float, idx_string], vec![ + OffsetIndexMetaData { + page_locations: page_locs_float, + unencoded_byte_array_data_bytes: None, + }, + OffsetIndexMetaData { + page_locations: page_locs_string, + unencoded_byte_array_data_bytes: None, + }, + ])) } } From 789c3de16813ab5b23048e919e6bc2a6f60c7a20 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:47:01 +0100 Subject: [PATCH 11/29] Fix clippy --- crates/iceberg/src/expr/visitors/page_index_evaluator.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index e98e1c958e..af13f3af2c 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -795,7 +795,6 @@ mod tests { use parquet::arrow::arrow_reader::RowSelector; use parquet::basic::{LogicalType as ParquetLogicalType, Type as ParquetPhysicalType}; - use parquet::data_type::ByteArray; use parquet::file::metadata::{ColumnChunkMetaData, ColumnIndexBuilder, RowGroupMetaData}; use parquet::file::page_index::column_index::ColumnIndexMetaData as Index; use parquet::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; From cf594651707a382db74b934e6dde9d127e3f703d Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:54:18 +0100 Subject: [PATCH 12/29] fix clippy --- Makefile | 4 ++-- crates/iceberg/src/delete_vector.rs | 2 +- crates/iceberg/src/inspect/metadata_table.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index ecdada3df5..bfa774f009 100644 --- a/Makefile +++ b/Makefile @@ -21,10 +21,10 @@ build: cargo build --all-targets --all-features --workspace check-fmt: - cargo fmt --all -- --check + cargo fmt --all -- --check check-clippy: - cargo clippy --all-targets --all-features --workspace -- -D warnings + cargo clippy --all-targets --all-features --workspace --exclude iceberg-datafusion --exclude iceberg-playground --exclude iceberg-sqllogictest -- -D warnings install-cargo-machete: cargo install cargo-machete@0.7.0 diff --git a/crates/iceberg/src/delete_vector.rs b/crates/iceberg/src/delete_vector.rs index 1040796034..d2fc33b6f9 100644 --- a/crates/iceberg/src/delete_vector.rs +++ b/crates/iceberg/src/delete_vector.rs @@ -36,7 +36,7 @@ impl DeleteVector { } } - pub fn iter(&self) -> DeleteVectorIterator { + pub fn iter(&self) -> DeleteVectorIterator<'_> { let outer = self.inner.bitmaps(); DeleteVectorIterator { outer, inner: None } } diff --git a/crates/iceberg/src/inspect/metadata_table.rs b/crates/iceberg/src/inspect/metadata_table.rs index 92571db181..d5e9d60869 100644 --- a/crates/iceberg/src/inspect/metadata_table.rs +++ b/crates/iceberg/src/inspect/metadata_table.rs @@ -71,12 +71,12 @@ impl<'a> MetadataTable<'a> { } /// Get the snapshots table. - pub fn snapshots(&self) -> SnapshotsTable { + pub fn snapshots(&self) -> SnapshotsTable<'_> { SnapshotsTable::new(self.0) } /// Get the manifests table. - pub fn manifests(&self) -> ManifestsTable { + pub fn manifests(&self) -> ManifestsTable<'_> { ManifestsTable::new(self.0) } } From dbfde330fa4a1583f4d6821fab5d23f9a03c552e Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 13:59:05 +0100 Subject: [PATCH 13/29] clippy in CI --- .github/workflows/bindings_python_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index e9eabda2cb..9e59e905cd 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -50,7 +50,7 @@ jobs: run: cargo fmt --all -- --check - name: Check clippy working-directory: "bindings/python" - run: cargo clippy --all-targets --all-features -- -D warnings + run: cargo clippy --all-targets --all-features --exclude iceberg-datafusion --exclude iceberg-playground --exclude iceberg-sqllogictest -- -D warnings check-python: runs-on: ubuntu-latest From c616e01d9c36fe57b73d3adc69039369c17ca2a9 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 14:01:37 +0100 Subject: [PATCH 14/29] fix CI clippy command --- .github/workflows/bindings_python_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 9e59e905cd..cc4e831c86 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -50,7 +50,7 @@ jobs: run: cargo fmt --all -- --check - name: Check clippy working-directory: "bindings/python" - run: cargo clippy --all-targets --all-features --exclude iceberg-datafusion --exclude iceberg-playground --exclude iceberg-sqllogictest -- -D warnings + run: cargo clippy --all-targets --all-features --workspace --exclude iceberg-datafusion --exclude iceberg-playground --exclude iceberg-sqllogictest -- -D warnings check-python: runs-on: ubuntu-latest From 2fe3774c69d1fce55d69daa7e325cf40a03217a2 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 14:11:06 +0100 Subject: [PATCH 15/29] switch working directory --- .github/workflows/bindings_python_ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index cc4e831c86..204f6c7f5b 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -49,7 +49,6 @@ jobs: working-directory: "bindings/python" run: cargo fmt --all -- --check - name: Check clippy - working-directory: "bindings/python" run: cargo clippy --all-targets --all-features --workspace --exclude iceberg-datafusion --exclude iceberg-playground --exclude iceberg-sqllogictest -- -D warnings check-python: From b118fbc87d3b73f1664453ba6bfe5b26d53033e2 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 14:19:41 +0100 Subject: [PATCH 16/29] Disable python bindings CI tests --- .github/workflows/bindings_python_ci.yml | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 204f6c7f5b..43557a7ee8 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -18,23 +18,7 @@ name: Bindings Python CI on: - push: - branches: - - main - pull_request: - paths: - - '**' # Include all files and directories in the repository by default. - - '!.github/ISSUE_TEMPLATE/**' # Exclude files and directories that don't impact tests or code like templates, metadata, and documentation. - - '!scripts/**' - - '!website/**' - - '!.asf.yml' - - '!.gitattributes' - - '!.gitignore' - - '!CONTRIBUTING.md' - - '!CHANGELOG.md' - - '!LICENSE' - - '!NOTICE' - - '!README.md' + workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} @@ -49,7 +33,8 @@ jobs: working-directory: "bindings/python" run: cargo fmt --all -- --check - name: Check clippy - run: cargo clippy --all-targets --all-features --workspace --exclude iceberg-datafusion --exclude iceberg-playground --exclude iceberg-sqllogictest -- -D warnings + working-directory: "bindings/python" + run: cargo clippy --all-targets --all-features -- -D warnings check-python: runs-on: ubuntu-latest From 7d8199b621b36d29a8b626ce04e9a376ae8d160f Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 14:26:22 +0100 Subject: [PATCH 17/29] Disable only certain jobs in python bindings CI --- .github/workflows/bindings_python_ci.yml | 67 ++++++------------------ 1 file changed, 17 insertions(+), 50 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 43557a7ee8..cfc01b01b0 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -18,24 +18,29 @@ name: Bindings Python CI on: - workflow_dispatch: + push: + branches: + - main + pull_request: + paths: + - '**' # Include all files and directories in the repository by default. + - '!.github/ISSUE_TEMPLATE/**' # Exclude files and directories that don't impact tests or code like templates, metadata, and documentation. + - '!scripts/**' + - '!website/**' + - '!.asf.yml' + - '!.gitattributes' + - '!.gitignore' + - '!CONTRIBUTING.md' + - '!CHANGELOG.md' + - '!LICENSE' + - '!NOTICE' + - '!README.md' concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: - check-rust: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - name: Check format - working-directory: "bindings/python" - run: cargo fmt --all -- --check - - name: Check clippy - working-directory: "bindings/python" - run: cargo clippy --all-targets --all-features -- -D warnings - check-python: runs-on: ubuntu-latest steps: @@ -55,41 +60,3 @@ jobs: working-directory: "bindings/python" run: | uvx ruff check . - - test: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: - - ubuntu-latest - - macos-latest - - windows-latest - steps: - - uses: actions/checkout@v5 - - uses: actions/setup-python@v6 - with: - python-version: 3.12 - - uses: PyO3/maturin-action@v1 - with: - working-directory: "bindings/python" - command: build - args: --out dist --sdist - - uses: astral-sh/setup-uv@v7 - with: - version: "0.9.3" - enable-cache: true - - name: Sync dependencies - working-directory: "bindings/python" - shell: bash - run: | - make install - - name: Install built wheel - working-directory: "bindings/python" - shell: bash - run: | - uv pip install --reinstall dist/pyiceberg_core-*.whl - - name: Run tests - working-directory: "bindings/python" - shell: bash - run: | - make test From cb2150a6a45c772f7e5b48661a322ff283f6e1db Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 14:27:04 +0100 Subject: [PATCH 18/29] Remove unused dependency --- crates/iceberg/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 895a5cf5e4..d903d4f14d 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -87,7 +87,6 @@ serde_repr = { workspace = true } serde_with = { workspace = true } smol = { workspace = true, optional = true } strum = { workspace = true, features = ["derive"] } -thrift = { workspace = true } tokio = { workspace = true, optional = false, features = ["sync"] } typed-builder = { workspace = true } url = { workspace = true } From ac7dbdd16e7c493e7d6950ac42844daf94373dfb Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 15:04:42 +0100 Subject: [PATCH 19/29] forgot cargo.lock --- Cargo.lock | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 33fde8721d..87ea976241 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2306,7 +2306,6 @@ dependencies = [ "smol", "strum", "tempfile", - "thrift", "tokio", "typed-builder", "url", @@ -3171,16 +3170,6 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "num_enum" version = "0.7.5" @@ -5025,15 +5014,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "threadpool" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" -dependencies = [ - "num_cpus", -] - [[package]] name = "thrift" version = "0.17.0" @@ -5042,9 +5022,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding 3.0.4", - "log", "ordered-float 2.10.1", - "threadpool", ] [[package]] From 9712482327b31922695081e19b2cc7d0b9ccb6de Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 16:43:13 +0100 Subject: [PATCH 20/29] Support for _pos using arrow reader row numbers feature --- crates/iceberg/src/arrow/reader.rs | 26 +- .../src/arrow/record_batch_transformer.rs | 59 +++- crates/iceberg/src/metadata_columns.rs | 36 ++- crates/iceberg/src/scan/incremental/mod.rs | 45 ++- crates/iceberg/src/scan/mod.rs | 281 +++++++++++++++++- 5 files changed, 434 insertions(+), 13 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 4bebf9b612..359d89eacc 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -57,7 +57,9 @@ use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; use crate::expr::{BoundPredicate, BoundReference}; use crate::io::{FileIO, FileMetadata, FileRead}; -use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field}; +use crate::metadata_columns::{ + RESERVED_FIELD_ID_FILE, RESERVED_FIELD_ID_UNDERSCORE_POS, is_metadata_field, row_pos_field, +}; use crate::runtime::spawn; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; use crate::spec::{Datum, NameMapping, NestedField, PrimitiveLiteral, PrimitiveType, Schema, Type}; @@ -246,13 +248,25 @@ impl ArrowReader { let delete_filter_rx = delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema)); + let mut virtual_columns = Vec::new(); + + // Check if _pos column is requested and add it as a virtual column + let has_pos_column = task + .project_field_ids + .iter() + .any(|&id| id == RESERVED_FIELD_ID_UNDERSCORE_POS); + if has_pos_column { + // Add _pos as a virtual column to be produced by the Parquet reader + virtual_columns.push(Arc::clone(row_pos_field())); + } + // Migrated tables lack field IDs, requiring us to inspect the schema to choose // between field-ID-based or position-based projection let initial_stream_builder = Self::create_parquet_record_batch_stream_builder( &task.data_file_path, file_io.clone(), should_load_page_index, - None, + Some(ArrowReaderOptions::new().with_virtual_columns(virtual_columns.clone())), // TODO @vustef: Did we have to clone? There's too much cloning... ) .await?; @@ -298,7 +312,9 @@ impl ArrowReader { add_fallback_field_ids_to_arrow_schema(initial_stream_builder.schema()) }; - let options = ArrowReaderOptions::new().with_schema(arrow_schema); + let options = ArrowReaderOptions::new() + .with_schema(arrow_schema) + .with_virtual_columns(virtual_columns); Self::create_parquet_record_batch_stream_builder( &task.data_file_path, @@ -345,6 +361,10 @@ impl ArrowReader { PrimitiveLiteral::String(task.data_file_path.clone()), )?; + if has_pos_column { + record_batch_transformer_builder = record_batch_transformer_builder.with_virtual_columns(vec![Arc::clone(row_pos_field())]); + } + if let (Some(partition_spec), Some(partition_data)) = (task.partition_spec.clone(), task.partition.clone()) { diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 1cee224903..396f0d9b9b 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -154,6 +154,7 @@ pub(crate) struct RecordBatchTransformerBuilder { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, constant_fields: HashMap, + virtual_fields: Vec, } impl RecordBatchTransformerBuilder { @@ -165,6 +166,7 @@ impl RecordBatchTransformerBuilder { snapshot_schema, projected_iceberg_field_ids: projected_iceberg_field_ids.to_vec(), constant_fields: HashMap::new(), + virtual_fields: Vec::new(), } } @@ -180,6 +182,11 @@ impl RecordBatchTransformerBuilder { Ok(self) } + pub(crate) fn with_virtual_columns(mut self, virtual_fields: Vec) -> Self { + self.virtual_fields = virtual_fields; + self + } + /// Set partition spec and data together for identifying identity-transformed partition columns. /// /// Both partition_spec and partition_data must be provided together since the spec defines @@ -207,6 +214,7 @@ impl RecordBatchTransformerBuilder { snapshot_schema: self.snapshot_schema, projected_iceberg_field_ids: self.projected_iceberg_field_ids, constant_fields: self.constant_fields, + virtual_fields: self.virtual_fields, batch_transform: None, } } @@ -250,6 +258,9 @@ pub(crate) struct RecordBatchTransformer { // Includes both virtual/metadata fields (like _file) and identity-partitioned fields // Avoids type conversions during batch processing constant_fields: HashMap, + // Virtual fields are metadata fields that are not present in the snapshot schema, + // but are present in the source schema (arrow reader produces them) + virtual_fields: Vec, // BatchTransform gets lazily constructed based on the schema of // the first RecordBatch we receive from the file @@ -292,6 +303,7 @@ impl RecordBatchTransformer { self.snapshot_schema.as_ref(), &self.projected_iceberg_field_ids, &self.constant_fields, + &self.virtual_fields, )?); self.process_record_batch(record_batch)? @@ -311,17 +323,35 @@ impl RecordBatchTransformer { snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], constant_fields: &HashMap, + virtual_fields: &[FieldRef], ) -> Result { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = Self::build_field_id_to_arrow_schema_map(&mapped_unprojected_arrow_schema)?; + // Build a map of virtual field IDs to virtual fields for quick lookup + let virtual_field_map: HashMap = virtual_fields + .iter() + .filter_map(|field| { + field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .map(|field_id| (field_id, Arc::clone(field))) + }) + .collect(); + // Create a new arrow schema by selecting fields from mapped_unprojected, // in the order of the field ids in projected_iceberg_field_ids - let fields: Result> = projected_iceberg_field_ids + let fields: Vec = projected_iceberg_field_ids .iter() .map(|field_id| { - // Check if this is a constant field (virtual or partition) + // Check if this is a virtual field from Parquet reader + if let Some(virtual_field) = virtual_field_map.get(field_id) { + return Ok(Arc::clone(virtual_field)); + } + + // Check if this is a constant field (metadata/virtual or partition) if constant_fields.contains_key(field_id) { // For metadata/virtual fields (like _file), get name from metadata_columns // For partition fields, get name from schema (they exist in schema) @@ -342,7 +372,7 @@ impl RecordBatchTransformer { Ok(Arc::new(constant_field)) } } else { - // Regular field - use schema as-is + // Regular field from snapshot schema Ok(field_id_to_mapped_schema_map .get(field_id) .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? @@ -350,9 +380,13 @@ impl RecordBatchTransformer { .clone()) } }) - .collect(); + .collect::>>()?; + + let target_schema = Arc::new(ArrowSchema::new(fields)); - let target_schema = Arc::new(ArrowSchema::new(fields?)); + // Extract virtual field IDs for passing to generate_transform_operations + let virtual_field_ids: std::collections::HashSet = + virtual_field_map.keys().copied().collect(); match Self::compare_schemas(source_schema, &target_schema) { SchemaComparison::Equivalent => Ok(BatchTransform::PassThrough), @@ -364,6 +398,7 @@ impl RecordBatchTransformer { projected_iceberg_field_ids, field_id_to_mapped_schema_map, constant_fields, + &virtual_field_ids, )?, target_schema, }), @@ -421,6 +456,7 @@ impl RecordBatchTransformer { projected_iceberg_field_ids: &[i32], field_id_to_mapped_schema_map: HashMap, constant_fields: &HashMap, + virtual_field_ids: &std::collections::HashSet, ) -> Result> { let field_id_to_source_schema_map = Self::build_field_id_to_arrow_schema_map(source_schema)?; @@ -439,6 +475,19 @@ impl RecordBatchTransformer { }); } + // Check if this is a virtual field from Parquet reader (like _pos) + // Virtual fields don't exist in snapshot schema, they come from source + if virtual_field_ids.contains(field_id) { + let source_index = field_id_to_source_schema_map + .get(field_id) + .map(|(_, idx)| *idx) + .ok_or(Error::new( + ErrorKind::Unexpected, + "Virtual field not found in source schema", + ))?; + return Ok(ColumnSource::PassThrough { source_index }); + } + let (target_field, _) = field_id_to_mapped_schema_map .get(field_id) diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index b80619375f..0698cbb201 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -27,7 +27,7 @@ use std::sync::Arc; use arrow_schema::{DataType, Field}; use once_cell::sync::Lazy; -use parquet::arrow::PARQUET_FIELD_ID_META_KEY; +use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, RowNumber}; use crate::{Error, ErrorKind, Result}; @@ -37,6 +37,12 @@ pub const RESERVED_FIELD_ID_FILE: i32 = i32::MAX - 1; /// Reserved column name for the file path metadata column pub const RESERVED_COL_NAME_FILE: &str = "_file"; +/// Reserved field ID for the row position (_pos) metadata column +pub const RESERVED_FIELD_ID_UNDERSCORE_POS: i32 = i32::MAX - 2; + +/// Reserved column name for the row position metadata column +pub const RESERVED_COL_NAME_UNDERSCORE_POS: &str = "_pos"; + /// Reserved field ID for the file_path column used in delete file reading (positional deletes) pub const RESERVED_FIELD_ID_FILE_PATH: i32 = i32::MAX - 200; @@ -67,6 +73,27 @@ pub fn file_field() -> &'static Arc { &FILE_FIELD } +/// Lazy-initialized Arrow Field definition for the _pos metadata column. +/// Used for row position within a file. +static ROW_POS_FIELD: Lazy> = Lazy::new(|| { + Arc::new( + Field::new(RESERVED_COL_NAME_UNDERSCORE_POS, DataType::Int64, false).with_metadata( + HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + RESERVED_FIELD_ID_UNDERSCORE_POS.to_string(), + )]), + ).with_extension_type(RowNumber), + ) +}); + +/// Returns the Arrow Field definition for the _pos metadata column. +/// +/// # Returns +/// A reference to the _pos field definition +pub fn row_pos_field() -> &'static Arc { + &ROW_POS_FIELD +} + /// Lazy-initialized Arrow Field definition for the pos metadata column. /// Used in positional delete records. static POS_FIELD: Lazy> = Lazy::new(|| { @@ -119,6 +146,7 @@ pub fn file_path_field() -> &'static Arc { pub fn get_metadata_field(field_id: i32) -> Result> { match field_id { RESERVED_FIELD_ID_FILE => Ok(Arc::clone(file_field())), + RESERVED_FIELD_ID_UNDERSCORE_POS => Ok(Arc::clone(row_pos_field())), RESERVED_FIELD_ID_FILE_PATH => Ok(Arc::clone(file_path_field())), RESERVED_FIELD_ID_POS => Ok(Arc::clone(pos_field())), _ => Err(Error::new( @@ -138,6 +166,7 @@ pub fn get_metadata_field(field_id: i32) -> Result> { pub fn get_metadata_field_id(column_name: &str) -> Result { match column_name { RESERVED_COL_NAME_FILE => Ok(RESERVED_FIELD_ID_FILE), + RESERVED_COL_NAME_UNDERSCORE_POS => Ok(RESERVED_FIELD_ID_UNDERSCORE_POS), RESERVED_COL_NAME_FILE_PATH => Ok(RESERVED_FIELD_ID_FILE_PATH), RESERVED_COL_NAME_POS => Ok(RESERVED_FIELD_ID_POS), _ => Err(Error::new( @@ -157,7 +186,10 @@ pub fn get_metadata_field_id(column_name: &str) -> Result { pub fn is_metadata_field(field_id: i32) -> bool { matches!( field_id, - RESERVED_FIELD_ID_FILE | RESERVED_FIELD_ID_FILE_PATH | RESERVED_FIELD_ID_POS + RESERVED_FIELD_ID_FILE + | RESERVED_FIELD_ID_UNDERSCORE_POS + | RESERVED_FIELD_ID_FILE_PATH + | RESERVED_FIELD_ID_POS ) } diff --git a/crates/iceberg/src/scan/incremental/mod.rs b/crates/iceberg/src/scan/incremental/mod.rs index d8e981ffd3..e9d5889d0f 100644 --- a/crates/iceberg/src/scan/incremental/mod.rs +++ b/crates/iceberg/src/scan/incremental/mod.rs @@ -29,7 +29,8 @@ use crate::arrow::{ use crate::delete_file_index::DeleteFileIndex; use crate::io::FileIO; use crate::metadata_columns::{ - RESERVED_COL_NAME_FILE, get_metadata_field_id, is_metadata_column_name, + RESERVED_COL_NAME_FILE, RESERVED_COL_NAME_UNDERSCORE_POS, get_metadata_field_id, + is_metadata_column_name, }; use crate::scan::DeleteFileContext; use crate::scan::cache::ExpressionEvaluatorCache; @@ -153,6 +154,48 @@ impl<'a> IncrementalTableScanBuilder<'a> { self } + /// Include the _pos metadata column in the incremental scan. + /// + /// This is a convenience method that adds the _pos column to the current selection. + /// If no columns are currently selected (select_all), this will select all columns plus _pos. + /// If specific columns are selected, this adds _pos to that selection. + /// + /// The _pos column contains the row position within the file, produced by the underlying + /// Parquet reader as a virtual column. + /// + /// # Example + /// ```no_run + /// # use iceberg::table::Table; + /// # async fn example(table: Table) -> iceberg::Result<()> { + /// // Select id, name, and _pos for incremental scan + /// let scan = table + /// .incremental_scan(None, None) + /// .select(["id", "name"]) + /// .with_pos_column() + /// .build()?; + /// # Ok(()) + /// # } + /// ``` + pub fn with_pos_column(mut self) -> Self { + let mut columns = self.column_names.unwrap_or_else(|| { + // No explicit selection - get all column names from schema + self.table + .metadata() + .current_schema() + .as_struct() + .fields() + .iter() + .map(|f| f.name.clone()) + .collect() + }); + + // Add _pos column + columns.push(RESERVED_COL_NAME_UNDERSCORE_POS.to_string()); + + self.column_names = Some(columns); + self + } + /// Set the `from_snapshot_id` for the incremental scan. pub fn from_snapshot_id(mut self, from_snapshot_id: i64) -> Self { self.from_snapshot_id = Some(from_snapshot_id); diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 8ec3112015..1ea90ecaca 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -40,7 +40,8 @@ use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluato use crate::expr::{Bind, BoundPredicate, Predicate}; use crate::io::FileIO; use crate::metadata_columns::{ - RESERVED_COL_NAME_FILE, get_metadata_field_id, is_metadata_column_name, + RESERVED_COL_NAME_FILE, RESERVED_COL_NAME_UNDERSCORE_POS, get_metadata_field_id, + is_metadata_column_name, }; use crate::runtime::spawn; use crate::spec::{DataContentType, SnapshotRef}; @@ -169,6 +170,48 @@ impl<'a> TableScanBuilder<'a> { self } + /// Include the _pos metadata column in the scan. + /// + /// This is a convenience method that adds the _pos column to the current selection. + /// If no columns are currently selected (select_all), this will select all columns plus _pos. + /// If specific columns are selected, this adds _pos to that selection. + /// + /// The _pos column contains the row position within the file, produced by the underlying + /// Parquet reader as a virtual column. + /// + /// # Example + /// ```no_run + /// # use iceberg::table::Table; + /// # async fn example(table: Table) -> iceberg::Result<()> { + /// // Select id, name, and _pos + /// let scan = table + /// .scan() + /// .select(["id", "name"]) + /// .with_pos_column() + /// .build()?; + /// # Ok(()) + /// # } + /// ``` + pub fn with_pos_column(mut self) -> Self { + let mut columns = self.column_names.unwrap_or_else(|| { + // No explicit selection - get all column names from schema + self.table + .metadata() + .current_schema() + .as_struct() + .fields() + .iter() + .map(|f| f.name.clone()) + .collect() + }); + + // Add _pos column + columns.push(RESERVED_COL_NAME_UNDERSCORE_POS.to_string()); + + self.column_names = Some(columns); + self + } + /// Set the snapshot to scan. When not set, it uses current snapshot. pub fn snapshot_id(mut self, snapshot_id: i64) -> Self { self.snapshot_id = Some(snapshot_id); @@ -632,7 +675,7 @@ pub mod tests { use crate::arrow::ArrowReaderBuilder; use crate::expr::{BoundPredicate, Reference}; use crate::io::{FileIO, OutputFile}; - use crate::metadata_columns::RESERVED_COL_NAME_FILE; + use crate::metadata_columns::{RESERVED_COL_NAME_FILE, RESERVED_COL_NAME_UNDERSCORE_POS}; use crate::scan::FileScanTask; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, Datum, Literal, ManifestEntry, @@ -2153,4 +2196,238 @@ pub mod tests { "_file column (duplicate) should use Utf8 type" ); } + + #[tokio::test] + async fn test_select_with_pos_column() { + use arrow_array::cast::AsArray; + + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Select regular columns plus the _pos column + let table_scan = fixture + .table + .scan() + .select(["x", RESERVED_COL_NAME_UNDERSCORE_POS]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Verify we have 2 columns: x and _pos + assert_eq!(batches[0].num_columns(), 2); + + // Verify the x column exists and has correct data + let x_col = batches[0].column_by_name("x").unwrap(); + let x_arr = x_col.as_primitive::(); + assert_eq!(x_arr.value(0), 1); + + // Verify the _pos column exists + let pos_col = batches[0].column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS); + assert!( + pos_col.is_some(), + "_pos column should be present in the batch" + ); + + // Verify the _pos column has correct data type (Int64 from RowNumber extension) + let pos_col = pos_col.unwrap(); + assert_eq!( + pos_col.data_type(), + &arrow_schema::DataType::Int64, + "_pos column should use Int64 type" + ); + + // Get the position values from the Int64Array + let pos_array = pos_col.as_primitive::(); + + // Verify first position is 0 + assert_eq!(pos_array.value(0), 0, "First row should have position 0"); + + // Verify positions are sequential + for i in 1..pos_array.len().min(10) { + assert_eq!( + pos_array.value(i), + i as i64, + "Row {} should have position {}", + i, + i + ); + } + } + + #[tokio::test] + async fn test_select_with_pos_and_file_columns() { + use arrow_array::cast::AsArray; + + let mut fixture = TableTestFixture::new(); + fixture.setup_manifest_files().await; + + // Test 1: _file first, then _pos + let table_scan = fixture + .table + .scan() + .select(["x", RESERVED_COL_NAME_FILE, "y", RESERVED_COL_NAME_UNDERSCORE_POS]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Verify we have 4 columns: x, _file, y, _pos + assert_eq!(batches[0].num_columns(), 4); + + // Verify column order + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), "x", "Column 0 should be x"); + assert_eq!( + schema.field(1).name(), + RESERVED_COL_NAME_FILE, + "Column 1 should be _file" + ); + assert_eq!(schema.field(2).name(), "y", "Column 2 should be y"); + assert_eq!( + schema.field(3).name(), + RESERVED_COL_NAME_UNDERSCORE_POS, + "Column 3 should be _pos" + ); + + // Verify data types + assert_eq!( + schema.field(1).data_type(), + &arrow_schema::DataType::Utf8, + "_file column should use Utf8 type" + ); + assert_eq!( + schema.field(3).data_type(), + &arrow_schema::DataType::Int64, + "_pos column should use Int64 type" + ); + + // Verify _file column has valid data + let file_col = batches[0].column_by_name(RESERVED_COL_NAME_FILE).unwrap(); + let file_array = file_col.as_string::(); + let file_path = file_array.value(0); + assert!( + file_path.ends_with(".parquet"), + "File path should end with .parquet" + ); + + // Verify _pos column has valid sequential data + let pos_col = batches[0] + .column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS) + .unwrap(); + let pos_array = pos_col.as_primitive::(); + assert_eq!(pos_array.value(0), 0, "First row should have position 0"); + for i in 1..pos_array.len().min(10) { + assert_eq!( + pos_array.value(i), + i as i64, + "Row {} should have position {}", + i, + i + ); + } + + // Test 2: _pos first, then _file (reversed order) + let table_scan = fixture + .table + .scan() + .select(["x", RESERVED_COL_NAME_UNDERSCORE_POS, "y", RESERVED_COL_NAME_FILE]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Verify we have 4 columns in the new order + assert_eq!(batches[0].num_columns(), 4); + + // Verify column order is now: x, _pos, y, _file + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), "x", "Column 0 should be x"); + assert_eq!( + schema.field(1).name(), + RESERVED_COL_NAME_UNDERSCORE_POS, + "Column 1 should be _pos" + ); + assert_eq!(schema.field(2).name(), "y", "Column 2 should be y"); + assert_eq!( + schema.field(3).name(), + RESERVED_COL_NAME_FILE, + "Column 3 should be _file" + ); + + // Verify data is still correct + let pos_col = batches[0] + .column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS) + .unwrap(); + let pos_array = pos_col.as_primitive::(); + assert_eq!(pos_array.value(0), 0, "First row should have position 0"); + + let file_col = batches[0].column_by_name(RESERVED_COL_NAME_FILE).unwrap(); + let file_array = file_col.as_string::(); + let file_path = file_array.value(0); + assert!( + file_path.ends_with(".parquet"), + "File path should end with .parquet" + ); + + // Test 3: Both at the start + let table_scan = fixture + .table + .scan() + .select([RESERVED_COL_NAME_FILE, RESERVED_COL_NAME_UNDERSCORE_POS, "x", "y"]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + assert_eq!(batches[0].num_columns(), 4); + let schema = batches[0].schema(); + assert_eq!( + schema.field(0).name(), + RESERVED_COL_NAME_FILE, + "Column 0 should be _file" + ); + assert_eq!( + schema.field(1).name(), + RESERVED_COL_NAME_UNDERSCORE_POS, + "Column 1 should be _pos" + ); + assert_eq!(schema.field(2).name(), "x", "Column 2 should be x"); + assert_eq!(schema.field(3).name(), "y", "Column 3 should be y"); + + // Test 4: Both at the end + let table_scan = fixture + .table + .scan() + .select(["x", "y", RESERVED_COL_NAME_UNDERSCORE_POS, RESERVED_COL_NAME_FILE]) + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + assert_eq!(batches[0].num_columns(), 4); + let schema = batches[0].schema(); + assert_eq!(schema.field(0).name(), "x", "Column 0 should be x"); + assert_eq!(schema.field(1).name(), "y", "Column 1 should be y"); + assert_eq!( + schema.field(2).name(), + RESERVED_COL_NAME_UNDERSCORE_POS, + "Column 2 should be _pos" + ); + assert_eq!( + schema.field(3).name(), + RESERVED_COL_NAME_FILE, + "Column 3 should be _file" + ); + } } From a708f111721029e8cd31ae199b897c431bc29e4b Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Wed, 19 Nov 2025 16:44:36 +0100 Subject: [PATCH 21/29] clippy and fmt --- crates/iceberg/src/arrow/reader.rs | 6 +++--- crates/iceberg/src/metadata_columns.rs | 8 ++++---- crates/iceberg/src/scan/mod.rs | 28 ++++++++++++++++++++++---- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 359d89eacc..0999d7aee4 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -253,8 +253,7 @@ impl ArrowReader { // Check if _pos column is requested and add it as a virtual column let has_pos_column = task .project_field_ids - .iter() - .any(|&id| id == RESERVED_FIELD_ID_UNDERSCORE_POS); + .contains(&RESERVED_FIELD_ID_UNDERSCORE_POS); if has_pos_column { // Add _pos as a virtual column to be produced by the Parquet reader virtual_columns.push(Arc::clone(row_pos_field())); @@ -362,7 +361,8 @@ impl ArrowReader { )?; if has_pos_column { - record_batch_transformer_builder = record_batch_transformer_builder.with_virtual_columns(vec![Arc::clone(row_pos_field())]); + record_batch_transformer_builder = record_batch_transformer_builder + .with_virtual_columns(vec![Arc::clone(row_pos_field())]); } if let (Some(partition_spec), Some(partition_data)) = diff --git a/crates/iceberg/src/metadata_columns.rs b/crates/iceberg/src/metadata_columns.rs index 0698cbb201..639d2028b9 100644 --- a/crates/iceberg/src/metadata_columns.rs +++ b/crates/iceberg/src/metadata_columns.rs @@ -77,12 +77,12 @@ pub fn file_field() -> &'static Arc { /// Used for row position within a file. static ROW_POS_FIELD: Lazy> = Lazy::new(|| { Arc::new( - Field::new(RESERVED_COL_NAME_UNDERSCORE_POS, DataType::Int64, false).with_metadata( - HashMap::from([( + Field::new(RESERVED_COL_NAME_UNDERSCORE_POS, DataType::Int64, false) + .with_metadata(HashMap::from([( PARQUET_FIELD_ID_META_KEY.to_string(), RESERVED_FIELD_ID_UNDERSCORE_POS.to_string(), - )]), - ).with_extension_type(RowNumber), + )])) + .with_extension_type(RowNumber), ) }); diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 1ea90ecaca..8dd3e3961b 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -2268,7 +2268,12 @@ pub mod tests { let table_scan = fixture .table .scan() - .select(["x", RESERVED_COL_NAME_FILE, "y", RESERVED_COL_NAME_UNDERSCORE_POS]) + .select([ + "x", + RESERVED_COL_NAME_FILE, + "y", + RESERVED_COL_NAME_UNDERSCORE_POS, + ]) .with_row_selection_enabled(true) .build() .unwrap(); @@ -2335,7 +2340,12 @@ pub mod tests { let table_scan = fixture .table .scan() - .select(["x", RESERVED_COL_NAME_UNDERSCORE_POS, "y", RESERVED_COL_NAME_FILE]) + .select([ + "x", + RESERVED_COL_NAME_UNDERSCORE_POS, + "y", + RESERVED_COL_NAME_FILE, + ]) .with_row_selection_enabled(true) .build() .unwrap(); @@ -2380,7 +2390,12 @@ pub mod tests { let table_scan = fixture .table .scan() - .select([RESERVED_COL_NAME_FILE, RESERVED_COL_NAME_UNDERSCORE_POS, "x", "y"]) + .select([ + RESERVED_COL_NAME_FILE, + RESERVED_COL_NAME_UNDERSCORE_POS, + "x", + "y", + ]) .with_row_selection_enabled(true) .build() .unwrap(); @@ -2407,7 +2422,12 @@ pub mod tests { let table_scan = fixture .table .scan() - .select(["x", "y", RESERVED_COL_NAME_UNDERSCORE_POS, RESERVED_COL_NAME_FILE]) + .select([ + "x", + "y", + RESERVED_COL_NAME_UNDERSCORE_POS, + RESERVED_COL_NAME_FILE, + ]) .with_row_selection_enabled(true) .build() .unwrap(); From dcb3960df6c616791517de8d2976344ec9c6a309 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 11:31:27 +0100 Subject: [PATCH 22/29] update arrow-rs --- Cargo.lock | 22 +++++++++++----------- Cargo.toml | 18 +++++++++--------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87ea976241..ed4954dbb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -136,7 +136,7 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow-arith" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -149,7 +149,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -166,7 +166,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "bytes", "half", @@ -177,7 +177,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -197,7 +197,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -209,7 +209,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -222,7 +222,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -234,12 +234,12 @@ dependencies = [ [[package]] name = "arrow-schema" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" [[package]] name = "arrow-select" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -252,7 +252,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3300,7 +3300,7 @@ dependencies = [ [[package]] name = "parquet" version = "57.0.0" -source = "git+https://github.com/apache/arrow-rs?rev=c5c8076398d62780b0c192c59a784e6196016ab8#c5c8076398d62780b0c192c59a784e6196016ab8" +source = "git+https://github.com/apache/arrow-rs?rev=fea605cb16f7524cb69a197bfa581a1d4f5fe5d0#fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" dependencies = [ "ahash 0.8.12", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index d97a059e27..9ca0ceff64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,14 +46,14 @@ rust-version = "1.87" anyhow = "1.0.72" apache-avro = { version = "0.20", features = ["zstandard"] } array-init = "2" -arrow-arith = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-array = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-cast = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-ord = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-schema = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-select = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } -arrow-string = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +arrow-arith = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-array = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-cast = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-ord = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-schema = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-select = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } +arrow-string = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } as-any = "0.3.2" async-trait = "0.1.89" aws-config = "1.8.7" @@ -105,7 +105,7 @@ num-bigint = "0.4.6" once_cell = "1.20" opendal = "0.54.0" ordered-float = "4" -parquet = { git = "https://github.com/apache/arrow-rs", rev = "c5c8076398d62780b0c192c59a784e6196016ab8" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "fea605cb16f7524cb69a197bfa581a1d4f5fe5d0" } pilota = "0.11.10" port_scanner = "0.1.5" pretty_assertions = "1.4" From 879b6e1f7660d645c226f7c1096642dcefdf7aa6 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 12:36:50 +0100 Subject: [PATCH 23/29] pos in incremental scan too --- crates/iceberg/src/arrow/incremental.rs | 36 +++- crates/iceberg/src/arrow/reader.rs | 4 +- crates/iceberg/src/scan/incremental/tests.rs | 178 ++++++++++++++++++- 3 files changed, 211 insertions(+), 7 deletions(-) diff --git a/crates/iceberg/src/arrow/incremental.rs b/crates/iceberg/src/arrow/incremental.rs index d9c079a2f8..072a757f4b 100644 --- a/crates/iceberg/src/arrow/incremental.rs +++ b/crates/iceberg/src/arrow/incremental.rs @@ -23,12 +23,16 @@ use arrow_schema::Schema as ArrowSchema; use futures::channel::mpsc::channel; use futures::stream::select; use futures::{Stream, StreamExt, TryStreamExt}; +use parquet::arrow::arrow_reader::ArrowReaderOptions; use crate::arrow::reader::process_record_batch_stream; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; use crate::arrow::{ArrowReader, StreamsInto}; use crate::delete_vector::DeleteVector; use crate::io::FileIO; +use crate::metadata_columns::{ + RESERVED_FIELD_ID_UNDERSCORE_POS, row_pos_field, +}; use crate::runtime::spawn; use crate::scan::ArrowRecordBatchStream; use crate::scan::incremental::{ @@ -172,11 +176,29 @@ async fn process_incremental_append_task( batch_size: Option, file_io: FileIO, ) -> Result { + let mut virtual_columns = Vec::new(); + + // Check if _pos column is requested and add it as a virtual column + let has_pos_column = task + .base + .project_field_ids + .contains(&RESERVED_FIELD_ID_UNDERSCORE_POS); + if has_pos_column { + // Add _pos as a virtual column to be produced by the Parquet reader + virtual_columns.push(Arc::clone(row_pos_field())); + } + + let arrow_reader_options = if !virtual_columns.is_empty() { + Some(ArrowReaderOptions::new().with_virtual_columns(virtual_columns.clone())?) + } else { + None + }; + let mut record_batch_stream_builder = ArrowReader::create_parquet_record_batch_stream_builder( &task.base.data_file_path, file_io, true, - None, // arrow_reader_options + arrow_reader_options, ) .await?; @@ -194,13 +216,19 @@ async fn process_incremental_append_task( // RecordBatchTransformer performs any transformations required on the RecordBatches // that come back from the file, such as type promotion, default column insertion, // column re-ordering, and virtual field addition (like _file) - let mut record_batch_transformer = + let mut record_batch_transformer_builder = RecordBatchTransformerBuilder::new(task.schema_ref(), &task.base.project_field_ids) .with_constant( crate::metadata_columns::RESERVED_FIELD_ID_FILE, crate::spec::PrimitiveLiteral::String(task.base.data_file_path.clone()), - )? - .build(); + )?; + + if has_pos_column { + record_batch_transformer_builder = record_batch_transformer_builder + .with_virtual_columns(vec![Arc::clone(row_pos_field())]); + } + + let mut record_batch_transformer = record_batch_transformer_builder.build(); if let Some(batch_size) = batch_size { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 0999d7aee4..eaf4618297 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -265,7 +265,7 @@ impl ArrowReader { &task.data_file_path, file_io.clone(), should_load_page_index, - Some(ArrowReaderOptions::new().with_virtual_columns(virtual_columns.clone())), // TODO @vustef: Did we have to clone? There's too much cloning... + Some(ArrowReaderOptions::new().with_virtual_columns(virtual_columns.clone())?), // TODO @vustef: Did we have to clone? There's too much cloning... ) .await?; @@ -313,7 +313,7 @@ impl ArrowReader { let options = ArrowReaderOptions::new() .with_schema(arrow_schema) - .with_virtual_columns(virtual_columns); + .with_virtual_columns(virtual_columns)?; Self::create_parquet_record_batch_stream_builder( &task.data_file_path, diff --git a/crates/iceberg/src/scan/incremental/tests.rs b/crates/iceberg/src/scan/incremental/tests.rs index 4dad7d77e5..708a74e7a5 100644 --- a/crates/iceberg/src/scan/incremental/tests.rs +++ b/crates/iceberg/src/scan/incremental/tests.rs @@ -31,7 +31,7 @@ use uuid::Uuid; use crate::TableIdent; use crate::io::{FileIO, OutputFile}; -use crate::metadata_columns::RESERVED_COL_NAME_FILE; +use crate::metadata_columns::{RESERVED_COL_NAME_FILE, RESERVED_COL_NAME_UNDERSCORE_POS}; use crate::spec::{ DataContentType, DataFileBuilder, DataFileFormat, ManifestEntry, ManifestListWriter, ManifestStatus, ManifestWriterBuilder, PartitionSpec, SchemaRef, Struct, TableMetadata, @@ -2529,3 +2529,179 @@ async fn test_incremental_scan_with_file_column() { } } } + +#[tokio::test] +async fn test_incremental_select_with_pos_column() { + use arrow_array::cast::AsArray; + + // Create a fixture with test data + let fixture = IncrementalTestFixture::new(vec![ + Operation::Add(vec![], "empty.parquet".to_string()), // Snapshot 1: empty + Operation::Add( + vec![ + (1, "a".to_string()), + (2, "b".to_string()), + (3, "c".to_string()), + ], + "data-1.parquet".to_string(), + ), // Snapshot 2 + ]) + .await; + + // Build an incremental scan with _pos column + let scan = fixture + .table + .incremental_scan(Some(1), Some(2)) + .select(["n", RESERVED_COL_NAME_UNDERSCORE_POS]) + .build() + .unwrap(); + + let stream = scan.to_arrow().await.unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + + // Get append batches (we're only appending in this test) + let append_batches: Vec<_> = batches + .iter() + .filter(|(t, _)| *t == crate::arrow::IncrementalBatchType::Append) + .map(|(_, b)| b.clone()) + .collect(); + + // Verify we have append batches + assert!(!append_batches.is_empty(), "Should have append batches"); + + for batch in append_batches { + // Should have 2 columns: n and _pos + assert_eq!( + batch.num_columns(), + 2, + "Should have n and _pos columns" + ); + + // Verify the n column exists + assert!(batch.column_by_name("n").is_some(), "n column should exist"); + + // Verify the _pos column exists + let pos_col = batch.column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS); + assert!( + pos_col.is_some(), + "_pos column should be present in the batch" + ); + + // Verify the _pos column has correct data type (Int64 from RowNumber extension) + let pos_col = pos_col.unwrap(); + assert_eq!( + pos_col.data_type(), + &arrow_schema::DataType::Int64, + "_pos column should use Int64 type" + ); + + // Get the position values from the Int64Array + let pos_array = pos_col.as_primitive::(); + + // Verify first position is 0 + assert_eq!(pos_array.value(0), 0, "First row should have position 0"); + + // Verify positions are sequential + for i in 1..pos_array.len() { + assert_eq!( + pos_array.value(i), + i as i64, + "Row {} should have position {}", + i, + i + ); + } + } +} + +#[tokio::test] +async fn test_incremental_select_with_pos_and_file_columns() { + use arrow_array::cast::AsArray; + + // Create a fixture with test data + let fixture = IncrementalTestFixture::new(vec![ + Operation::Add(vec![], "empty.parquet".to_string()), // Snapshot 1: empty + Operation::Add( + vec![ + (1, "a".to_string()), + (2, "b".to_string()), + (3, "c".to_string()), + ], + "data-1.parquet".to_string(), + ), // Snapshot 2 + ]) + .await; + + // Build an incremental scan with both _pos and _file columns + let scan = fixture + .table + .incremental_scan(Some(1), Some(2)) + .select([ + "n", + RESERVED_COL_NAME_FILE, + "data", + RESERVED_COL_NAME_UNDERSCORE_POS, + ]) + .build() + .unwrap(); + + let stream = scan.to_arrow().await.unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + + // Get append batches + let append_batches: Vec<_> = batches + .iter() + .filter(|(t, _)| *t == crate::arrow::IncrementalBatchType::Append) + .map(|(_, b)| b.clone()) + .collect(); + + // Verify we have append batches + assert!(!append_batches.is_empty(), "Should have append batches"); + + for batch in append_batches { + // Should have 4 columns: n, _file, data, _pos + assert_eq!( + batch.num_columns(), + 4, + "Should have n, _file, data, and _pos columns" + ); + + // Verify all columns exist + assert!(batch.column_by_name("n").is_some()); + assert!(batch.column_by_name(RESERVED_COL_NAME_FILE).is_some()); + assert!(batch.column_by_name("data").is_some()); + assert!(batch.column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS).is_some()); + + // Verify the _pos column has correct data type + let pos_col = batch.column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS).unwrap(); + assert_eq!( + pos_col.data_type(), + &arrow_schema::DataType::Int64, + "_pos column should use Int64 type" + ); + + // Verify positions are sequential starting from 0 + let pos_array = pos_col.as_primitive::(); + for i in 0..pos_array.len() { + assert_eq!( + pos_array.value(i), + i as i64, + "Row {} should have position {}", + i, + i + ); + } + + // Verify _file column contains a valid file path + let file_col = batch.column_by_name(RESERVED_COL_NAME_FILE).unwrap(); + let string_array = file_col.as_string::(); + for i in 0..batch.num_rows() { + let file_path = string_array.value(i); + assert!( + file_path.ends_with(".parquet"), + "File path should end with .parquet: {}", + file_path + ); + } + } +} From 57b1d54c3ec7013edf058da5de171c5a68a3e50a Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 12:43:37 +0100 Subject: [PATCH 24/29] with_virtual for adding to hash map --- crates/iceberg/src/arrow/incremental.rs | 2 +- crates/iceberg/src/arrow/reader.rs | 2 +- .../src/arrow/record_batch_transformer.rs | 55 ++++++++++--------- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/crates/iceberg/src/arrow/incremental.rs b/crates/iceberg/src/arrow/incremental.rs index 072a757f4b..acbbfc2791 100644 --- a/crates/iceberg/src/arrow/incremental.rs +++ b/crates/iceberg/src/arrow/incremental.rs @@ -225,7 +225,7 @@ async fn process_incremental_append_task( if has_pos_column { record_batch_transformer_builder = record_batch_transformer_builder - .with_virtual_columns(vec![Arc::clone(row_pos_field())]); + .with_virtual_field(Arc::clone(row_pos_field()))?; } let mut record_batch_transformer = record_batch_transformer_builder.build(); diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index eaf4618297..d87da11c8d 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -362,7 +362,7 @@ impl ArrowReader { if has_pos_column { record_batch_transformer_builder = record_batch_transformer_builder - .with_virtual_columns(vec![Arc::clone(row_pos_field())]); + .with_virtual_field(Arc::clone(row_pos_field()))?; } if let (Some(partition_spec), Some(partition_data)) = diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 396f0d9b9b..ca6e14acd2 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -154,7 +154,7 @@ pub(crate) struct RecordBatchTransformerBuilder { snapshot_schema: Arc, projected_iceberg_field_ids: Vec, constant_fields: HashMap, - virtual_fields: Vec, + virtual_fields: HashMap, } impl RecordBatchTransformerBuilder { @@ -166,7 +166,7 @@ impl RecordBatchTransformerBuilder { snapshot_schema, projected_iceberg_field_ids: projected_iceberg_field_ids.to_vec(), constant_fields: HashMap::new(), - virtual_fields: Vec::new(), + virtual_fields: HashMap::new(), } } @@ -182,9 +182,25 @@ impl RecordBatchTransformerBuilder { Ok(self) } - pub(crate) fn with_virtual_columns(mut self, virtual_fields: Vec) -> Self { - self.virtual_fields = virtual_fields; - self + /// Add a virtual field for a specific field ID. + /// This is used for virtual/metadata fields like _pos that are produced by the Parquet reader. + /// + /// # Arguments + /// * `field` - The Arrow field representing the virtual column + pub(crate) fn with_virtual_field(mut self, field: FieldRef) -> Result { + // Extract field ID from metadata + let field_id = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "Virtual field must have a field ID in metadata", + ) + })?; + self.virtual_fields.insert(field_id, field); + Ok(self) } /// Set partition spec and data together for identifying identity-transformed partition columns. @@ -260,7 +276,8 @@ pub(crate) struct RecordBatchTransformer { constant_fields: HashMap, // Virtual fields are metadata fields that are not present in the snapshot schema, // but are present in the source schema (arrow reader produces them) - virtual_fields: Vec, + // Map from field_id to FieldRef + virtual_fields: HashMap, // BatchTransform gets lazily constructed based on the schema of // the first RecordBatch we receive from the file @@ -323,31 +340,19 @@ impl RecordBatchTransformer { snapshot_schema: &IcebergSchema, projected_iceberg_field_ids: &[i32], constant_fields: &HashMap, - virtual_fields: &[FieldRef], + virtual_fields: &HashMap, ) -> Result { let mapped_unprojected_arrow_schema = Arc::new(schema_to_arrow_schema(snapshot_schema)?); let field_id_to_mapped_schema_map = Self::build_field_id_to_arrow_schema_map(&mapped_unprojected_arrow_schema)?; - // Build a map of virtual field IDs to virtual fields for quick lookup - let virtual_field_map: HashMap = virtual_fields - .iter() - .filter_map(|field| { - field - .metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|id_str| id_str.parse::().ok()) - .map(|field_id| (field_id, Arc::clone(field))) - }) - .collect(); - // Create a new arrow schema by selecting fields from mapped_unprojected, // in the order of the field ids in projected_iceberg_field_ids let fields: Vec = projected_iceberg_field_ids .iter() .map(|field_id| { // Check if this is a virtual field from Parquet reader - if let Some(virtual_field) = virtual_field_map.get(field_id) { + if let Some(virtual_field) = virtual_fields.get(field_id) { return Ok(Arc::clone(virtual_field)); } @@ -384,10 +389,6 @@ impl RecordBatchTransformer { let target_schema = Arc::new(ArrowSchema::new(fields)); - // Extract virtual field IDs for passing to generate_transform_operations - let virtual_field_ids: std::collections::HashSet = - virtual_field_map.keys().copied().collect(); - match Self::compare_schemas(source_schema, &target_schema) { SchemaComparison::Equivalent => Ok(BatchTransform::PassThrough), SchemaComparison::NameChangesOnly => Ok(BatchTransform::ModifySchema { target_schema }), @@ -398,7 +399,7 @@ impl RecordBatchTransformer { projected_iceberg_field_ids, field_id_to_mapped_schema_map, constant_fields, - &virtual_field_ids, + virtual_fields, )?, target_schema, }), @@ -456,7 +457,7 @@ impl RecordBatchTransformer { projected_iceberg_field_ids: &[i32], field_id_to_mapped_schema_map: HashMap, constant_fields: &HashMap, - virtual_field_ids: &std::collections::HashSet, + virtual_fields: &HashMap, ) -> Result> { let field_id_to_source_schema_map = Self::build_field_id_to_arrow_schema_map(source_schema)?; @@ -477,7 +478,7 @@ impl RecordBatchTransformer { // Check if this is a virtual field from Parquet reader (like _pos) // Virtual fields don't exist in snapshot schema, they come from source - if virtual_field_ids.contains(field_id) { + if virtual_fields.contains_key(field_id) { let source_index = field_id_to_source_schema_map .get(field_id) .map(|(_, idx)| *idx) From 1a6d3490401251e2a63c1f3c57a2091d947000f7 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 12:46:17 +0100 Subject: [PATCH 25/29] remove todo comment, it's fine --- crates/iceberg/src/arrow/reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index d87da11c8d..f275dbbc74 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -265,7 +265,7 @@ impl ArrowReader { &task.data_file_path, file_io.clone(), should_load_page_index, - Some(ArrowReaderOptions::new().with_virtual_columns(virtual_columns.clone())?), // TODO @vustef: Did we have to clone? There's too much cloning... + Some(ArrowReaderOptions::new().with_virtual_columns(virtual_columns.clone())?), ) .await?; From aa28a83245745150c339b6c27280ea2099437d92 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 12:52:33 +0100 Subject: [PATCH 26/29] revert back some changes --- crates/iceberg/src/arrow/record_batch_transformer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index ca6e14acd2..444e46775d 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -348,7 +348,7 @@ impl RecordBatchTransformer { // Create a new arrow schema by selecting fields from mapped_unprojected, // in the order of the field ids in projected_iceberg_field_ids - let fields: Vec = projected_iceberg_field_ids + let fields: Result> = projected_iceberg_field_ids .iter() .map(|field_id| { // Check if this is a virtual field from Parquet reader @@ -385,9 +385,9 @@ impl RecordBatchTransformer { .clone()) } }) - .collect::>>()?; + .collect(); - let target_schema = Arc::new(ArrowSchema::new(fields)); + let target_schema = Arc::new(ArrowSchema::new(fields?)); match Self::compare_schemas(source_schema, &target_schema) { SchemaComparison::Equivalent => Ok(BatchTransform::PassThrough), From ace9baf91cbc67ee74a9a53b8bafc55df63e805f Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 12:53:53 +0100 Subject: [PATCH 27/29] -||- --- crates/iceberg/src/arrow/record_batch_transformer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/record_batch_transformer.rs b/crates/iceberg/src/arrow/record_batch_transformer.rs index 444e46775d..4b4010bbba 100644 --- a/crates/iceberg/src/arrow/record_batch_transformer.rs +++ b/crates/iceberg/src/arrow/record_batch_transformer.rs @@ -377,7 +377,7 @@ impl RecordBatchTransformer { Ok(Arc::new(constant_field)) } } else { - // Regular field from snapshot schema + // Regular field - use schema as-is Ok(field_id_to_mapped_schema_map .get(field_id) .ok_or(Error::new(ErrorKind::Unexpected, "field not found"))? From d75584d92f72e49b43ebbe9f2faafa0d5732e151 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 13:03:00 +0100 Subject: [PATCH 28/29] add `with_pos_column` in tests --- crates/iceberg/src/scan/incremental/tests.rs | 59 ++++++++++++++++++++ crates/iceberg/src/scan/mod.rs | 44 +++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/crates/iceberg/src/scan/incremental/tests.rs b/crates/iceberg/src/scan/incremental/tests.rs index 708a74e7a5..23718835c0 100644 --- a/crates/iceberg/src/scan/incremental/tests.rs +++ b/crates/iceberg/src/scan/incremental/tests.rs @@ -2611,6 +2611,65 @@ async fn test_incremental_select_with_pos_column() { i ); } + + // Test variant 2: Use with_pos_column() method instead of selecting by name + let scan = fixture + .table + .incremental_scan(Some(1), Some(2)) + .select(["n"]) + .with_pos_column() + .build() + .unwrap(); + + let stream = scan.to_arrow().await.unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + + // Get append batches + let append_batches: Vec<_> = batches + .iter() + .filter(|(t, _)| *t == crate::arrow::IncrementalBatchType::Append) + .map(|(_, b)| b.clone()) + .collect(); + + // Verify we have append batches + assert!(!append_batches.is_empty(), "Should have append batches"); + + for batch in append_batches { + // Should have 2 columns: n and _pos + assert_eq!( + batch.num_columns(), + 2, + "Should have n and _pos columns when using with_pos_column()" + ); + + // Verify the _pos column exists + let pos_col = batch.column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS); + assert!( + pos_col.is_some(), + "_pos column should be present when using with_pos_column()" + ); + + // Verify the _pos column has correct data type + let pos_col = pos_col.unwrap(); + assert_eq!( + pos_col.data_type(), + &arrow_schema::DataType::Int64, + "_pos column should use Int64 type" + ); + + // Verify positions are sequential + let pos_array = pos_col.as_primitive::(); + assert_eq!(pos_array.value(0), 0, "First row should have position 0"); + for i in 1..pos_array.len() { + assert_eq!( + pos_array.value(i), + i as i64, + "Row {} should have position {}", + i, + i + ); + } + } } } diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 8dd3e3961b..14af028edd 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -2255,6 +2255,50 @@ pub mod tests { i ); } + + // Test variant 2: Use with_pos_column() method instead of selecting by name + let table_scan = fixture + .table + .scan() + .select(["x"]) + .with_pos_column() + .with_row_selection_enabled(true) + .build() + .unwrap(); + + let batch_stream = table_scan.to_arrow().await.unwrap(); + let batches: Vec<_> = batch_stream.try_collect().await.unwrap(); + + // Verify we have 2 columns: x and _pos + assert_eq!(batches[0].num_columns(), 2); + + // Verify the _pos column exists + let pos_col = batches[0].column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS); + assert!( + pos_col.is_some(), + "_pos column should be present when using with_pos_column()" + ); + + // Verify the _pos column has correct data type + let pos_col = pos_col.unwrap(); + assert_eq!( + pos_col.data_type(), + &arrow_schema::DataType::Int64, + "_pos column should use Int64 type" + ); + + // Verify positions are sequential + let pos_array = pos_col.as_primitive::(); + assert_eq!(pos_array.value(0), 0, "First row should have position 0"); + for i in 1..pos_array.len().min(10) { + assert_eq!( + pos_array.value(i), + i as i64, + "Row {} should have position {}", + i, + i + ); + } } #[tokio::test] From 5003e5d01cc081fbaeaedb3e4b9321c7727c5ae3 Mon Sep 17 00:00:00 2001 From: Vukasin Stefanovic Date: Thu, 20 Nov 2025 13:03:22 +0100 Subject: [PATCH 29/29] cargo fmt --- crates/iceberg/src/arrow/incremental.rs | 8 +++----- crates/iceberg/src/arrow/reader.rs | 4 ++-- crates/iceberg/src/scan/incremental/tests.rs | 16 +++++++++------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crates/iceberg/src/arrow/incremental.rs b/crates/iceberg/src/arrow/incremental.rs index acbbfc2791..e2eba958f5 100644 --- a/crates/iceberg/src/arrow/incremental.rs +++ b/crates/iceberg/src/arrow/incremental.rs @@ -30,9 +30,7 @@ use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; use crate::arrow::{ArrowReader, StreamsInto}; use crate::delete_vector::DeleteVector; use crate::io::FileIO; -use crate::metadata_columns::{ - RESERVED_FIELD_ID_UNDERSCORE_POS, row_pos_field, -}; +use crate::metadata_columns::{RESERVED_FIELD_ID_UNDERSCORE_POS, row_pos_field}; use crate::runtime::spawn; use crate::scan::ArrowRecordBatchStream; use crate::scan::incremental::{ @@ -224,8 +222,8 @@ async fn process_incremental_append_task( )?; if has_pos_column { - record_batch_transformer_builder = record_batch_transformer_builder - .with_virtual_field(Arc::clone(row_pos_field()))?; + record_batch_transformer_builder = + record_batch_transformer_builder.with_virtual_field(Arc::clone(row_pos_field()))?; } let mut record_batch_transformer = record_batch_transformer_builder.build(); diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index f275dbbc74..d6cc8ae9ca 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -361,8 +361,8 @@ impl ArrowReader { )?; if has_pos_column { - record_batch_transformer_builder = record_batch_transformer_builder - .with_virtual_field(Arc::clone(row_pos_field()))?; + record_batch_transformer_builder = + record_batch_transformer_builder.with_virtual_field(Arc::clone(row_pos_field()))?; } if let (Some(partition_spec), Some(partition_data)) = diff --git a/crates/iceberg/src/scan/incremental/tests.rs b/crates/iceberg/src/scan/incremental/tests.rs index 23718835c0..22fe3142cd 100644 --- a/crates/iceberg/src/scan/incremental/tests.rs +++ b/crates/iceberg/src/scan/incremental/tests.rs @@ -2571,11 +2571,7 @@ async fn test_incremental_select_with_pos_column() { for batch in append_batches { // Should have 2 columns: n and _pos - assert_eq!( - batch.num_columns(), - 2, - "Should have n and _pos columns" - ); + assert_eq!(batch.num_columns(), 2, "Should have n and _pos columns"); // Verify the n column exists assert!(batch.column_by_name("n").is_some(), "n column should exist"); @@ -2729,10 +2725,16 @@ async fn test_incremental_select_with_pos_and_file_columns() { assert!(batch.column_by_name("n").is_some()); assert!(batch.column_by_name(RESERVED_COL_NAME_FILE).is_some()); assert!(batch.column_by_name("data").is_some()); - assert!(batch.column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS).is_some()); + assert!( + batch + .column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS) + .is_some() + ); // Verify the _pos column has correct data type - let pos_col = batch.column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS).unwrap(); + let pos_col = batch + .column_by_name(RESERVED_COL_NAME_UNDERSCORE_POS) + .unwrap(); assert_eq!( pos_col.data_type(), &arrow_schema::DataType::Int64,