Merged: 38 commits (all by mbutrovich)
39c3c4e  stash (Mar 4, 2026)
0edf710  add iceberg and opendal as features (Mar 4, 2026)
c96422e  cargo fmt (Mar 4, 2026)
eca9b33  clippy fixes (Mar 4, 2026)
b5ff641  bump to iceberg-rust df53 branch, clippy fixes (Mar 4, 2026)
4258300  bump to iceberg-rust df53 branch, clippy fixes (Mar 4, 2026)
efa9437  fix fileIO construction (Mar 4, 2026)
a548871  update deps (Mar 4, 2026)
f466c4b  Merge branch 'main' into df53 (Mar 4, 2026)
faf1c56  Bump to 53.0.0-rc2. (Mar 17, 2026)
a31c1a3  Merge branch 'main' into df53 (Mar 17, 2026)
5a70cc9  Merge in upstream/main. (Mar 17, 2026)
eb3198f  Fix native/core/Cargo.toml. (Mar 17, 2026)
0de1381  Fix native test failures, clippy. (Mar 17, 2026)
7257a24  Update to use object_store 0.13 in hdfs.rs. (Mar 17, 2026)
43059fd  Fix memory pool issues. (Mar 17, 2026)
1794cfd  Fix type coercion for Utf8View stuff. (Mar 17, 2026)
6cfe190  Bump to released crates. (Mar 31, 2026)
3b40b81  bump to iceberg-rust main commit with df53 (Mar 31, 2026)
eb64e29  Merge remote-tracking branch 'apache/main' into df53 (Mar 31, 2026)
a359213  putting missing file back (Mar 31, 2026)
88e4bf4  fix (Mar 31, 2026)
de3faa0  fix native test (Mar 31, 2026)
1105bb2  workaround array_compact and array_repeat failures (Apr 1, 2026)
a2f5257  workaround array_compact and array_repeat failures (Apr 1, 2026)
7684997  bump opendal to commit on main with upgraded object_store. (Apr 1, 2026)
9d6f776  Merge branch 'main' into df53 (Apr 1, 2026)
28d98e2  Merge branch 'main' into df53 (Apr 3, 2026)
1413355  cargo update (Apr 3, 2026)
2be791c  fix mapping issue with native_datafusion (Apr 3, 2026)
51ebbc3  make test consistent with others in the file (Apr 3, 2026)
18efeb4  add fallback for SPARK-39393 test. (Apr 3, 2026)
38dc46a  update docs (Apr 3, 2026)
c73db1f  fix spotless. (Apr 3, 2026)
b3e2785  Merge branch 'main' into df53 (Apr 3, 2026)
e84b109  Merge branch 'main' into df53 (Apr 6, 2026)
e518d16  rename from PR feedback (Apr 7, 2026)
e144ff3  Merge branch 'main' into df53 (Apr 7, 2026)

862 changes: 539 additions & 323 deletions native/Cargo.lock

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions native/Cargo.toml
@@ -34,14 +34,14 @@ edition = "2021"
rust-version = "1.88"

[workspace.dependencies]
-arrow = { version = "57.3.0", features = ["prettyprint", "ffi", "chrono-tz"] }
+arrow = { version = "58.1.0", features = ["prettyprint", "ffi", "chrono-tz"] }
async-trait = { version = "0.1" }
bytes = { version = "1.11.1" }
-parquet = { version = "57.3.0", default-features = false, features = ["experimental"] }
-datafusion = { version = "52.4.0", default-features = false, features = ["unicode_expressions", "crypto_expressions", "nested_expressions", "parquet"] }
-datafusion-datasource = { version = "52.4.0" }
-datafusion-physical-expr-adapter = { version = "52.4.0" }
-datafusion-spark = { version = "52.4.0" }
+parquet = { version = "58.1.0", default-features = false, features = ["experimental"] }
+datafusion = { version = "53.0.0", default-features = false, features = ["unicode_expressions", "crypto_expressions", "nested_expressions", "parquet"] }
+datafusion-datasource = { version = "53.0.0" }
+datafusion-physical-expr-adapter = { version = "53.0.0" }
+datafusion-spark = { version = "53.0.0", features = ["core"] }
datafusion-comet-spark-expr = { path = "spark-expr" }
datafusion-comet-common = { path = "common" }
datafusion-comet-jni-bridge = { path = "jni-bridge" }
@@ -54,12 +54,12 @@ num = "0.4"
rand = "0.10"
regex = "1.12.3"
thiserror = "2"
-object_store = { version = "0.12.3", features = ["gcp", "azure", "aws", "http"] }
+object_store = { version = "0.13.1", features = ["gcp", "azure", "aws", "http"] }
url = "2.2"
aws-config = "1.8.14"
aws-credential-types = "1.2.13"
-iceberg = { git = "https://github.com/apache/iceberg-rust", tag = "v0.9.0-rc.1" }
-iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", tag = "v0.9.0-rc.1", features = ["opendal-all"] }
+iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "477a1e5" }
+iceberg-storage-opendal = { git = "https://github.com/apache/iceberg-rust", rev = "477a1e5", features = ["opendal-all"] }

[profile.release]
debug = true
6 changes: 3 additions & 3 deletions native/core/Cargo.toml
@@ -70,9 +70,9 @@ aws-credential-types = { workspace = true }
parking_lot = "0.12.5"
datafusion-comet-objectstore-hdfs = { path = "../hdfs", optional = true, default-features = false, features = ["hdfs"] }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-native-roots", "http2"] }
-object_store_opendal = {version = "0.55.0", optional = true}
+object_store_opendal = { git = "https://github.com/apache/opendal", rev = "173feb6", package = "object_store_opendal", optional = true}
hdfs-sys = {version = "0.3", optional = true, features = ["hdfs_3_3"]}
-opendal = { version ="0.55.0", optional = true, features = ["services-hdfs"] }
+opendal = { git = "https://github.com/apache/opendal", rev = "173feb6", optional = true, features = ["services-hdfs"] }
iceberg = { workspace = true }
iceberg-storage-opendal = { workspace = true }
serde_json = "1.0"
@@ -91,7 +91,7 @@ jni = { version = "0.22.4", features = ["invocation"] }
lazy_static = "1.4"
assertables = "9"
hex = "0.4.3"
-datafusion-functions-nested = { version = "52.4.0" }
+datafusion-functions-nested = { version = "53.0.0" }

[features]
backtrace = ["datafusion/backtrace"]
5 changes: 5 additions & 0 deletions native/core/src/execution/jni_api.rs
@@ -393,6 +393,11 @@ fn prepare_datafusion_session_context(

// register UDFs from datafusion-spark crate
fn register_datafusion_spark_function(session_ctx: &SessionContext) {
+    // Don't register SparkArrayRepeat — it returns NULL when the element is NULL
+    // (e.g. array_repeat(null, 3) returns NULL instead of [null, null, null]).
+    // Comet's Scala serde wraps the call in a CaseWhen for null count handling,
+    // so DataFusion's built-in ArrayRepeat is sufficient.
+    // TODO: file upstream issue against datafusion-spark
Reviewer comment (Contributor): Is this one of the datafusion-spark built-in issues? Perhaps we can just fall back in this case, like we do for other Spark built-in functions?

    session_ctx.register_udf(ScalarUDF::new_from_impl(SparkExpm1::default()));
    session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSha2::default()));
    session_ctx.register_udf(ScalarUDF::new_from_impl(CharFunc::default()));
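The comment in this hunk relies on DataFusion's built-in array_repeat preserving Spark's null-element semantics. A minimal sketch of that expectation (illustrative, not part of the PR; assumes the datafusion and tokio crates, with DataFusion's nested-array functions registered in the default SessionContext):

```rust
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Expected per the comment above: repeating a NULL element yields an array
    // of NULLs, e.g. [NULL, NULL, NULL], not a top-level NULL result.
    let df = ctx
        .sql("SELECT array_repeat(CAST(NULL AS INT), 3) AS repeated")
        .await?;
    df.show().await?;
    Ok(())
}
```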
28 changes: 18 additions & 10 deletions native/core/src/execution/memory_pools/fair_pool.rs
@@ -103,16 +103,21 @@ impl MemoryPool for CometFairMemoryPool
            .expect("unexpected amount of unregister happened");
    }

-    fn grow(&self, reservation: &MemoryReservation, additional: usize) {
-        self.try_grow(reservation, additional).unwrap();
+    fn grow(&self, _reservation: &MemoryReservation, additional: usize) {
+        self.try_grow(_reservation, additional).unwrap();
    }

-    fn shrink(&self, reservation: &MemoryReservation, subtractive: usize) {
+    fn shrink(&self, _reservation: &MemoryReservation, subtractive: usize) {
        if subtractive > 0 {
            let mut state = self.state.lock();
-            let size = reservation.size();
-            if size < subtractive {
-                panic!("Failed to release {subtractive} bytes where only {size} bytes reserved")
+            // We don't use reservation.size() here because DataFusion 53+ decrements
+            // the reservation's atomic size before calling pool.shrink(), so it would
+            // reflect the post-shrink value rather than the pre-shrink value.
+            if state.used < subtractive {
+                panic!(
+                    "Failed to release {subtractive} bytes where only {} bytes tracked by pool",
+                    state.used
+                )
            }
            self.release(subtractive)
                .unwrap_or_else(|_| panic!("Failed to release {subtractive} bytes"));
@@ -122,7 +127,7 @@ impl MemoryPool for CometFairMemoryPool {

    fn try_grow(
        &self,
-        reservation: &MemoryReservation,
+        _reservation: &MemoryReservation,
        additional: usize,
    ) -> Result<(), DataFusionError> {
        if additional > 0 {
@@ -132,10 +137,13 @@ impl MemoryPool for CometFairMemoryPool {
                .pool_size
                .checked_div(num)
                .expect("overflow in checked_div");
-            let size = reservation.size();
-            if limit < size + additional {
+            // We use state.used instead of reservation.size() because DataFusion 53+
+            // calls pool.try_grow() before incrementing the reservation's atomic size,
+            // so reservation.size() would not include prior grows.
+            let used = state.used;
+            if limit < used + additional {
                return resources_err!(
-                    "Failed to acquire {additional} bytes where {size} bytes already reserved and the fair limit is {limit} bytes, {num} registered"
+                    "Failed to acquire {additional} bytes where {used} bytes already reserved and the fair limit is {limit} bytes, {num} registered"
Reviewer comment (Contributor): nit: should "already reserved" be "already used"?

                );
            }

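For context, the accounting pattern adopted above can be sketched without Comet's actual CometFairMemoryPool. The struct and method names below are illustrative, assuming the DataFusion 53 ordering described in the new comments (try_grow runs before the reservation's size is incremented, shrink after it has already been decremented):

```rust
use std::sync::Mutex;

struct PoolState {
    used: usize,
    pool_size: usize,
}

// Hypothetical pool that mirrors the pattern: track bytes in the pool's own
// state so grow/shrink checks never depend on the reservation's counter.
struct TrackingPool {
    state: Mutex<PoolState>,
}

impl TrackingPool {
    fn try_grow(&self, additional: usize) -> Result<(), String> {
        let mut state = self.state.lock().unwrap();
        // Check against the pool's own counter; the caller's reservation has
        // not been updated yet when this is called.
        if state.used + additional > state.pool_size {
            return Err(format!(
                "cannot acquire {additional} bytes: {} of {} already used",
                state.used, state.pool_size
            ));
        }
        state.used += additional;
        Ok(())
    }

    fn shrink(&self, subtractive: usize) {
        let mut state = self.state.lock().unwrap();
        // Validate against the pool's counter, not the reservation, which has
        // already been decremented by the time shrink() runs.
        assert!(
            state.used >= subtractive,
            "released {subtractive} bytes but only {} tracked",
            state.used
        );
        state.used -= subtractive;
    }
}
```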
8 changes: 4 additions & 4 deletions native/core/src/execution/operators/expand.rs
@@ -42,7 +42,7 @@ pub struct ExpandExec {
    projections: Vec<Vec<Arc<dyn PhysicalExpr>>>,
    child: Arc<dyn ExecutionPlan>,
    schema: SchemaRef,
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
}

impl ExpandExec {
@@ -52,12 +52,12 @@ impl ExpandExec {
        child: Arc<dyn ExecutionPlan>,
        schema: SchemaRef,
    ) -> Self {
-        let cache = PlanProperties::new(
+        let cache = Arc::new(PlanProperties::new(
            EquivalenceProperties::new(Arc::clone(&schema)),
            Partitioning::UnknownPartitioning(1),
            EmissionType::Final,
            Boundedness::Bounded,
-        );
+        ));

        Self {
            projections,
@@ -129,7 +129,7 @@ impl ExecutionPlan for ExpandExec {
        Ok(Box::pin(expand_stream))
    }

-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
        &self.cache
    }

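The same change repeats in the operators below: PlanProperties is now built once, wrapped in an Arc, and returned by reference from properties(). A minimal sketch of the shape (import paths and the exact DataFusion 53 signature are assumed from these diffs rather than verified against the release):

```rust
use std::sync::Arc;

use arrow::datatypes::SchemaRef;
use datafusion::physical_expr::EquivalenceProperties;
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::{Partitioning, PlanProperties};

struct MyExec {
    // Cached once at construction; cloning the Arc is cheap for callers.
    cache: Arc<PlanProperties>,
}

impl MyExec {
    fn new(schema: SchemaRef) -> Self {
        let cache = Arc::new(PlanProperties::new(
            EquivalenceProperties::new(Arc::clone(&schema)),
            Partitioning::UnknownPartitioning(1),
            EmissionType::Final,
            Boundedness::Bounded,
        ));
        Self { cache }
    }

    // Mirrors the updated trait method signature used throughout this PR.
    fn properties(&self) -> &Arc<PlanProperties> {
        &self.cache
    }
}
```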
12 changes: 6 additions & 6 deletions native/core/src/execution/operators/iceberg_scan.rs
@@ -58,7 +58,7 @@ pub struct IcebergScanExec {
    /// Output schema after projection
    output_schema: SchemaRef,
    /// Cached execution plan properties
-    plan_properties: PlanProperties,
+    plan_properties: Arc<PlanProperties>,
    /// Catalog-specific configuration for FileIO
    catalog_properties: HashMap<String, String>,
    /// Pre-planned file scan tasks
@@ -93,13 +93,13 @@ impl IcebergScanExec {
        })
    }

-    fn compute_properties(schema: SchemaRef, num_partitions: usize) -> PlanProperties {
-        PlanProperties::new(
+    fn compute_properties(schema: SchemaRef, num_partitions: usize) -> Arc<PlanProperties> {
+        Arc::new(PlanProperties::new(
            EquivalenceProperties::new(schema),
            Partitioning::UnknownPartitioning(num_partitions),
            EmissionType::Incremental,
            Boundedness::Bounded,
-        )
+        ))
    }
}

@@ -116,7 +116,7 @@ impl ExecutionPlan for IcebergScanExec {
        Arc::clone(&self.output_schema)
    }

-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
        &self.plan_properties
    }

@@ -288,7 +288,7 @@ where
            _ => {
                let adapter = self
                    .adapter_factory
-                    .create(Arc::clone(&self.schema), Arc::clone(&file_schema));
+                    .create(Arc::clone(&self.schema), Arc::clone(&file_schema))?;
                let exprs =
                    build_projection_expressions(&self.schema, &adapter).map_err(|e| {
                        DataFusionError::Execution(format!(
40 changes: 24 additions & 16 deletions native/core/src/execution/operators/parquet_writer.rs
@@ -23,16 +23,18 @@ use std::{
    fmt,
    fmt::{Debug, Formatter},
    fs::File,
-    io::Cursor,
    sync::Arc,
};

+#[cfg(feature = "hdfs-opendal")]
use opendal::Operator;
+#[cfg(feature = "hdfs-opendal")]
+use std::io::Cursor;

use crate::execution::shuffle::CompressionCodec;
-use crate::parquet::parquet_support::{
-    create_hdfs_operator, is_hdfs_scheme, prepare_object_store_with_configs,
-};
+use crate::parquet::parquet_support::is_hdfs_scheme;
+#[cfg(feature = "hdfs-opendal")]
+use crate::parquet::parquet_support::{create_hdfs_operator, prepare_object_store_with_configs};
use arrow::datatypes::{Schema, SchemaRef};
use arrow::record_batch::RecordBatch;
use async_trait::async_trait;
@@ -45,7 +47,7 @@ use datafusion::{
        metrics::{ExecutionPlanMetricsSet, MetricsSet},
        stream::RecordBatchStreamAdapter,
        DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
-        SendableRecordBatchStream, Statistics,
+        SendableRecordBatchStream,
    },
};
use futures::TryStreamExt;
@@ -64,6 +66,7 @@ enum ParquetWriter {
    /// Contains the arrow writer, HDFS operator, and destination path
    /// an Arrow writer writes to in-memory buffer the data converted to Parquet format
    /// The opendal::Writer is created lazily on first write
+    #[cfg(feature = "hdfs-opendal")]
    Remote(
        ArrowWriter<Cursor<Vec<u8>>>,
        Option<opendal::Writer>,
@@ -80,6 +83,7 @@ impl ParquetWriter {
    ) -> std::result::Result<(), parquet::errors::ParquetError> {
        match self {
            ParquetWriter::LocalFile(writer) => writer.write(batch),
+            #[cfg(feature = "hdfs-opendal")]
            ParquetWriter::Remote(
                arrow_parquet_buffer_writer,
                hdfs_writer_opt,
@@ -134,6 +138,7 @@ impl ParquetWriter {
                writer.close()?;
                Ok(())
            }
+            #[cfg(feature = "hdfs-opendal")]
            ParquetWriter::Remote(
                arrow_parquet_buffer_writer,
                mut hdfs_writer_opt,
@@ -208,7 +213,7 @@ pub struct ParquetWriterExec {
    /// Metrics
    metrics: ExecutionPlanMetricsSet,
    /// Cache for plan properties
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
}

impl ParquetWriterExec {
@@ -228,12 +233,12 @@ impl ParquetWriterExec {
        // Preserve the input's partitioning so each partition writes its own file
        let input_partitioning = input.output_partitioning().clone();

-        let cache = PlanProperties::new(
+        let cache = Arc::new(PlanProperties::new(
            EquivalenceProperties::new(Arc::clone(&input.schema())),
            input_partitioning,
            EmissionType::Final,
            Boundedness::Bounded,
-        );
+        ));

        Ok(ParquetWriterExec {
            input,
@@ -275,7 +280,7 @@ impl ParquetWriterExec {
        output_file_path: &str,
        schema: SchemaRef,
        props: WriterProperties,
-        runtime_env: Arc<datafusion::execution::runtime_env::RuntimeEnv>,
+        _runtime_env: Arc<datafusion::execution::runtime_env::RuntimeEnv>,
        object_store_options: &HashMap<String, String>,
    ) -> Result<ParquetWriter> {
        // Parse URL and match on storage scheme directly
@@ -284,11 +289,11 @@
        })?;

        if is_hdfs_scheme(&url, object_store_options) {
-            // HDFS storage
+            #[cfg(feature = "hdfs-opendal")]
+            {
            // Use prepare_object_store_with_configs to create and register the object store
            let (_object_store_url, object_store_path) = prepare_object_store_with_configs(
-                runtime_env,
+                _runtime_env,
                output_file_path.to_string(),
                object_store_options,
            )
@@ -324,6 +329,12 @@ impl ParquetWriterExec {
                object_store_path.to_string(),
            ))
            }
+            #[cfg(not(feature = "hdfs-opendal"))]
+            {
+                Err(DataFusionError::Execution(
+                    "HDFS support is not enabled. Rebuild with the 'hdfs-opendal' feature.".into(),
+                ))
+            }
        } else if output_file_path.starts_with("file://")
            || output_file_path.starts_with("file:")
            || !output_file_path.contains("://")
@@ -405,11 +416,7 @@ impl ExecutionPlan for ParquetWriterExec {
        Some(self.metrics.clone_inner())
    }

-    fn statistics(&self) -> Result<Statistics> {
-        self.input.partition_statistics(None)
-    }
-
-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
        &self.cache
    }

@@ -576,6 +583,7 @@ mod tests {

    /// Helper function to create a test RecordBatch with 1000 rows of (int, string) data
    /// Example batch_id 1 -> 0..1000, 2 -> 1001..2000
+    #[allow(dead_code)]
    fn create_test_record_batch(batch_id: i32) -> Result<RecordBatch> {
        assert!(batch_id > 0, "batch_id must be greater than 0");
        let num_rows = batch_id * 1000;
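The hdfs-opendal gating in this file follows a common Rust shape: the remote-writer variant and its constructor only exist when the feature is compiled in, while other builds return a descriptive runtime error. A simplified, hypothetical sketch (types, fields, and messages are illustrative, not the PR's actual ParquetWriter):

```rust
// Simplified stand-ins for the writer variants: the real operator holds an
// ArrowWriter<Cursor<Vec<u8>>> plus a lazily created opendal::Writer, but a byte
// buffer and destination path keep this sketch dependency-free.
enum Writer {
    Local(std::fs::File),
    #[cfg(feature = "hdfs-opendal")]
    Remote(Vec<u8>, String),
}

#[cfg(feature = "hdfs-opendal")]
fn open_hdfs_writer(path: &str) -> Result<Writer, String> {
    // A real implementation would create the opendal Operator/Writer here.
    Ok(Writer::Remote(Vec::new(), path.to_string()))
}

#[cfg(not(feature = "hdfs-opendal"))]
fn open_hdfs_writer(path: &str) -> Result<Writer, String> {
    // Mirrors the PR's behavior: builds without the feature fail at runtime
    // with a clear message instead of failing to compile.
    Err(format!(
        "cannot write {path}: HDFS support requires the 'hdfs-opendal' feature"
    ))
}

fn open_writer(path: &str, is_hdfs: bool) -> Result<Writer, String> {
    if is_hdfs {
        open_hdfs_writer(path)
    } else {
        std::fs::File::create(path)
            .map(Writer::Local)
            .map_err(|e| e.to_string())
    }
}
```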
8 changes: 4 additions & 4 deletions native/core/src/execution/operators/scan.rs
@@ -70,7 +70,7 @@ pub struct ScanExec {
    /// It is also used in unit test to mock the input data from JVM.
    pub batch: Arc<Mutex<Option<InputBatch>>>,
    /// Cache of expensive-to-compute plan properties
-    cache: PlanProperties,
+    cache: Arc<PlanProperties>,
    /// Metrics collector
    metrics: ExecutionPlanMetricsSet,
    /// Baseline metrics
@@ -93,14 +93,14 @@ impl ScanExec {
        // Build schema directly from data types since get_next now always unpacks dictionaries
        let schema = schema_from_data_types(&data_types);

-        let cache = PlanProperties::new(
+        let cache = Arc::new(PlanProperties::new(
            EquivalenceProperties::new(Arc::clone(&schema)),
            // The partitioning is not important because we are not using DataFusion's
            // query planner or optimizer
            Partitioning::UnknownPartitioning(1),
            EmissionType::Final,
            Boundedness::Bounded,
-        );
+        ));

        Ok(Self {
            exec_context_id,
@@ -415,7 +415,7 @@ impl ExecutionPlan for ScanExec {
        )))
    }

-    fn properties(&self) -> &PlanProperties {
+    fn properties(&self) -> &Arc<PlanProperties> {
        &self.cache
    }
