Merged

47 commits
709d6e9
Use Spark's RangePartitioning to compute boundary rows and serialize …
mbutrovich Aug 28, 2025
4ee3d8e
Fix warnings and benchmark compilation.
mbutrovich Aug 28, 2025
6f34e35
Fix benchmark bug.
mbutrovich Aug 28, 2025
332e76a
Minor refactor.
mbutrovich Aug 28, 2025
0eb1134
Cleanup to make it more clear what code came from Spark.
mbutrovich Aug 28, 2025
bb67f73
Fix errant comment.
mbutrovich Aug 28, 2025
abd8958
Override partitioning scheme at serialization when num_partitions is 1.
mbutrovich Aug 28, 2025
967d1a1
Override partitioning scheme at serialization when computed bounds re…
mbutrovich Aug 28, 2025
7af9474
Merge branch 'main' into fix_range_partitioning
mbutrovich Aug 29, 2025
522ef80
Remove string and binary range partitioning types until we sort out h…
mbutrovich Aug 29, 2025
1a956a5
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 11, 2025
58e35b0
Fix circular dependency in execution.
mbutrovich Sep 11, 2025
08a4b51
Update plans.
mbutrovich Sep 11, 2025
2f4280f
fix Spark SQL test "change SQLConf should not change view behavior - …
mbutrovich Sep 11, 2025
58f2eda
Fix bug with indexing into boundary rows.
mbutrovich Sep 12, 2025
2803842
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 13, 2025
5bceb41
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 16, 2025
044e098
Remove range_partitioner.rs (native bounds calculation and reservoir …
mbutrovich Sep 16, 2025
21c4665
remove errant collection.JavaConverters
mbutrovich Sep 16, 2025
4e86961
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 16, 2025
c9acdfc
Remove redundant config (setting to default).
mbutrovich Sep 16, 2025
386aa6c
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 17, 2025
261ea33
Hoist wrapping the child of ShuffleWriter in a CopyExec before we gen…
mbutrovich Sep 17, 2025
4077f7d
Update test after last commit.
mbutrovich Sep 17, 2025
dd0939f
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 19, 2025
764675b
Add a reduced test case for the Spark SQL test failure.
mbutrovich Sep 19, 2025
91fef94
More complicated test case.
mbutrovich Sep 19, 2025
3a83258
More complicated test case.
mbutrovich Sep 19, 2025
a1adf7f
Test fix for duplicate expression references in range partitioning.
mbutrovich Sep 19, 2025
f287dd1
New plans?
mbutrovich Sep 20, 2025
a36ab4f
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 20, 2025
ae7ea4c
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 20, 2025
1a1d329
new plans.
mbutrovich Sep 20, 2025
6456d4e
New tests to replicate Spark SQL failure.
mbutrovich Sep 20, 2025
e24a36b
switch to using hash-based deduplication like DataFusion's LexOrderin…
mbutrovich Sep 20, 2025
b7a078a
Minor refactor.
mbutrovich Sep 20, 2025
5305366
Merge branch 'main' into fix_range_partitioning
mbutrovich Sep 20, 2025
4233b42
add benchmark.
mbutrovich Sep 21, 2025
f6e5c90
Merge branch 'apache:main' into fix_range_partitioning
mbutrovich Sep 22, 2025
b5b286b
Remove development test, add more tests for duplicates in columns and…
mbutrovich Sep 22, 2025
d12c855
Merge remote-tracking branch 'origin/fix_range_partitioning' into fix…
mbutrovich Sep 22, 2025
fa2c20b
Update docs.
mbutrovich Sep 23, 2025
8781264
PR feedback.
mbutrovich Sep 23, 2025
318adbd
Fix clippy.
mbutrovich Sep 23, 2025
2aa3f0f
Update comments based on PR feedback.
mbutrovich Sep 24, 2025
3d359ba
Update spark/src/main/scala/org/apache/spark/sql/comet/execution/shuf…
mbutrovich Sep 24, 2025
c87aba7
Fix formatting after accepting change on GitHub.
mbutrovich Sep 24, 2025
7 changes: 2 additions & 5 deletions common/src/main/scala/org/apache/comet/CometConf.scala
@@ -325,14 +325,11 @@ object CometConf extends ShimCometConf {
.booleanConf
.createWithDefault(true)

// RangePartitioning contains bugs https://github.com/apache/datafusion-comet/issues/1906
val COMET_EXEC_SHUFFLE_WITH_RANGE_PARTITIONING_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.native.shuffle.partitioning.range.enabled")
.doc("Experimental feature to enable range partitioning for Comet native shuffle. " +
"This feature is experimental while we investigate scenarios that don't partition data " +
"correctly.")
.doc("Whether to enable range partitioning for Comet native shuffle.")
.booleanConf
.createWithDefault(false)
.createWithDefault(true)
Contributor:
Should we keep it as false, then run some benches and real tests with this param set to true, and enable it by default later?

Contributor Author:

I discussed with @andygrove and we were comfortable merging with true back in June. I think if you're opting into native shuffle we should try to accelerate all partitioning schemes, and if we discover issues it can be toggled off.

Member:

Enabling it by default now gives us more opportunities to find bugs over the next few weeks before we release 0.11.0 and we can always disable if we find issues in that time.


val COMET_EXEC_SHUFFLE_COMPRESSION_CODEC: ConfigEntry[String] =
conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.compression.codec")
2 changes: 1 addition & 1 deletion docs/source/user-guide/latest/configs.md
@@ -72,7 +72,7 @@ Comet provides the following configuration settings.
| spark.comet.memoryOverhead | The amount of additional memory to be allocated per executor process for Comet, in MiB, when running Spark in on-heap mode. This config is optional. If this is not specified, it will be set to `spark.comet.memory.overhead.factor` * `spark.executor.memory`. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | |
| spark.comet.metrics.updateInterval | The interval in milliseconds to update metrics. If interval is negative, metrics will be updated upon task completion. | 3000 |
| spark.comet.native.shuffle.partitioning.hash.enabled | Whether to enable hash partitioning for Comet native shuffle. | true |
| spark.comet.native.shuffle.partitioning.range.enabled | Experimental feature to enable range partitioning for Comet native shuffle. This feature is experimental while we investigate scenarios that don't partition data correctly. | false |
| spark.comet.native.shuffle.partitioning.range.enabled | Whether to enable range partitioning for Comet native shuffle. | true |
| spark.comet.nativeLoadRequired | Whether to require Comet native library to load successfully when Comet is enabled. If not, Comet will silently fallback to Spark when it fails to load the native lib. Otherwise, an error will be thrown and the Spark job will be aborted. | false |
| spark.comet.parquet.enable.directBuffer | Whether to use Java direct byte buffer when reading Parquet. | false |
| spark.comet.parquet.read.io.adjust.readRange.skew | In the parallel reader, if the read ranges submitted are skewed in sizes, this option will cause the reader to break up larger read ranges into smaller ranges to reduce the skew. This will result in a slightly larger number of connections opened to the file system but may give improved performance. | false |
41 changes: 32 additions & 9 deletions native/core/benches/shuffle_writer.rs
@@ -16,8 +16,9 @@
// under the License.

use arrow::array::builder::{Date32Builder, Decimal128Builder, Int32Builder};
use arrow::array::{builder::StringBuilder, RecordBatch};
use arrow::array::{builder::StringBuilder, Array, Int32Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::row::{RowConverter, SortField};
use comet::execution::shuffle::{
CometPartitioning, CompressionCodec, ShuffleBlockWriter, ShuffleWriterExec,
};
@@ -31,6 +32,7 @@ use datafusion::{
physical_plan::{common::collect, ExecutionPlan},
prelude::SessionContext,
};
use itertools::Itertools;
use std::io::Cursor;
use std::sync::Arc;
use tokio::runtime::Runtime;
@@ -84,16 +86,37 @@ fn criterion_benchmark(c: &mut Criterion) {
);
}

let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new_default(
col("c0", batch.schema().as_ref()).unwrap(),
)])
.unwrap();

let sort_fields: Vec<SortField> = batch
.columns()
.iter()
.zip(&lex_ordering)
.map(|(array, sort_expr)| {
SortField::new_with_options(array.data_type().clone(), sort_expr.options)
})
.collect();
let row_converter = RowConverter::new(sort_fields).unwrap();

// These are hard-coded values based on the benchmark params of 8192 rows per batch, and 16
// partitions. If these change, these values need to be recalculated, or bring over the
// bounds-finding logic from shuffle_write_test in shuffle_writer.rs.
let bounds_ints = vec![
512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680,
];
let bounds_array: Arc<dyn Array> = Arc::new(Int32Array::from(bounds_ints));
let bounds_rows = row_converter
.convert_columns(vec![bounds_array].as_slice())
.unwrap();

let owned_rows = bounds_rows.iter().map(|row| row.owned()).collect_vec();

for partitioning in [
CometPartitioning::Hash(vec![Arc::new(Column::new("a", 0))], 16),
CometPartitioning::RangePartitioning(
LexOrdering::new(vec![PhysicalSortExpr::new_default(
col("c0", batch.schema().as_ref()).unwrap(),
)])
.unwrap(),
16,
100,
),
CometPartitioning::RangePartitioning(lex_ordering, 16, Arc::new(row_converter), owned_rows),
] {
let compression_codec = CompressionCodec::None;
group.bench_function(
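The hard-coded bounds in the benchmark (512, 1024, …, 7680) are the 15 interior boundary keys that split 8192 uniformly distributed row keys into 16 equally sized partitions. A minimal std-only sketch of how such bounds can be derived (`equal_width_bounds` is a hypothetical helper, not part of the benchmark):

```rust
/// Compute the `num_partitions - 1` interior boundary keys that split
/// `num_rows` uniformly distributed integer keys (0..num_rows) into
/// `num_partitions` equally sized ranges.
fn equal_width_bounds(num_rows: i32, num_partitions: i32) -> Vec<i32> {
    let width = num_rows / num_partitions;
    (1..num_partitions).map(|i| i * width).collect()
}

fn main() {
    let bounds = equal_width_bounds(8192, 16);
    // Matches the hard-coded vector in the benchmark.
    assert_eq!(bounds.len(), 15);
    assert_eq!(bounds.first(), Some(&512));
    assert_eq!(bounds.last(), Some(&7680));
}
```

As the benchmark comment notes, these values only hold for 8192 rows per batch and 16 partitions; changing either parameter requires recomputing them.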
68 changes: 62 additions & 6 deletions native/core/src/execution/planner.rs
@@ -92,6 +92,7 @@ use arrow::array::{
NullArray, StringBuilder, TimestampMicrosecondArray,
};
use arrow::buffer::{BooleanBuffer, NullBuffer, OffsetBuffer};
use arrow::row::{OwnedRow, RowConverter, SortField};
use datafusion::common::utils::SingleRowListArrayBuilder;
use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec;
use datafusion::physical_plan::filter::FilterExec;
@@ -484,14 +485,14 @@ impl PhysicalPlanner {
)))
}
}
},
}
Value::ListVal(values) => {
if let DataType::List(_) = data_type {
SingleRowListArrayBuilder::new(literal_to_array_ref(data_type, values.clone())?).build_list_scalar()
} else {
return Err(GeneralError(format!(
"Expected DataType::List but got {data_type:?}"
)))
)));
}
}
}
@@ -1402,8 +1403,14 @@
assert_eq!(children.len(), 1);
let (scans, child) = self.create_plan(&children[0], inputs, partition_count)?;

let partitioning = self
.create_partitioning(writer.partitioning.as_ref().unwrap(), child.schema())?;
// We wrap native shuffle in a CopyExec. This existed previously, but for
// RangePartitioning at least we want to ensure that dictionaries are unpacked.
let wrapped_child = Self::wrap_in_copy_exec(Arc::clone(&child.native_plan));

let partitioning = self.create_partitioning(
writer.partitioning.as_ref().unwrap(),
wrapped_child.schema(),
)?;

let codec = match writer.codec.try_into() {
Ok(SparkCompressionCodec::None) => Ok(CompressionCodec::None),
@@ -1419,7 +1426,7 @@
}?;

let shuffle_writer = Arc::new(ShuffleWriterExec::try_new(
Self::wrap_in_copy_exec(Arc::clone(&child.native_plan)),
wrapped_child,
partitioning,
codec,
writer.output_data_file.clone(),
@@ -2344,16 +2351,65 @@
))
}
PartitioningStruct::RangePartition(range_partition) => {
// Generate the lexical ordering for comparisons
let exprs: Result<Vec<PhysicalSortExpr>, ExecutionError> = range_partition
.sort_orders
.iter()
.map(|expr| self.create_sort_expr(expr, Arc::clone(&input_schema)))
.collect();
let lex_ordering = LexOrdering::new(exprs?).unwrap();

// Generate the row converter for comparing incoming batches to boundary rows
let sort_fields: Vec<SortField> = lex_ordering
.iter()
.map(|sort_expr| {
sort_expr
.expr
.data_type(input_schema.as_ref())
.map(|dt| SortField::new_with_options(dt, sort_expr.options))
})
.collect::<Result<Vec<_>, _>>()?;

// Deserialize the literals to columnar collections of ScalarValues
let mut scalar_values: Vec<Vec<ScalarValue>> = vec![vec![]; lex_ordering.len()];
for boundary_row in &range_partition.boundary_rows {
// For each serialized expr in a boundary row, convert to a Literal
// expression, then extract the ScalarValue from the Literal and push it
// into the collection of ScalarValues
for (col_idx, col_values) in scalar_values
.iter_mut()
.enumerate()
.take(lex_ordering.len())
{
let expr = self.create_expr(
&boundary_row.partition_bounds[col_idx],
Arc::clone(&input_schema),
)?;
let literal_expr =
expr.as_any().downcast_ref::<Literal>().expect("Literal");
col_values.push(literal_expr.value().clone());
}
}

// Convert the collection of ScalarValues to collection of Arrow Arrays
let arrays: Vec<ArrayRef> = scalar_values
.iter()
.map(|scalar_vec| ScalarValue::iter_to_array(scalar_vec.iter().cloned()))
.collect::<Result<Vec<_>, _>>()?;

// Create a RowConverter and use to create OwnedRows from the Arrays
let converter = RowConverter::new(sort_fields)?;
let boundary_rows = converter.convert_columns(&arrays)?;
// Rows are only a view into Arrow Arrays. We need to create OwnedRows with their
// own internal memory ownership to pass as our boundary values to the partitioner.
let boundary_owned_rows: Vec<OwnedRow> =
boundary_rows.iter().map(|row| row.owned()).collect();

Ok(CometPartitioning::RangePartitioning(
lex_ordering,
range_partition.num_partitions as usize,
range_partition.sample_size as usize,
Arc::new(converter),
boundary_owned_rows,
))
}
PartitioningStruct::SinglePartition(_) => Ok(CometPartitioning::SinglePartition),
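The deserialization loop in the planner transposes row-oriented boundary literals into per-column collections of `ScalarValue`s before building Arrow arrays. The core transpose, stripped of the Arrow and protobuf types, looks like this (std-only sketch; `BoundaryRow` and the `i64` value type are stand-ins for the serialized expressions and `ScalarValue`):

```rust
/// Stand-in for a deserialized boundary row: one literal per sort column.
struct BoundaryRow {
    partition_bounds: Vec<i64>,
}

/// Transpose row-oriented boundaries into one Vec per sort column,
/// mirroring the `scalar_values` loop in the planner.
fn transpose_bounds(rows: &[BoundaryRow], num_cols: usize) -> Vec<Vec<i64>> {
    let mut cols: Vec<Vec<i64>> = vec![vec![]; num_cols];
    for row in rows {
        // Push each column's value from this row onto that column's vector.
        for (col_idx, col_values) in cols.iter_mut().enumerate() {
            col_values.push(row.partition_bounds[col_idx]);
        }
    }
    cols
}

fn main() {
    let rows = vec![
        BoundaryRow { partition_bounds: vec![10, 1] },
        BoundaryRow { partition_bounds: vec![20, 2] },
    ];
    assert_eq!(transpose_bounds(&rows, 2), vec![vec![10, 20], vec![1, 2]]);
}
```

In the real code each column vector then feeds `ScalarValue::iter_to_array`, and the resulting arrays are converted to `OwnedRow`s via the `RowConverter`.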
12 changes: 8 additions & 4 deletions native/core/src/execution/shuffle/comet_partitioning.rs
@@ -15,26 +15,30 @@
// specific language governing permissions and limitations
// under the License.

use arrow::row::{OwnedRow, RowConverter};
use datafusion::physical_expr::{LexOrdering, PhysicalExpr};
use std::sync::Arc;

#[derive(Debug, Clone)]
pub enum CometPartitioning {
SinglePartition,
/// Allocate rows based on a hash of one or more expressions and the specified number of
/// partitions
/// partitions. Args are 1) the expression to hash on, and 2) the number of partitions.
Hash(Vec<Arc<dyn PhysicalExpr>>, usize),
/// Allocate rows based on the lexical order of one or more expressions and the specified number of
Contributor:
I'm wondering whether it would be that intuitive for the user to have

Arc<RowConverter>, Vec<OwnedRow>

here? 🤔

Contributor Author:
Could you expand on this please? I'm not sure I understand the requested change.

Contributor (@comphead, Sep 24, 2025):

Sorry for the misleading comment. I was comparing it with the other variants, like Hash:

 Hash(Vec<Arc<dyn PhysicalExpr>>, usize),

it is quite intuitive that the hash depends on numPartitions and the expression that is supposed to be hashed.

For Range it is

RangePartitioning(LexOrdering, usize, Arc<RowConverter>, Vec<OwnedRow>),

which looks not so intuitive IMO, because when reading it you cannot tell the meaning of the last 2 params. Anyway, this design question can be addressed in a follow-up if needed.

/// partitions
RangePartitioning(LexOrdering, usize, usize),
/// partitions. Args are 1) the LexOrdering to use to compare values and split into partitions,
/// 2) the number of partitions, 3) the RowConverter used to view incoming RecordBatches as Arrow
/// Rows for comparing to 4) OwnedRows that represent the boundaries of each partition, used with
/// LexOrdering to bin each value in the RecordBatch to a partition.
RangePartitioning(LexOrdering, usize, Arc<RowConverter>, Vec<OwnedRow>),
}

impl CometPartitioning {
pub fn partition_count(&self) -> usize {
use CometPartitioning::*;
match self {
SinglePartition => 1,
Hash(_, n) | RangePartitioning(_, n, _) => *n,
Hash(_, n) | RangePartitioning(_, n, _, _) => *n,
}
}
}
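For context on how the `Vec<OwnedRow>` boundaries are used: each incoming row is binned to a partition by comparing it against the sorted boundary rows. With Arrow `Row` values replaced by plain integers, the binning reduces to a binary search over the boundaries (std-only sketch, not the actual shuffle-writer code; the tie-breaking rule — a key equal to a boundary landing in the lower partition — is an assumption modeled on Spark's RangePartitioner):

```rust
/// Bin `key` into one of `bounds.len() + 1` partitions, where `bounds`
/// holds the sorted interior boundary values.
fn partition_for(key: i32, bounds: &[i32]) -> usize {
    // Count the boundaries strictly less than the key; a key equal to a
    // boundary lands in the lower partition.
    bounds.partition_point(|b| *b < key)
}

fn main() {
    let bounds = [512, 1024, 1536];
    assert_eq!(partition_for(0, &bounds), 0);
    assert_eq!(partition_for(512, &bounds), 0); // equal to a boundary
    assert_eq!(partition_for(513, &bounds), 1);
    assert_eq!(partition_for(9999, &bounds), 3); // past the last boundary
}
```

In the merged code the comparison operates on `OwnedRow`s produced by the shared `RowConverter`, so the lexicographic ordering of multi-column sort keys comes for free from the Arrow row format.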
1 change: 0 additions & 1 deletion native/core/src/execution/shuffle/mod.rs
@@ -19,7 +19,6 @@ pub(crate) mod codec;
mod comet_partitioning;
mod list;
mod map;
mod range_partitioner;
pub mod row;
mod shuffle_writer;
