50 changes: 50 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

16 changes: 12 additions & 4 deletions benchmarks/bench.sh
@@ -42,6 +42,7 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
PREFER_ROUND_ROBIN=${PREFER_ROUND_ROBIN:-true}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}

usage() {
@@ -133,6 +134,7 @@ CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
PREFER_HASH_JOIN Prefer hash join algorithm (default true)
PREFER_ROUND_ROBIN Prefer round robin partitioning (default true)
VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
DATAFUSION_* Set the given datafusion configuration
"
@@ -207,6 +209,9 @@ main() {
tpch10)
data_tpch "10"
;;
tpch50)
data_tpch "50"
;;
tpch_mem10)
# same data as for tpch10
data_tpch "10"
@@ -327,6 +332,7 @@ main() {
echo "RESULTS_DIR: ${RESULTS_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
echo "PREFER_ROUND_ROBIN: ${PREFER_ROUND_ROBIN}"
echo "***************************"

# navigate to the appropriate directory
@@ -370,6 +376,9 @@ main() {
tpch_csv10)
run_tpch "10" "csv"
;;
tpch50)
run_tpch "50"
;;
tpch_mem10)
run_tpch_mem "10"
;;
@@ -562,9 +571,8 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."

FORMAT=$2
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --prefer_round_robin "${PREFER_ROUND_ROBIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the tpch in memory
@@ -580,7 +588,7 @@ run_tpch_mem() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --prefer_round_robin "${PREFER_ROUND_ROBIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the cancellation benchmark
@@ -650,7 +658,7 @@ run_clickbench_partitioned() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (partitioned, 100 files) benchmark..."
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --prefer_round_robin "${PREFER_ROUND_ROBIN}" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}" ${QUERY_ARG}
}


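The new PREFER_ROUND_ROBIN variable follows the same pattern as PREFER_HASH_JOIN: it defaults to true and is forwarded to the tpch and clickbench binaries as --prefer_round_robin. A hypothetical invocation (assuming bench.sh's usual data/run workflow) that exercises on-demand repartitioning on the new scale factor 50 data set: ./bench.sh data tpch50, then PREFER_ROUND_ROBIN=false ./bench.sh run tpch50.
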
12 changes: 12 additions & 0 deletions benchmarks/src/clickbench.rs
@@ -29,6 +29,9 @@ use datafusion_common::exec_datafusion_err;
use datafusion_common::instant::Instant;
use structopt::StructOpt;

/// hack to avoid `default_value is meaningless for bool` errors
type BoolDefaultTrue = bool;

/// Driver program to run the ClickBench benchmark
///
/// The ClickBench[1] benchmarks are widely cited in the industry and
@@ -78,6 +81,11 @@ pub struct RunOpt {
/// If present, write results json here
#[structopt(parse(from_os_str), short = "o", long = "output")]
output_path: Option<PathBuf>,

/// If true then round robin repartitioning is used; if false, on demand repartitioning.
/// True by default.
#[structopt(short = "r", long = "prefer_round_robin", default_value = "true")]
prefer_round_robin: BoolDefaultTrue,
}

/// Get the SQL file path
@@ -138,6 +146,10 @@ impl RunOpt {
}
}

config
.options_mut()
.optimizer
.prefer_round_robin_repartition = self.prefer_round_robin;
let rt_builder = self.common.runtime_env_builder()?;
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
self.register_hits(&ctx).await?;
11 changes: 11 additions & 0 deletions benchmarks/src/tpch/run.rs
@@ -92,6 +92,11 @@ pub struct RunOpt {
#[structopt(short = "j", long = "prefer_hash_join", default_value = "true")]
prefer_hash_join: BoolDefaultTrue,

/// If true then round robin repartitioning is used; if false, on demand repartitioning.
/// True by default.
#[structopt(short = "r", long = "prefer_round_robin", default_value = "true")]
prefer_round_robin: BoolDefaultTrue,

/// Mark the first column of each table as sorted in ascending order.
/// The tables should have been created with the `--sort` option for this to have any effect.
#[structopt(short = "t", long = "sorted")]
@@ -112,6 +117,10 @@ impl RunOpt {
.config()?
.with_collect_statistics(!self.disable_statistics);
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
config
.options_mut()
.optimizer
.prefer_round_robin_repartition = self.prefer_round_robin;
let rt_builder = self.common.runtime_env_builder()?;
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
// register tables
@@ -379,6 +388,7 @@ mod tests {
output_path: None,
disable_statistics: false,
prefer_hash_join: true,
prefer_round_robin: true,
sorted: false,
};
opt.register_tables(&ctx).await?;
@@ -416,6 +426,7 @@ mod tests {
output_path: None,
disable_statistics: false,
prefer_hash_join: true,
prefer_round_robin: true,
sorted: false,
};
opt.register_tables(&ctx).await?;
4 changes: 4 additions & 0 deletions datafusion/common/src/config.rs
@@ -708,6 +708,10 @@ config_namespace! {
/// repartitioning to increase parallelism to leverage more CPU cores
pub enable_round_robin_repartition: bool, default = true

/// When set to false, the physical plan optimizer will replace the round robin
/// repartitioning with on demand repartitioning
pub prefer_round_robin_repartition: bool, default = true

/// When set to true, the optimizer will attempt to perform limit operations
/// during aggregations, if possible
pub enable_topk_aggregation: bool, default = true
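
The new flag is consumed exactly the way the benchmark binaries above consume it. A minimal sketch (not code from this PR; it assumes the datafusion crate's prelude re-exports) of opting in to on-demand repartitioning programmatically:

use datafusion::prelude::{SessionConfig, SessionContext};

fn main() {
    // Turning the preference off makes the physical optimizer insert
    // OnDemandRepartitionExec where it would otherwise use round robin
    // RepartitionExec.
    let mut config = SessionConfig::new();
    config.options_mut().optimizer.prefer_round_robin_repartition = false;
    let _ctx = SessionContext::new_with_config(config);
}

Presumably the option is also reachable as SET datafusion.optimizer.prefer_round_robin_repartition = false, following the key scheme of the sibling enable_round_robin_repartition option (an assumption, not something shown in this diff).
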
104 changes: 81 additions & 23 deletions datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs
@@ -44,6 +44,7 @@ mod sp_repartition_fuzz_tests {
use datafusion_physical_expr::ConstExpr;
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr};
use datafusion_physical_plan::repartition::on_demand_repartition::OnDemandRepartitionExec;
use test_utils::add_empty_batches;

use itertools::izip;
@@ -295,25 +296,40 @@
// behaviour. We can choose n_distinct as we like. However,
// we chose a large number to decrease the probability of duplicate rows in the table.
let n_distinct = 1_000_000;
for (is_first_roundrobin, is_first_sort_preserving) in
[(false, false), (false, true), (true, false), (true, true)]
{
for is_second_roundrobin in [false, true] {
let mut handles = Vec::new();

for seed in seed_start..seed_end {
#[allow(clippy::disallowed_methods)] // spawn allowed only in tests
let job = tokio::spawn(run_sort_preserving_repartition_test(
make_staggered_batches::<true>(n_row, n_distinct, seed as u64),
is_first_roundrobin,
is_first_sort_preserving,
is_second_roundrobin,
));
handles.push(job);
}

for job in handles {
job.await.unwrap();
for use_on_demand_repartition in [false, true] {
for (is_first_roundrobin, is_first_sort_preserving) in
[(false, false), (false, true), (true, false), (true, true)]
{
for is_second_roundrobin in [false, true] {
// On demand repartition only replaces the round robin repartition
if use_on_demand_repartition
&& !is_first_roundrobin
&& !is_second_roundrobin
{
continue;
}
let mut handles = Vec::new();

for seed in seed_start..seed_end {
#[allow(clippy::disallowed_methods)]
// spawn allowed only in tests
let job = tokio::spawn(run_sort_preserving_repartition_test(
make_staggered_batches::<true>(
n_row,
n_distinct,
seed as u64,
),
is_first_roundrobin,
is_first_sort_preserving,
is_second_roundrobin,
use_on_demand_repartition,
));
handles.push(job);
}

for job in handles {
job.await.unwrap();
}
}
}
}
@@ -342,9 +358,17 @@
// If `true`, second repartition executor after `DataSourceExec` will be in `RoundRobin` mode
// else it will be in `Hash` mode
is_second_roundrobin: bool,
// If `true`, `OnDemandRepartitionExec` will be used instead of `RepartitionExec`
use_on_demand_repartition: bool,
) {
let schema = input1[0].schema();
let session_config = SessionConfig::new().with_batch_size(50);
let mut session_config = SessionConfig::new().with_batch_size(50);
if use_on_demand_repartition {
session_config
.options_mut()
.optimizer
.prefer_round_robin_repartition = false;
}
let ctx = SessionContext::new_with_config(session_config);
let sort_keys = ["a", "b", "c"].map(|ordering_col| {
PhysicalSortExpr::new_default(col(ordering_col, &schema).unwrap())
@@ -362,16 +386,32 @@
let hash_exprs = vec![col("c", &schema).unwrap()];

let intermediate = match (is_first_roundrobin, is_first_sort_preserving) {
(true, true) => sort_preserving_repartition_exec_round_robin(running_source),
(true, false) => repartition_exec_round_robin(running_source),
(true, true) => {
if use_on_demand_repartition {
sort_preserving_repartition_exec_on_demand(running_source)
} else {
sort_preserving_repartition_exec_round_robin(running_source)
}
}
(true, false) => {
if use_on_demand_repartition {
repartition_exec_on_demand(running_source)
} else {
repartition_exec_round_robin(running_source)
}
}
(false, true) => {
sort_preserving_repartition_exec_hash(running_source, hash_exprs.clone())
}
(false, false) => repartition_exec_hash(running_source, hash_exprs.clone()),
};

let intermediate = if is_second_roundrobin {
sort_preserving_repartition_exec_round_robin(intermediate)
if use_on_demand_repartition {
sort_preserving_repartition_exec_on_demand(intermediate)
} else {
sort_preserving_repartition_exec_round_robin(intermediate)
}
} else {
sort_preserving_repartition_exec_hash(intermediate, hash_exprs.clone())
};
@@ -394,6 +434,16 @@
)
}

fn sort_preserving_repartition_exec_on_demand(
input: Arc<dyn ExecutionPlan>,
) -> Arc<dyn ExecutionPlan> {
Arc::new(
OnDemandRepartitionExec::try_new(input, Partitioning::OnDemand(2))
.unwrap()
.with_preserve_order(),
)
}

fn repartition_exec_round_robin(
input: Arc<dyn ExecutionPlan>,
) -> Arc<dyn ExecutionPlan> {
@@ -402,6 +452,14 @@
)
}

fn repartition_exec_on_demand(
input: Arc<dyn ExecutionPlan>,
) -> Arc<dyn ExecutionPlan> {
Arc::new(
OnDemandRepartitionExec::try_new(input, Partitioning::OnDemand(2)).unwrap(),
)
}

fn sort_preserving_repartition_exec_hash(
input: Arc<dyn ExecutionPlan>,
hash_expr: Vec<Arc<dyn PhysicalExpr>>,
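
The two new on-demand helpers reduce to a single pattern. A condensed sketch (crate path for OnDemandRepartitionExec taken from this diff's imports; the import of Partitioning is assumed to match the surrounding test code, and the partition count 2 is the arbitrary choice the test uses):

use std::sync::Arc;

use datafusion_physical_plan::repartition::on_demand_repartition::OnDemandRepartitionExec;
use datafusion_physical_plan::{ExecutionPlan, Partitioning};

fn on_demand(
    input: Arc<dyn ExecutionPlan>,
    preserve_order: bool,
) -> Arc<dyn ExecutionPlan> {
    // Partitioning::OnDemand(2) produces two output partitions that pull
    // batches from the input only as downstream consumers request them,
    // instead of eagerly distributing batches round robin.
    let exec =
        OnDemandRepartitionExec::try_new(input, Partitioning::OnDemand(2)).unwrap();
    if preserve_order {
        // Matches sort_preserving_repartition_exec_on_demand above: keep the
        // input's sort order across the repartition boundary.
        Arc::new(exec.with_preserve_order())
    } else {
        Arc::new(exec)
    }
}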