ARROW-10292: [Rust] [DataFusion] Simplify merge

jorgecarleitao · kszucs · commit 7209ffcb58e6 · 2020-10-19T20:00:07.000+02:00
Currently, `MergeExec` uses `tokio::spawn` to parallelize the work, by calling `tokio::spawn` once per logical thread. However, `tokio::spawn` returns a task / future, which the `tokio` runtime then schedules on its thread pool. Therefore, there is no need to limit the number of tasks to the number of logical threads, as tokio's runtime itself is responsible for that work. In particular, since we are using [`rt-threaded`](https://docs.rs/tokio/0.2.22/tokio/runtime/index.html#threaded-scheduler), tokio already declares a thread pool from the number of logical threads available.

This PR removes the coupling, in `MergeExec`, between the number of logical threads (`max_concurrency`) and the number of created tasks.

I observe no change in performance:

<details>
<summary>Benchmark results</summary>

```
Switched to branch 'simplify_merge'
Your branch is up to date with 'origin/simplify_merge'.
   Compiling datafusion v2.0.0-SNAPSHOT (/Users/jorgecarleitao/projects/arrow/rust/datafusion)
    Finished bench [optimized] target(s) in 38.02s
     Running /Users/jorgecarleitao/projects/arrow/rust/target/release/deps/aggregate_query_sql-5241a705a1ff29ae
Gnuplot not found, using plotters backend
aggregate_query_no_group_by 15 12
        time:   [715.17 us 722.60 us 730.19 us]
        change: [-8.3167% -5.2253% -2.2675%] (p = 0.00 < 0.05)
        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) high mild
  2 (2.00%) high severe
aggregate_query_group_by 15 12
        time:   [5.6538 ms 5.6695 ms 5.6892 ms]
        change: [+0.1012% +0.5308% +0.9913%] (p = 0.02 < 0.05)
        Change within noise threshold.
Found 10 outliers among 100 measurements (10.00%)
  4 (4.00%) high mild
  6 (6.00%) high severe
aggregate_query_group_by_with_filter 15 12
        time:   [2.6598 ms 2.6665 ms 2.6751 ms]
        change: [-0.5532% -0.1446% +0.2679%] (p = 0.51 > 0.05)
        No change in performance detected.
Found 7 outliers among 100 measurements (7.00%)
  3 (3.00%) high mild
  4 (4.00%) high severe
```

</details>

Closes #8453 from jorgecarleitao/simplify_merge

Authored-by: Jorge C. Leitao <jorgecarleitao@gmail.com>
Signed-off-by: Jorge C. Leitao <jorgecarleitao@gmail.com>
diff --git a/rust/datafusion/src/execution/context.rs b/rust/datafusion/src/execution/context.rs
@@ -332,7 +332,7 @@ impl ExecutionContext {
             }
             _ => {
                 // merge into a single partition
-                let plan = MergeExec::new(plan.clone(), self.state.config.concurrency);
+                let plan = MergeExec::new(plan.clone());
                 // MergeExec must produce a single partition
                 assert_eq!(1, plan.output_partitioning().partition_count());
                 common::collect(plan.execute(0).await?)
diff --git a/rust/datafusion/src/physical_plan/hash_aggregate.rs b/rust/datafusion/src/physical_plan/hash_aggregate.rs
@@ -810,7 +810,7 @@ mod tests {
             .unwrap();
         assert_eq!(*sums, Float64Array::from(vec![2.0, 7.0, 11.0]));
 
-        let merge = Arc::new(MergeExec::new(partial_aggregate, 2));
+        let merge = Arc::new(MergeExec::new(partial_aggregate));
 
         let final_group: Vec<Arc<dyn PhysicalExpr>> =
             (0..groups.len()).map(|i| col(&groups[i].1)).collect();
diff --git a/rust/datafusion/src/physical_plan/limit.rs b/rust/datafusion/src/physical_plan/limit.rs
@@ -243,8 +243,7 @@ mod tests {
         // input should have 4 partitions
         assert_eq!(csv.output_partitioning().partition_count(), num_partitions);
 
-        let limit =
-            GlobalLimitExec::new(Arc::new(MergeExec::new(Arc::new(csv), 2)), 7, 2);
+        let limit = GlobalLimitExec::new(Arc::new(MergeExec::new(Arc::new(csv))), 7, 2);
 
         // the result should contain 4 batches (one per input partition)
         let iter = limit.execute(0).await?;
diff --git a/rust/datafusion/src/physical_plan/merge.rs b/rust/datafusion/src/physical_plan/merge.rs
@@ -32,25 +32,20 @@ use arrow::record_batch::RecordBatch;
 use super::SendableRecordBatchReader;
 
 use async_trait::async_trait;
-use tokio::task::{self, JoinHandle};
+use tokio;
 
 /// Merge execution plan executes partitions in parallel and combines them into a single
 /// partition. No guarantees are made about the order of the resulting partition.
 #[derive(Debug)]
 pub struct MergeExec {
     /// Input execution plan
     input: Arc<dyn ExecutionPlan>,
-    /// Maximum number of concurrent threads
-    concurrency: usize,
 }
 
 impl MergeExec {
     /// Create a new MergeExec
-    pub fn new(input: Arc<dyn ExecutionPlan>, max_concurrency: usize) -> Self {
-        MergeExec {
-            input,
-            concurrency: max_concurrency,
-        }
+    pub fn new(input: Arc<dyn ExecutionPlan>) -> Self {
+        MergeExec { input }
     }
 }
 
@@ -79,10 +74,7 @@ impl ExecutionPlan for MergeExec {
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         match children.len() {
-            1 => Ok(Arc::new(MergeExec::new(
-                children[0].clone(),
-                self.concurrency,
-            ))),
+            1 => Ok(Arc::new(MergeExec::new(children[0].clone()))),
             _ => Err(ExecutionError::General(
                 "MergeExec wrong number of children".to_string(),
             )),
@@ -108,35 +100,23 @@ impl ExecutionPlan for MergeExec {
                 self.input.execute(0).await
             }
             _ => {
-                let partitions_per_thread = (input_partitions / self.concurrency).max(1);
-                let range: Vec<usize> = (0..input_partitions).collect();
-                let chunks = range.chunks(partitions_per_thread);
-
-                let mut tasks = vec![];
-                for chunk in chunks {
-                    let chunk = chunk.to_vec();
-                    let input = self.input.clone();
-                    let task: JoinHandle<Result<Vec<Arc<RecordBatch>>>> =
-                        task::spawn(async move {
-                            let mut batches: Vec<Arc<RecordBatch>> = vec![];
-                            for partition in chunk {
-                                let it = input.execute(partition).await?;
-                                common::collect(it).iter().for_each(|b| {
-                                    b.iter()
-                                        .for_each(|b| batches.push(Arc::new(b.clone())))
-                                });
-                            }
-                            Ok(batches)
-                        });
-                    tasks.push(task);
-                }
+                let tasks = (0..input_partitions)
+                    .map(|part_i| {
+                        let input = self.input.clone();
+                        tokio::spawn(async move {
+                            let it = input.execute(part_i).await?;
+                            common::collect(it)
+                        })
+                    })
+                    // this collect *is needed* so that the join below can
+                    // switch between tasks
+                    .collect::<Vec<_>>();
 
-                // combine the results from each thread
                 let mut combined_results: Vec<Arc<RecordBatch>> = vec![];
                 for task in tasks {
                     let result = task.await.unwrap()?;
                     for batch in &result {
-                        combined_results.push(batch.clone());
+                        combined_results.push(Arc::new(batch.clone()));
                     }
                 }
 
@@ -171,7 +151,7 @@ mod tests {
         // input should have 4 partitions
         assert_eq!(csv.output_partitioning().partition_count(), num_partitions);
 
-        let merge = MergeExec::new(Arc::new(csv), 2);
+        let merge = MergeExec::new(Arc::new(csv));
 
         // output of MergeExec should have a single partition
         assert_eq!(merge.output_partitioning().partition_count(), 1);
diff --git a/rust/datafusion/src/physical_plan/planner.rs b/rust/datafusion/src/physical_plan/planner.rs
@@ -117,10 +117,7 @@ impl DefaultPhysicalPlanner {
                             if child.output_partitioning().partition_count() == 1 {
                                 child.clone()
                             } else {
-                                Arc::new(MergeExec::new(
-                                    child.clone(),
-                                    ctx_state.config.concurrency,
-                                ))
+                                Arc::new(MergeExec::new(child.clone()))
                             }
                         })
                         .collect(),
diff --git a/rust/datafusion/src/physical_plan/sort.rs b/rust/datafusion/src/physical_plan/sort.rs
@@ -208,7 +208,7 @@ mod tests {
                     options: SortOptions::default(),
                 },
             ],
-            Arc::new(MergeExec::new(Arc::new(csv), 2)),
+            Arc::new(MergeExec::new(Arc::new(csv))),
             2,
         )?);
 

Original file line number	Diff line number	Diff line change
`@@ -332,7 +332,7 @@ impl ExecutionContext {`
`332`	`332`	`}`
`333`	`333`	`_ => {`
`334`	`334`	`// merge into a single partition`
`335`		`- let plan = MergeExec::new(plan.clone(), self.state.config.concurrency);`
	`335`	`+ let plan = MergeExec::new(plan.clone());`
`336`	`336`	`// MergeExec must produce a single partition`
`337`	`337`	`assert_eq!(1, plan.output_partitioning().partition_count());`
`338`	`338`	`common::collect(plan.execute(0).await?)`