From f4959210fbe66a48603ccdf91d49c2b76a6d2b3e Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Tue, 19 Mar 2024 18:53:01 -0500 Subject: [PATCH 01/27] for debug --- datafusion/expr/src/tree_node/expr.rs | 2 + .../optimizer/src/common_subexpr_eliminate.rs | 132 +++++++++++++++--- 2 files changed, 113 insertions(+), 21 deletions(-) diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs index 1c672851e9b5b..64967b509cca2 100644 --- a/datafusion/expr/src/tree_node/expr.rs +++ b/datafusion/expr/src/tree_node/expr.rs @@ -79,6 +79,7 @@ impl TreeNode for Expr { | Expr::Wildcard {..} | Expr::Placeholder (_) => vec![], Expr::BinaryExpr(BinaryExpr { left, right, .. }) => { + println!("left is {:?}, right is {:?}", left, right); vec![left.as_ref(), right.as_ref()] } Expr::Like(Like { expr, pattern, .. }) @@ -122,6 +123,7 @@ impl TreeNode for Expr { let mut expr_vec = args.iter().collect::>(); expr_vec.extend(partition_by); expr_vec.extend(order_by); + println!("expressions vector is {:?}", expr_vec); expr_vec } Expr::InList(InList { expr, list, .. }) => { diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 0c9064d0641fc..5a139b818cefd 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use crate::utils::is_volatile_expression; use crate::{utils, OptimizerConfig, OptimizerRule}; +use arrow::array; use arrow::datatypes::DataType; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, @@ -34,6 +35,7 @@ use datafusion_common::{ use datafusion_expr::expr::Alias; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; use datafusion_expr::{col, Expr, ExprSchemable}; +use regex_syntax::ast::print; /// A map from expression's identifier to tuple including /// - the expression itself (cloned) @@ -120,14 +122,25 @@ impl CommonSubexprEliminate { .iter() .zip(arrays_list.iter()) .map(|(exprs, arrays)| { - exprs + println!("************ \n expr is {:?} \n ************ \n", exprs); + let res = exprs .iter() .cloned() .zip(arrays.iter()) .map(|(expr, id_array)| { + println!( + "************ \n current expr is {:?} \n ************ \n", + expr + ); + println!( + "************ \n current id_array is {:?} \n ************ \n", + id_array + ); replace_common_expr(expr, id_array, expr_set, affected_id) }) - .collect::>>() + .collect::>>(); + println!("************ \n expr is {:?} \n ************ \n", res); + res }) .collect::>>() } @@ -140,6 +153,18 @@ impl CommonSubexprEliminate { expr_set: &ExprSet, config: &dyn OptimizerConfig, ) -> Result<(Vec>, LogicalPlan)> { + println!( + "************ \n current plan is {:?} \n *********** \n", + input + ); + println!( + "************ \n exprs list are {:?} \n *********** \n", + exprs_list + ); + println!( + "************ \n array list are {:?} \n *********** \n", + arrays_list + ); let mut affected_id = BTreeSet::::new(); let rewrite_exprs = @@ -151,7 +176,14 @@ impl CommonSubexprEliminate { if !affected_id.is_empty() { new_input = build_common_expr_project_plan(new_input, affected_id, expr_set)?; } - + println!( + "\n *********** \n rewrite expressions are {:?} \n *********** \n", + rewrite_exprs + ); + println!( + "\n *********** \n new input is {:?} \n *********** \n", + new_input + ); Ok((rewrite_exprs, new_input)) } @@ -187,7 +219,10 @@ impl CommonSubexprEliminate { window_exprs.push(window_expr); arrays_per_window.push(arrays); } - + println!( + "\n *********** \n window exprs are {:?} \n *********** \n", + window_exprs + ); let mut window_exprs = window_exprs .iter() .map(|expr| expr.as_slice()) @@ -196,7 +231,10 @@ impl CommonSubexprEliminate { .iter() .map(|arrays| arrays.as_slice()) .collect::>(); - + println!( + "\n *********** \n arrays per windows are {:?} \n *********** \n", + arrays_per_window + ); assert_eq!(window_exprs.len(), arrays_per_window.len()); let (mut new_expr, new_input) = self.rewrite_expr( &window_exprs, @@ -205,8 +243,19 @@ impl CommonSubexprEliminate { &expr_set, config, )?; + println!( + "\n *********** \n new exprs are {:?} \n *********** \n", + new_expr + ); + println!( + "\n *********** \n new input are {:?} \n *********** \n", + new_input + ); assert_eq!(window_exprs.len(), new_expr.len()); - + println!( + "\n *********** \n before plan are {:?} \n *********** \n", + plan + ); // Construct consecutive window operator, with their corresponding new window expressions. plan = new_input; while let Some(new_window_expr) = new_expr.pop() { @@ -220,13 +269,17 @@ impl CommonSubexprEliminate { .into_iter() .zip(orig_window_expr.iter()) .map(|(new_window_expr, window_expr)| { + println!("new_window_expr is {:?}", new_window_expr); let original_name = window_expr.name_for_alias()?; new_window_expr.alias_if_changed(original_name) }) .collect::>>()?; plan = LogicalPlan::Window(Window::try_new(new_window_expr, Arc::new(plan))?); } - + println!( + "\n *********** \n final plan are {:?} \n *********** \n", + plan + ); Ok(plan) } @@ -353,6 +406,8 @@ impl CommonSubexprEliminate { config: &dyn OptimizerConfig, ) -> Result { let expr = plan.expressions(); + println!("********** \n expressions are {:?} \n ********* \n", expr); + println!("********** \n ur plan is {:?} \n ********* \n", plan); let inputs = plan.inputs(); let input = inputs[0]; let input_schema = Arc::clone(input.schema()); @@ -360,11 +415,17 @@ impl CommonSubexprEliminate { // Visit expr list and build expr identifier to occuring count map (`expr_set`). let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?; - + println!("********* \n arrays {:?} \n ********* \n", arrays); + println!("********* \n expr_set {:?} \n ********* \n", expr_set); let (mut new_expr, new_input) = self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?; - - plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]) + println!( + "********* \n new expr is {:?} \n and new input is {:?} ********* \n", + new_expr, new_input + ); + let res_plan = plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]); + println!("********* \n res plan is {:?} \n ********* \n", res_plan); + res_plan } } @@ -412,9 +473,17 @@ impl OptimizerRule for CommonSubexprEliminate { }; let original_schema = plan.schema().clone(); + println!( + "***************** \n original schema is {:?} \n ***************", + original_schema + ); match optimized_plan { Some(optimized_plan) if optimized_plan.schema() != &original_schema => { // add an additional projection if the output schema changed. + println!( + "************** \n optimized schema {:?} \n ************", + optimized_plan.schema(), + ); Ok(Some(build_recover_project_plan( &original_schema, optimized_plan, @@ -454,9 +523,12 @@ fn to_arrays( expr_set: &mut ExprSet, expr_mask: ExprMask, ) -> Result>> { - expr.iter() + println!("schema is {:?}", input_schema); + let res = expr + .iter() .map(|e| { let mut id_array = vec![]; + println!("current expression is {:?}", e); expr_to_identifier( e, expr_set, @@ -467,7 +539,9 @@ fn to_arrays( Ok(id_array) }) - .collect::>>() + .collect::>>(); + println!("res is {:?}", res); + res } /// Build the "intermediate" projection plan that evaluates the extracted common expressions. @@ -674,6 +748,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { } fn f_up(&mut self, expr: &Expr) -> Result { + println!("********* \n expr is {:?} \n *********", expr); self.series_number += 1; let (idx, sub_expr_identifier) = self.pop_enter_mark(); @@ -693,7 +768,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { self.visit_stack.push(VisitRecord::ExprItem(desc.clone())); let data_type = expr.get_type(&self.input_schema)?; - + print!("data type is {:?}", data_type); self.expr_set .entry(desc) .or_insert_with(|| (expr.clone(), 0, data_type)) @@ -747,6 +822,10 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate // the `id_array`, which records the expr's identifier used to rewrite expr. So if we // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too. + println!( + "****************** \n expr is {:?} \n ****************** \n ", + expr + ); if expr.short_circuits() || is_volatile_expression(&expr)? { return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump)); } @@ -819,14 +898,25 @@ fn replace_common_expr( expr_set: &ExprSet, affected_id: &mut BTreeSet, ) -> Result { - expr.rewrite(&mut CommonSubexprRewriter { - expr_set, - id_array, - affected_id, - max_series_number: 0, - curr_index: 0, - }) - .data() + println!("************ \n expr are {:?} \n ************", expr); + println!( + "************ \n id_array are {:?} \n ************", + id_array + ); + let rewrited = expr + .rewrite(&mut CommonSubexprRewriter { + expr_set, + id_array, + affected_id, + max_series_number: 0, + curr_index: 0, + }) + .data(); + println!( + "************ \n rewrited values are {:?} \n ************", + rewrited + ); + rewrited } #[cfg(test)] From cc1d4125688f98bace96adac7ab80046a3550898 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Wed, 20 Mar 2024 20:10:46 -0500 Subject: [PATCH 02/27] adding projections for subquery --- .../optimizer/src/common_subexpr_eliminate.rs | 204 ++++++++++++++++-- 1 file changed, 189 insertions(+), 15 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 5a139b818cefd..d2a6c3bc270d2 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -17,24 +17,28 @@ //! Eliminate common sub-expression. -use std::collections::{BTreeSet, HashMap}; -use std::sync::Arc; - use crate::utils::is_volatile_expression; use crate::{utils, OptimizerConfig, OptimizerRule}; +use datafusion_expr::Expr; +use std::collections::{BTreeSet, HashMap}; +use std::slice::Windows; +use std::sync::Arc; use arrow::array; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, }; use datafusion_common::{ - internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, + internal_datafusion_err, internal_err, plan_err, Column, DFField, DFSchema, + DFSchemaRef, DataFusionError, Result, }; -use datafusion_expr::expr::Alias; +use datafusion_expr::expr::{Alias, WindowFunction}; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; -use datafusion_expr::{col, Expr, ExprSchemable}; +use datafusion_expr::tree_node::plan; +use datafusion_expr::{col, BinaryExpr, Cast, ExprSchemable}; +use hashbrown::HashSet; use regex_syntax::ast::print; /// A map from expression's identifier to tuple including @@ -171,7 +175,7 @@ impl CommonSubexprEliminate { self.rewrite_exprs_list(exprs_list, arrays_list, expr_set, &mut affected_id)?; let mut new_input = self - .try_optimize(input, config)? + .common_optimize(input, config)? .unwrap_or_else(|| input.clone()); if !affected_id.is_empty() { new_input = build_common_expr_project_plan(new_input, affected_id, expr_set)?; @@ -262,9 +266,13 @@ impl CommonSubexprEliminate { // Since `new_expr` and `window_exprs` length are same. We can safely `.unwrap` here. let orig_window_expr = window_exprs.pop().unwrap(); assert_eq!(new_window_expr.len(), orig_window_expr.len()); - + println!( + "************* \n origin window expressions {:?} \n ************** \n ", + orig_window_expr + ); // Rename new re-written window expressions with original name (by giving alias) // Otherwise we may receive schema error, in subsequent operators. + println!("************* \n before changed window expressions {:?} \n ************** \n ", new_window_expr); let new_window_expr = new_window_expr .into_iter() .zip(orig_window_expr.iter()) @@ -274,6 +282,7 @@ impl CommonSubexprEliminate { new_window_expr.alias_if_changed(original_name) }) .collect::>>()?; + println!("************* \n after changed window expressions {:?} \n ************** \n ", new_window_expr); plan = LogicalPlan::Window(Window::try_new(new_window_expr, Arc::new(plan))?); } println!( @@ -428,9 +437,8 @@ impl CommonSubexprEliminate { res_plan } } - -impl OptimizerRule for CommonSubexprEliminate { - fn try_optimize( +impl CommonSubexprEliminate { + fn common_optimize( &self, plan: &LogicalPlan, config: &dyn OptimizerConfig, @@ -484,6 +492,10 @@ impl OptimizerRule for CommonSubexprEliminate { "************** \n optimized schema {:?} \n ************", optimized_plan.schema(), ); + println!( + "************** \n current plan is {:?} \n ************", + optimized_plan, + ); Ok(Some(build_recover_project_plan( &original_schema, optimized_plan, @@ -492,6 +504,33 @@ impl OptimizerRule for CommonSubexprEliminate { plan => Ok(plan), } } + /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the + /// + fn add_extra_projection(&self, plan: &LogicalPlan) -> Result> { + let res = plan + .clone() + .rewrite(&mut ProjectionAdder { + depth_map: HashMap::new(), + depth: 0, + }) + .map(|transformed| Some(transformed.data)); + println!("************** \n result plan is {:?} *********** \n", res); + res + } +} +impl OptimizerRule for CommonSubexprEliminate { + fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + let optimized_plan_option = self.common_optimize(plan, config)?; + let plan = match optimized_plan_option { + Some(plan) => plan, + _ => plan.clone(), + }; + self.add_extra_projection(&plan) + } fn name(&self) -> &str { "common_sub_expression_eliminate" @@ -557,6 +596,10 @@ fn build_common_expr_project_plan( match expr_set.get(&id) { Some((expr, _, data_type)) => { // todo: check `nullable` + println!( + "\n ********** expr is {:?} \n \n and data type is {:?} \n **************** \n", + expr, data_type + ); let field = DFField::new_unqualified(&id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); @@ -590,8 +633,19 @@ fn build_recover_project_plan( let col_exprs = schema .fields() .iter() - .map(|field| Expr::Column(field.qualified_column())) + .map(|field| { + println!("field is {:?}", field); + Expr::Column(field.qualified_column()) + }) .collect(); + println!( + "************** \n col_exprs are {:?} \n **************", + col_exprs + ); + println!( + "************** \n current plan is {:?} \n **************", + input + ); Ok(LogicalPlan::Projection(Projection::try_new( col_exprs, Arc::new(input), @@ -919,6 +973,126 @@ fn replace_common_expr( rewrited } +struct ProjectionAdder { + depth_map: HashMap>, + depth: u128, +} + +impl ProjectionAdder { + // TODO: adding more expressions for sub query, currently only support for Simple Binary Expressions + fn get_complex_expressions( + exprs: Vec, + schema: DFSchemaRef, + ) -> HashSet { + let mut res = HashSet::new(); + for expr in exprs { + match expr { + Expr::BinaryExpr(BinaryExpr { + left: ref l_box, + op: _, + right: ref r_box, + }) => { + match (&**l_box, &**r_box) { + (Expr::Column(l), Expr::Column(r)) => { + // Both `left` and `right` are `Expr::Column`, so we push to `res` + if schema.has_column(l) && schema.has_column(r) { + res.insert(DFField::new_unqualified( + &expr.to_string(), + schema + .field_from_column(l) + .unwrap() + .data_type() + .clone(), + true, + )); + } + } + // If they are not both `Expr::Column`, you can handle other cases or do nothing + _ => {} + } + } + Expr::Cast(_) => { + res.extend(Self::get_complex_expressions(vec![expr], schema.clone())) + } + + Expr::WindowFunction(WindowFunction { fun, args, .. }) => { + res.extend(Self::get_complex_expressions(args, schema.clone())) + } + _ => {} + } + } + res + } +} +impl TreeNodeRewriter for ProjectionAdder { + type Node = LogicalPlan; + /// currently we just collect the complex bianryOP + + fn f_down(&mut self, node: Self::Node) -> Result> { + // use depth to trace where we are in the LogicalPlan tree + self.depth += 1; + // extract all expressions + check whether it contains in depth_sets + let exprs = node.expressions(); + println!("********* \n cur plan is {:?} \n *********** \n", node); + println!("********* \n exprs are {:?} \n *********** \n", exprs); + let depth_set = self.depth_map.entry(self.depth).or_default(); + println!("self.input schema is {:?}", node.schema()); + depth_set.extend(Self::get_complex_expressions(exprs, node.schema().clone())); + Ok(Transformed::no(node)) + } + fn f_up(&mut self, node: Self::Node) -> Result> { + self.depth -= 1; + println!("*********** \ncur plan is {:?} \n*************\n", node); + + let current_depth_schema = + self.depth_map.get(&self.depth).cloned().unwrap_or_default(); + + // get the intersected part + let added_schema = self + .depth_map + .iter() + .filter(|(&depth, _)| depth < self.depth) + .fold(current_depth_schema, |acc, (_, fields)| { + acc.intersection(fields).cloned().collect() + }); + + // do not do extra things + if added_schema.is_empty() { + return Ok(Transformed::no(node)); + } + + println!("\n*************\n{:?}\n*************\n", added_schema); + + match node { + // do not add for Projections + LogicalPlan::Projection(_) => Ok(Transformed::no(node)), + _ => { + let mut col_exprs: Vec = node + .schema() + .fields() + .iter() + .map(|field| Expr::Column(field.qualified_column())) + .collect(); + + col_exprs.extend( + added_schema + .iter() + .map(|field| Expr::Column(field.qualified_column())), + ); + + // adding new plan here + let new_plan = LogicalPlan::Projection(Projection::try_new( + col_exprs.clone(), + Arc::new(node.clone()), + )?); + + Ok(Transformed::yes( + node.with_new_exprs(col_exprs, [new_plan].to_vec())?, + )) + } + } + } +} #[cfg(test)] mod test { use std::iter; @@ -943,7 +1117,7 @@ mod test { fn assert_optimized_plan_eq(expected: &str, plan: &LogicalPlan) { let optimizer = CommonSubexprEliminate {}; let optimized_plan = optimizer - .try_optimize(plan, &OptimizerContext::new()) + .common_optimize(plan, &OptimizerContext::new()) .unwrap() .expect("failed to optimize plan"); let formatted_plan = format!("{optimized_plan:?}"); @@ -1362,7 +1536,7 @@ mod test { .unwrap(); let rule = CommonSubexprEliminate {}; let optimized_plan = rule - .try_optimize(&plan, &OptimizerContext::new()) + .common_optimize(&plan, &OptimizerContext::new()) .unwrap() .unwrap(); From 1c64d23b02436dd7d20fe192b32914a6c78fde88 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Thu, 21 Mar 2024 20:12:04 -0500 Subject: [PATCH 03/27] refactor code and add tests --- datafusion/expr/src/tree_node/expr.rs | 2 - .../optimizer/src/common_subexpr_eliminate.rs | 168 ++++++++++++------ .../optimizer/src/optimize_projections.rs | 104 ++++++++++- datafusion/sql/src/select.rs | 2 + .../test_files/project_complex_sub_query.slt | 52 ++++++ 5 files changed, 265 insertions(+), 63 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/project_complex_sub_query.slt diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs index 64967b509cca2..1c672851e9b5b 100644 --- a/datafusion/expr/src/tree_node/expr.rs +++ b/datafusion/expr/src/tree_node/expr.rs @@ -79,7 +79,6 @@ impl TreeNode for Expr { | Expr::Wildcard {..} | Expr::Placeholder (_) => vec![], Expr::BinaryExpr(BinaryExpr { left, right, .. }) => { - println!("left is {:?}, right is {:?}", left, right); vec![left.as_ref(), right.as_ref()] } Expr::Like(Like { expr, pattern, .. }) @@ -123,7 +122,6 @@ impl TreeNode for Expr { let mut expr_vec = args.iter().collect::>(); expr_vec.extend(partition_by); expr_vec.extend(order_by); - println!("expressions vector is {:?}", expr_vec); expr_vec } Expr::InList(InList { expr, list, .. }) => { diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index d2a6c3bc270d2..11121f4bd4db7 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -21,6 +21,7 @@ use crate::utils::is_volatile_expression; use crate::{utils, OptimizerConfig, OptimizerRule}; use datafusion_expr::Expr; use std::collections::{BTreeSet, HashMap}; +use std::ops::Deref; use std::slice::Windows; use std::sync::Arc; @@ -486,8 +487,8 @@ impl CommonSubexprEliminate { original_schema ); match optimized_plan { + Some(LogicalPlan::Projection(_)) => Ok(optimized_plan), Some(optimized_plan) if optimized_plan.schema() != &original_schema => { - // add an additional projection if the output schema changed. println!( "************** \n optimized schema {:?} \n ************", optimized_plan.schema(), @@ -510,6 +511,7 @@ impl CommonSubexprEliminate { let res = plan .clone() .rewrite(&mut ProjectionAdder { + data_type_map: HashMap::new(), depth_map: HashMap::new(), depth: 0, }) @@ -525,11 +527,15 @@ impl OptimizerRule for CommonSubexprEliminate { config: &dyn OptimizerConfig, ) -> Result> { let optimized_plan_option = self.common_optimize(plan, config)?; + + // println!("optimized plan option is {:?}", optimized_plan_option); let plan = match optimized_plan_option { Some(plan) => plan, _ => plan.clone(), }; - self.add_extra_projection(&plan) + let res = self.add_extra_projection(&plan); + println!("res is {:?}", res); + res } fn name(&self) -> &str { @@ -600,6 +606,7 @@ fn build_common_expr_project_plan( "\n ********** expr is {:?} \n \n and data type is {:?} \n **************** \n", expr, data_type ); + println!("********* \n {:?} ***********\n", expr); let field = DFField::new_unqualified(&id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); @@ -615,7 +622,11 @@ fn build_common_expr_project_plan( project_exprs.push(Expr::Column(field.qualified_column())); } } - + println!( + "************** \n exprs are {:?} \n ************** \n", + project_exprs + ); + println!("********** \n input are {:?} \n ************** \n", input); Ok(LogicalPlan::Projection(Projection::try_new( project_exprs, Arc::new(input), @@ -974,8 +985,9 @@ fn replace_common_expr( } struct ProjectionAdder { - depth_map: HashMap>, + depth_map: HashMap>, depth: u128, + data_type_map: HashMap, } impl ProjectionAdder { @@ -983,45 +995,55 @@ impl ProjectionAdder { fn get_complex_expressions( exprs: Vec, schema: DFSchemaRef, - ) -> HashSet { + ) -> (HashSet, HashMap) { let mut res = HashSet::new(); + let mut expr_data_type: HashMap = HashMap::new(); + println!( + "********** \n current schema is {:?} \n ******** \n", + schema + ); for expr in exprs { + println!("current expr is {:?}", expr); match expr { Expr::BinaryExpr(BinaryExpr { left: ref l_box, op: _, right: ref r_box, - }) => { - match (&**l_box, &**r_box) { - (Expr::Column(l), Expr::Column(r)) => { - // Both `left` and `right` are `Expr::Column`, so we push to `res` - if schema.has_column(l) && schema.has_column(r) { - res.insert(DFField::new_unqualified( - &expr.to_string(), - schema - .field_from_column(l) - .unwrap() - .data_type() - .clone(), - true, - )); - } - } - // If they are not both `Expr::Column`, you can handle other cases or do nothing - _ => {} + }) => match (&**l_box, &**r_box) { + (Expr::Column(l), Expr::Column(r)) => { + let l_field = schema + .field_from_column(l) + .expect("Field not found for left column"); + + // res.insert(DFField::new_unqualified( + // &expr.to_string(), + // l_field.data_type().clone(), + // true, + // )); + expr_data_type + .entry(expr.clone()) + .or_insert(l_field.data_type().clone()); + res.insert(expr.clone()); } - } - Expr::Cast(_) => { - res.extend(Self::get_complex_expressions(vec![expr], schema.clone())) + _ => {} + }, + Expr::Cast(Cast { expr, data_type }) => { + let (expr_set, type_map) = + Self::get_complex_expressions(vec![*expr], schema.clone()); + res.extend(expr_set); + expr_data_type.extend(type_map); } Expr::WindowFunction(WindowFunction { fun, args, .. }) => { - res.extend(Self::get_complex_expressions(args, schema.clone())) + let (expr_set, type_map) = + Self::get_complex_expressions(args, schema.clone()); + res.extend(expr_set); + expr_data_type.extend(type_map); } _ => {} } } - res + (res, expr_data_type) } } impl TreeNodeRewriter for ProjectionAdder { @@ -1037,57 +1059,97 @@ impl TreeNodeRewriter for ProjectionAdder { println!("********* \n exprs are {:?} \n *********** \n", exprs); let depth_set = self.depth_map.entry(self.depth).or_default(); println!("self.input schema is {:?}", node.schema()); - depth_set.extend(Self::get_complex_expressions(exprs, node.schema().clone())); + let mut schema = node.schema().deref().clone(); + for ip in node.inputs() { + schema.merge(ip.schema()); + } + let (extended_set, data_map) = + Self::get_complex_expressions(exprs, Arc::new(schema)); + println!( + "*********** \n extened set is {:?} \n ********** \n", + extended_set + ); + depth_set.extend(extended_set); + self.data_type_map.extend(data_map); Ok(Transformed::no(node)) } fn f_up(&mut self, node: Self::Node) -> Result> { - self.depth -= 1; - println!("*********** \ncur plan is {:?} \n*************\n", node); + println!("*********** \n cur plan is {:?} \n*************\n", node); let current_depth_schema = self.depth_map.get(&self.depth).cloned().unwrap_or_default(); // get the intersected part - let added_schema = self + let added_expr = self .depth_map .iter() .filter(|(&depth, _)| depth < self.depth) - .fold(current_depth_schema, |acc, (_, fields)| { - acc.intersection(fields).cloned().collect() + .fold(current_depth_schema, |acc, (_, expr)| { + acc.intersection(expr).cloned().collect() }); - + println!( + "******** \n intersected parts are {:?} \n ***********", + added_expr + ); + println!( + "******** \n depth map is {:?} \n ***********", + self.depth_map + ); + println!( + "************ \n data type map is {:?} \n *********** \n", + self.data_type_map + ); + self.depth -= 1; // do not do extra things - if added_schema.is_empty() { + if added_expr.is_empty() { return Ok(Transformed::no(node)); } - println!("\n*************\n{:?}\n*************\n", added_schema); + println!("\n*************\n{:?}\n*************\n", added_expr); match node { // do not add for Projections - LogicalPlan::Projection(_) => Ok(Transformed::no(node)), + LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) => { + Ok(Transformed::no(node)) + } _ => { - let mut col_exprs: Vec = node - .schema() - .fields() - .iter() - .map(|field| Expr::Column(field.qualified_column())) - .collect(); + // avoid recursive add projections + if added_expr.iter().any(|expr| { + node.inputs()[0] + .schema() + .has_column_with_unqualified_name(&expr.to_string()) + }) { + return Ok(Transformed::no(node)); + } - col_exprs.extend( - added_schema - .iter() - .map(|field| Expr::Column(field.qualified_column())), + let mut field_set = HashSet::new(); + let mut project_exprs = vec![]; + for expr in added_expr { + let f = DFField::new_unqualified( + &expr.to_string(), + self.data_type_map[&expr].clone(), + true, + ); + field_set.insert(f.name().to_owned()); + project_exprs.push(expr.clone().alias(expr.to_string())); + } + for field in node.inputs()[0].schema().fields() { + if field_set.insert(field.qualified_name()) { + project_exprs.push(Expr::Column(field.qualified_column())); + } + } + println!( + "************* \n project expressions are {:?} \n ********** \n", + project_exprs ); - // adding new plan here let new_plan = LogicalPlan::Projection(Projection::try_new( - col_exprs.clone(), - Arc::new(node.clone()), + project_exprs, + Arc::new(node.inputs()[0].clone()), )?); - + println!("new plan is {:?}", new_plan); Ok(Transformed::yes( - node.with_new_exprs(col_exprs, [new_plan].to_vec())?, + node.with_new_exprs(node.expressions(), [new_plan].to_vec())?, )) } } diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 08ee38f64abd8..ce8bcbc132d63 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -31,7 +31,8 @@ use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::SchemaRef; use datafusion_common::{ - get_required_group_by_exprs_indices, Column, DFSchema, DFSchemaRef, JoinType, Result, + get_required_group_by_exprs_indices, Column, DFField, DFSchema, DFSchemaRef, + JoinType, Result, }; use datafusion_expr::expr::{Alias, ScalarFunction}; use datafusion_expr::{ @@ -70,6 +71,10 @@ impl OptimizerRule for OptimizeProjections { ) -> Result> { // All output fields are necessary: let indices = (0..plan.schema().fields().len()).collect::>(); + println!( + "\n ************ \n plan.schema is {:?} \n ************ \n", + plan.schema() + ); optimize_projections(plan, config, &indices) } @@ -106,6 +111,8 @@ fn optimize_projections( config: &dyn OptimizerConfig, indices: &[usize], ) -> Result> { + println!("**********\n cur plan is {:?} \n ********* \n", plan); + println!("**********\n indices is {:?} \n ********* \n", indices); // `child_required_indices` stores // - indices of the columns required for each child // - a flag indicating whether putting a projection above children is beneficial for the parent. @@ -277,7 +284,10 @@ fn optimize_projections( // Only use window expressions that are absolutely necessary according // to parent requirements: let new_window_expr = get_at_indices(&window.window_expr, &window_reqs); - + println!( + "************ \n new_window_expr is {:?} \n *********** \n", + new_window_expr + ); // Get all the required column indices at the input, either by the // parent or window expression requirements. let required_indices = get_all_required_indices( @@ -292,7 +302,10 @@ fn optimize_projections( } else { window.input.as_ref().clone() }; - + println!( + "************ \n window child is {:?} *********** \n", + window_child + ); return if new_window_expr.is_empty() { // When no window expression is necessary, use the input directly: Ok(Some(window_child)) @@ -302,8 +315,16 @@ fn optimize_projections( // refers to `old_child`. let required_exprs = get_required_exprs(window.input.schema(), &required_indices); + println!( + "\n ************** \n required exprs {:?} \n ***********\n", + required_exprs + ); let (window_child, _) = add_projection_on_top_if_helpful(window_child, required_exprs)?; + println!( + "\n ************** \nwindow child is {:?} \n ************ \n", + window_child + ); Window::try_new(new_window_expr, Arc::new(window_child)) .map(|window| Some(LogicalPlan::Window(window))) }; @@ -422,6 +443,7 @@ where /// - `Ok(None)`: Signals that merge is not beneficial (and has not taken place). /// - `Err(error)`: An error occured during the function call. fn merge_consecutive_projections(proj: &Projection) -> Result> { + println!("******** \n proj is {:?} \n ********** \n", proj); let LogicalPlan::Projection(prev_projection) = proj.input.as_ref() else { return Ok(None); }; @@ -662,6 +684,10 @@ fn indices_referred_by_exprs<'a>( input_schema: &DFSchemaRef, exprs: impl Iterator, ) -> Result> { + println!( + "************ \n current schema is {:?} \n *********** \n", + input_schema + ); let indices = exprs .map(|expr| indices_referred_by_expr(input_schema, expr)) .collect::>>()?; @@ -690,13 +716,50 @@ fn indices_referred_by_expr( input_schema: &DFSchemaRef, expr: &Expr, ) -> Result> { - let mut cols = expr.to_columns()?; // Get outer-referenced (subquery) columns: + // TODO: Support more Expressions + let mut cols = expr.to_columns()?; outer_columns(expr, &mut cols); - Ok(cols + let mut res_vec: Vec = cols .iter() .flat_map(|col| input_schema.index_of_column(col)) - .collect()) + .collect(); + match expr { + Expr::BinaryExpr(_) => { + if let Some(index) = + input_schema.index_of_column_by_name(None, &expr.to_string())? + { + match res_vec.binary_search(&index) { + Ok(_) => {} + Err(pos) => { + res_vec.insert(pos, index); + } + } + } + } + _ => {} + } + Ok(res_vec) + // match expr { + // Expr::BinaryExpr(_) => { + // if let Some(index) = + // input_schema.index_of_column_by_name(None, &expr.to_string())? + // { + // return Ok(vec![index]); + // } else { + // return Ok(vec![]); + // } + // } + // _ => { + // let mut cols = expr.to_columns()?; + // outer_columns(expr, &mut cols); + // let res_vec: Vec = cols + // .iter() + // .flat_map(|col| input_schema.index_of_column(col)) + // .collect(); + // Ok(res_vec) + // } + // } } /// Gets all required indices for the input; i.e. those required by the parent @@ -733,6 +796,7 @@ fn get_all_required_indices<'a>( /// /// A vector of expressions corresponding to specified indices. fn get_at_indices(exprs: &[Expr], indices: &[usize]) -> Vec { + println!("************* \n expr is {:?} \n ************ \n", exprs); indices .iter() // Indices may point to further places than `exprs` len. @@ -861,17 +925,36 @@ fn rewrite_projection_given_requirements( config: &dyn OptimizerConfig, indices: &[usize], ) -> Result> { + println!("************ \n proj is {:?} \n *********** \n", proj); let exprs_used = get_at_indices(&proj.expr, indices); + println!( + "************ \n exprs_used is {:?} \n *********** \n", + exprs_used + ); let required_indices = indices_referred_by_exprs(proj.input.schema(), exprs_used.iter())?; + println!( + "************ \n required_indices is {:?} \n *********** \n", + required_indices + ); return if let Some(input) = optimize_projections(&proj.input, config, &required_indices)? { if is_projection_unnecessary(&input, &exprs_used)? { Ok(Some(input)) } else { - Projection::try_new(exprs_used, Arc::new(input)) - .map(|proj| Some(LogicalPlan::Projection(proj))) + println!( + "************ \n exprs_used is {:?} \n *********** \n", + exprs_used + ); + println!("************ \n input is {:?} \n *********** \n", input); + let res = Projection::try_new(exprs_used, Arc::new(input)) + .map(|proj| Some(LogicalPlan::Projection(proj))); + println!( + "************ \n new expressions is {:?} \n *********** \n", + res + ); + res } } else if exprs_used.len() < proj.expr.len() { // Projection expression used is different than the existing projection. @@ -893,6 +976,11 @@ fn rewrite_projection_given_requirements( /// - input schema of the projection, output schema of the projection are same, and /// - all projection expressions are either Column or Literal fn is_projection_unnecessary(input: &LogicalPlan, proj_exprs: &[Expr]) -> Result { + println!( + "*************** \n projections schema is {:?} \n input schema is {:?} \n ********** \n", + &projection_schema(input, proj_exprs)?, + input.schema() + ); Ok(&projection_schema(input, proj_exprs)? == input.schema() && proj_exprs.iter().all(is_expr_trivial)) } diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 1bfd60a8ce1aa..7cabfa8661dc8 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -567,6 +567,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Wrap a plan in a projection fn project(&self, input: LogicalPlan, expr: Vec) -> Result { self.validate_schema_satisfies_exprs(input.schema(), &expr)?; + println!("*********** \n input is {:?} \n *********** \n", input); + println!("*********** \n expr is {:?} \n *********** \n", expr); LogicalPlanBuilder::from(input).project(expr)?.build() } diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt new file mode 100644 index 0000000000000..e778d3d0cebbe --- /dev/null +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Prepare Statement Tests +########## + +statement ok +CREATE EXTERNAL TABLE t ( +c1 INT NOT NULL, +c2 INT NOT NULL, +c3 INT NOT NULL, +c4 INT NOT NULL, +c5 INT NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '/Users/xiangyanxin/personal/DATAFUSION/arrow-datafusion/a.csv'; + +query TT +explain SELECT c3+c4, SUM(c3+c4) OVER() +FROM t; +---- +logical_plan +Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +--WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +----Projection: t.c3 + t.c4 AS t.c3 + t.c4, t.c3, t.c4 +------TableScan: t projection=[c3, c4] + +query TT +explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) +FROM t; +---- +logical_plan +Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW +--WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4 +------TableScan: t projection=[c3, c4] From 3e9683996993b3f1fe9be13e7d5e0cc460e68283 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Thu, 21 Mar 2024 21:23:47 -0500 Subject: [PATCH 04/27] optimize code --- datafusion/core/src/dataframe/mod.rs | 1 + .../optimizer/src/common_subexpr_eliminate.rs | 215 +++--------------- .../optimizer/src/optimize_projections.rs | 58 +---- 3 files changed, 36 insertions(+), 238 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index eea5fc1127ceb..2c58b54fa6bd9 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -2013,6 +2013,7 @@ mod tests { .await? .select_columns(&["c1", "c3"])?; let left_rows = left.clone().collect().await?; + println!("hehehehehehehhehehehhehe"); let right_rows = right.clone().collect().await?; let join = left.join(right, JoinType::Inner, &["c1"], &["c1"], None)?; let join_rows = join.collect().await?; diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 11121f4bd4db7..dd78ab5d0b593 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -19,25 +19,22 @@ use crate::utils::is_volatile_expression; use crate::{utils, OptimizerConfig, OptimizerRule}; -use datafusion_expr::Expr; +use datafusion_expr::{Expr, Operator}; use std::collections::{BTreeSet, HashMap}; use std::ops::Deref; -use std::slice::Windows; + use std::sync::Arc; -use arrow::array; -use arrow::datatypes::{DataType, SchemaRef}; +use arrow::datatypes::DataType; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, }; use datafusion_common::{ - internal_datafusion_err, internal_err, plan_err, Column, DFField, DFSchema, - DFSchemaRef, DataFusionError, Result, + internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, }; use datafusion_expr::expr::{Alias, WindowFunction}; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; -use datafusion_expr::tree_node::plan; use datafusion_expr::{col, BinaryExpr, Cast, ExprSchemable}; use hashbrown::HashSet; use regex_syntax::ast::print; @@ -127,24 +124,14 @@ impl CommonSubexprEliminate { .iter() .zip(arrays_list.iter()) .map(|(exprs, arrays)| { - println!("************ \n expr is {:?} \n ************ \n", exprs); let res = exprs .iter() .cloned() .zip(arrays.iter()) .map(|(expr, id_array)| { - println!( - "************ \n current expr is {:?} \n ************ \n", - expr - ); - println!( - "************ \n current id_array is {:?} \n ************ \n", - id_array - ); replace_common_expr(expr, id_array, expr_set, affected_id) }) .collect::>>(); - println!("************ \n expr is {:?} \n ************ \n", res); res }) .collect::>>() @@ -158,18 +145,6 @@ impl CommonSubexprEliminate { expr_set: &ExprSet, config: &dyn OptimizerConfig, ) -> Result<(Vec>, LogicalPlan)> { - println!( - "************ \n current plan is {:?} \n *********** \n", - input - ); - println!( - "************ \n exprs list are {:?} \n *********** \n", - exprs_list - ); - println!( - "************ \n array list are {:?} \n *********** \n", - arrays_list - ); let mut affected_id = BTreeSet::::new(); let rewrite_exprs = @@ -181,14 +156,6 @@ impl CommonSubexprEliminate { if !affected_id.is_empty() { new_input = build_common_expr_project_plan(new_input, affected_id, expr_set)?; } - println!( - "\n *********** \n rewrite expressions are {:?} \n *********** \n", - rewrite_exprs - ); - println!( - "\n *********** \n new input is {:?} \n *********** \n", - new_input - ); Ok((rewrite_exprs, new_input)) } @@ -224,10 +191,6 @@ impl CommonSubexprEliminate { window_exprs.push(window_expr); arrays_per_window.push(arrays); } - println!( - "\n *********** \n window exprs are {:?} \n *********** \n", - window_exprs - ); let mut window_exprs = window_exprs .iter() .map(|expr| expr.as_slice()) @@ -236,10 +199,6 @@ impl CommonSubexprEliminate { .iter() .map(|arrays| arrays.as_slice()) .collect::>(); - println!( - "\n *********** \n arrays per windows are {:?} \n *********** \n", - arrays_per_window - ); assert_eq!(window_exprs.len(), arrays_per_window.len()); let (mut new_expr, new_input) = self.rewrite_expr( &window_exprs, @@ -248,48 +207,25 @@ impl CommonSubexprEliminate { &expr_set, config, )?; - println!( - "\n *********** \n new exprs are {:?} \n *********** \n", - new_expr - ); - println!( - "\n *********** \n new input are {:?} \n *********** \n", - new_input - ); assert_eq!(window_exprs.len(), new_expr.len()); - println!( - "\n *********** \n before plan are {:?} \n *********** \n", - plan - ); // Construct consecutive window operator, with their corresponding new window expressions. plan = new_input; while let Some(new_window_expr) = new_expr.pop() { // Since `new_expr` and `window_exprs` length are same. We can safely `.unwrap` here. let orig_window_expr = window_exprs.pop().unwrap(); assert_eq!(new_window_expr.len(), orig_window_expr.len()); - println!( - "************* \n origin window expressions {:?} \n ************** \n ", - orig_window_expr - ); // Rename new re-written window expressions with original name (by giving alias) // Otherwise we may receive schema error, in subsequent operators. - println!("************* \n before changed window expressions {:?} \n ************** \n ", new_window_expr); let new_window_expr = new_window_expr .into_iter() .zip(orig_window_expr.iter()) .map(|(new_window_expr, window_expr)| { - println!("new_window_expr is {:?}", new_window_expr); let original_name = window_expr.name_for_alias()?; new_window_expr.alias_if_changed(original_name) }) .collect::>>()?; - println!("************* \n after changed window expressions {:?} \n ************** \n ", new_window_expr); plan = LogicalPlan::Window(Window::try_new(new_window_expr, Arc::new(plan))?); } - println!( - "\n *********** \n final plan are {:?} \n *********** \n", - plan - ); Ok(plan) } @@ -416,8 +352,6 @@ impl CommonSubexprEliminate { config: &dyn OptimizerConfig, ) -> Result { let expr = plan.expressions(); - println!("********** \n expressions are {:?} \n ********* \n", expr); - println!("********** \n ur plan is {:?} \n ********* \n", plan); let inputs = plan.inputs(); let input = inputs[0]; let input_schema = Arc::clone(input.schema()); @@ -425,16 +359,9 @@ impl CommonSubexprEliminate { // Visit expr list and build expr identifier to occuring count map (`expr_set`). let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?; - println!("********* \n arrays {:?} \n ********* \n", arrays); - println!("********* \n expr_set {:?} \n ********* \n", expr_set); let (mut new_expr, new_input) = self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?; - println!( - "********* \n new expr is {:?} \n and new input is {:?} ********* \n", - new_expr, new_input - ); let res_plan = plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]); - println!("********* \n res plan is {:?} \n ********* \n", res_plan); res_plan } } @@ -482,21 +409,9 @@ impl CommonSubexprEliminate { }; let original_schema = plan.schema().clone(); - println!( - "***************** \n original schema is {:?} \n ***************", - original_schema - ); match optimized_plan { Some(LogicalPlan::Projection(_)) => Ok(optimized_plan), Some(optimized_plan) if optimized_plan.schema() != &original_schema => { - println!( - "************** \n optimized schema {:?} \n ************", - optimized_plan.schema(), - ); - println!( - "************** \n current plan is {:?} \n ************", - optimized_plan, - ); Ok(Some(build_recover_project_plan( &original_schema, optimized_plan, @@ -516,7 +431,6 @@ impl CommonSubexprEliminate { depth: 0, }) .map(|transformed| Some(transformed.data)); - println!("************** \n result plan is {:?} *********** \n", res); res } } @@ -534,7 +448,6 @@ impl OptimizerRule for CommonSubexprEliminate { _ => plan.clone(), }; let res = self.add_extra_projection(&plan); - println!("res is {:?}", res); res } @@ -568,12 +481,10 @@ fn to_arrays( expr_set: &mut ExprSet, expr_mask: ExprMask, ) -> Result>> { - println!("schema is {:?}", input_schema); let res = expr .iter() .map(|e| { let mut id_array = vec![]; - println!("current expression is {:?}", e); expr_to_identifier( e, expr_set, @@ -585,7 +496,6 @@ fn to_arrays( Ok(id_array) }) .collect::>>(); - println!("res is {:?}", res); res } @@ -602,11 +512,6 @@ fn build_common_expr_project_plan( match expr_set.get(&id) { Some((expr, _, data_type)) => { // todo: check `nullable` - println!( - "\n ********** expr is {:?} \n \n and data type is {:?} \n **************** \n", - expr, data_type - ); - println!("********* \n {:?} ***********\n", expr); let field = DFField::new_unqualified(&id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); @@ -622,11 +527,6 @@ fn build_common_expr_project_plan( project_exprs.push(Expr::Column(field.qualified_column())); } } - println!( - "************** \n exprs are {:?} \n ************** \n", - project_exprs - ); - println!("********** \n input are {:?} \n ************** \n", input); Ok(LogicalPlan::Projection(Projection::try_new( project_exprs, Arc::new(input), @@ -644,19 +544,8 @@ fn build_recover_project_plan( let col_exprs = schema .fields() .iter() - .map(|field| { - println!("field is {:?}", field); - Expr::Column(field.qualified_column()) - }) + .map(|field| Expr::Column(field.qualified_column())) .collect(); - println!( - "************** \n col_exprs are {:?} \n **************", - col_exprs - ); - println!( - "************** \n current plan is {:?} \n **************", - input - ); Ok(LogicalPlan::Projection(Projection::try_new( col_exprs, Arc::new(input), @@ -813,7 +702,6 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { } fn f_up(&mut self, expr: &Expr) -> Result { - println!("********* \n expr is {:?} \n *********", expr); self.series_number += 1; let (idx, sub_expr_identifier) = self.pop_enter_mark(); @@ -833,7 +721,6 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { self.visit_stack.push(VisitRecord::ExprItem(desc.clone())); let data_type = expr.get_type(&self.input_schema)?; - print!("data type is {:?}", data_type); self.expr_set .entry(desc) .or_insert_with(|| (expr.clone(), 0, data_type)) @@ -887,10 +774,6 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> { // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate // the `id_array`, which records the expr's identifier used to rewrite expr. So if we // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too. - println!( - "****************** \n expr is {:?} \n ****************** \n ", - expr - ); if expr.short_circuits() || is_volatile_expression(&expr)? { return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump)); } @@ -963,11 +846,6 @@ fn replace_common_expr( expr_set: &ExprSet, affected_id: &mut BTreeSet, ) -> Result { - println!("************ \n expr are {:?} \n ************", expr); - println!( - "************ \n id_array are {:?} \n ************", - id_array - ); let rewrited = expr .rewrite(&mut CommonSubexprRewriter { expr_set, @@ -977,10 +855,6 @@ fn replace_common_expr( curr_index: 0, }) .data(); - println!( - "************ \n rewrited values are {:?} \n ************", - rewrited - ); rewrited } @@ -989,7 +863,12 @@ struct ProjectionAdder { depth: u128, data_type_map: HashMap, } - +pub fn is_not_complex(op: &Operator) -> bool { + matches!( + op, + &Operator::Eq | &Operator::NotEq | &Operator::And | &Operator::Lt | &Operator::Gt + ) +} impl ProjectionAdder { // TODO: adding more expressions for sub query, currently only support for Simple Binary Expressions fn get_complex_expressions( @@ -998,35 +877,32 @@ impl ProjectionAdder { ) -> (HashSet, HashMap) { let mut res = HashSet::new(); let mut expr_data_type: HashMap = HashMap::new(); - println!( - "********** \n current schema is {:?} \n ******** \n", - schema - ); for expr in exprs { - println!("current expr is {:?}", expr); match expr { Expr::BinaryExpr(BinaryExpr { left: ref l_box, - op: _, + op, right: ref r_box, - }) => match (&**l_box, &**r_box) { - (Expr::Column(l), Expr::Column(r)) => { - let l_field = schema - .field_from_column(l) - .expect("Field not found for left column"); - - // res.insert(DFField::new_unqualified( - // &expr.to_string(), - // l_field.data_type().clone(), - // true, - // )); - expr_data_type - .entry(expr.clone()) - .or_insert(l_field.data_type().clone()); - res.insert(expr.clone()); + }) if !is_not_complex(&op) => { + match (&**l_box, &**r_box) { + (Expr::Column(l), Expr::Column(_r)) => { + let l_field = schema + .field_from_column(l) + .expect("Field not found for left column"); + + // res.insert(DFField::new_unqualified( + // &expr.to_string(), + // l_field.data_type().clone(), + // true, + // )); + expr_data_type + .entry(expr.clone()) + .or_insert(l_field.data_type().clone()); + res.insert(expr.clone()); + } + _ => {} } - _ => {} - }, + } Expr::Cast(Cast { expr, data_type }) => { let (expr_set, type_map) = Self::get_complex_expressions(vec![*expr], schema.clone()); @@ -1055,27 +931,18 @@ impl TreeNodeRewriter for ProjectionAdder { self.depth += 1; // extract all expressions + check whether it contains in depth_sets let exprs = node.expressions(); - println!("********* \n cur plan is {:?} \n *********** \n", node); - println!("********* \n exprs are {:?} \n *********** \n", exprs); let depth_set = self.depth_map.entry(self.depth).or_default(); - println!("self.input schema is {:?}", node.schema()); let mut schema = node.schema().deref().clone(); for ip in node.inputs() { schema.merge(ip.schema()); } let (extended_set, data_map) = Self::get_complex_expressions(exprs, Arc::new(schema)); - println!( - "*********** \n extened set is {:?} \n ********** \n", - extended_set - ); depth_set.extend(extended_set); self.data_type_map.extend(data_map); Ok(Transformed::no(node)) } fn f_up(&mut self, node: Self::Node) -> Result> { - println!("*********** \n cur plan is {:?} \n*************\n", node); - let current_depth_schema = self.depth_map.get(&self.depth).cloned().unwrap_or_default(); @@ -1087,26 +954,11 @@ impl TreeNodeRewriter for ProjectionAdder { .fold(current_depth_schema, |acc, (_, expr)| { acc.intersection(expr).cloned().collect() }); - println!( - "******** \n intersected parts are {:?} \n ***********", - added_expr - ); - println!( - "******** \n depth map is {:?} \n ***********", - self.depth_map - ); - println!( - "************ \n data type map is {:?} \n *********** \n", - self.data_type_map - ); self.depth -= 1; // do not do extra things if added_expr.is_empty() { return Ok(Transformed::no(node)); } - - println!("\n*************\n{:?}\n*************\n", added_expr); - match node { // do not add for Projections LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) => { @@ -1138,16 +990,11 @@ impl TreeNodeRewriter for ProjectionAdder { project_exprs.push(Expr::Column(field.qualified_column())); } } - println!( - "************* \n project expressions are {:?} \n ********** \n", - project_exprs - ); // adding new plan here let new_plan = LogicalPlan::Projection(Projection::try_new( project_exprs, Arc::new(node.inputs()[0].clone()), )?); - println!("new plan is {:?}", new_plan); Ok(Transformed::yes( node.with_new_exprs(node.expressions(), [new_plan].to_vec())?, )) diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index ce8bcbc132d63..64e5075b4ca03 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -26,15 +26,16 @@ use std::collections::HashSet; use std::sync::Arc; +use crate::common_subexpr_eliminate::is_not_complex; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::SchemaRef; use datafusion_common::{ - get_required_group_by_exprs_indices, Column, DFField, DFSchema, DFSchemaRef, - JoinType, Result, + get_required_group_by_exprs_indices, Column, DFSchemaRef, JoinType, Result, }; use datafusion_expr::expr::{Alias, ScalarFunction}; + use datafusion_expr::{ logical_plan::LogicalPlan, projection_schema, Aggregate, BinaryExpr, Cast, Distinct, Expr, Projection, TableScan, Window, @@ -71,10 +72,6 @@ impl OptimizerRule for OptimizeProjections { ) -> Result> { // All output fields are necessary: let indices = (0..plan.schema().fields().len()).collect::>(); - println!( - "\n ************ \n plan.schema is {:?} \n ************ \n", - plan.schema() - ); optimize_projections(plan, config, &indices) } @@ -111,8 +108,6 @@ fn optimize_projections( config: &dyn OptimizerConfig, indices: &[usize], ) -> Result> { - println!("**********\n cur plan is {:?} \n ********* \n", plan); - println!("**********\n indices is {:?} \n ********* \n", indices); // `child_required_indices` stores // - indices of the columns required for each child // - a flag indicating whether putting a projection above children is beneficial for the parent. @@ -284,10 +279,6 @@ fn optimize_projections( // Only use window expressions that are absolutely necessary according // to parent requirements: let new_window_expr = get_at_indices(&window.window_expr, &window_reqs); - println!( - "************ \n new_window_expr is {:?} \n *********** \n", - new_window_expr - ); // Get all the required column indices at the input, either by the // parent or window expression requirements. let required_indices = get_all_required_indices( @@ -302,10 +293,6 @@ fn optimize_projections( } else { window.input.as_ref().clone() }; - println!( - "************ \n window child is {:?} *********** \n", - window_child - ); return if new_window_expr.is_empty() { // When no window expression is necessary, use the input directly: Ok(Some(window_child)) @@ -315,16 +302,8 @@ fn optimize_projections( // refers to `old_child`. let required_exprs = get_required_exprs(window.input.schema(), &required_indices); - println!( - "\n ************** \n required exprs {:?} \n ***********\n", - required_exprs - ); let (window_child, _) = add_projection_on_top_if_helpful(window_child, required_exprs)?; - println!( - "\n ************** \nwindow child is {:?} \n ************ \n", - window_child - ); Window::try_new(new_window_expr, Arc::new(window_child)) .map(|window| Some(LogicalPlan::Window(window))) }; @@ -443,7 +422,6 @@ where /// - `Ok(None)`: Signals that merge is not beneficial (and has not taken place). /// - `Err(error)`: An error occured during the function call. fn merge_consecutive_projections(proj: &Projection) -> Result> { - println!("******** \n proj is {:?} \n ********** \n", proj); let LogicalPlan::Projection(prev_projection) = proj.input.as_ref() else { return Ok(None); }; @@ -684,10 +662,6 @@ fn indices_referred_by_exprs<'a>( input_schema: &DFSchemaRef, exprs: impl Iterator, ) -> Result> { - println!( - "************ \n current schema is {:?} \n *********** \n", - input_schema - ); let indices = exprs .map(|expr| indices_referred_by_expr(input_schema, expr)) .collect::>>()?; @@ -725,7 +699,7 @@ fn indices_referred_by_expr( .flat_map(|col| input_schema.index_of_column(col)) .collect(); match expr { - Expr::BinaryExpr(_) => { + Expr::BinaryExpr(BinaryExpr { op, .. }) if is_not_complex(op) => { if let Some(index) = input_schema.index_of_column_by_name(None, &expr.to_string())? { @@ -796,7 +770,6 @@ fn get_all_required_indices<'a>( /// /// A vector of expressions corresponding to specified indices. fn get_at_indices(exprs: &[Expr], indices: &[usize]) -> Vec { - println!("************* \n expr is {:?} \n ************ \n", exprs); indices .iter() // Indices may point to further places than `exprs` len. @@ -925,35 +898,17 @@ fn rewrite_projection_given_requirements( config: &dyn OptimizerConfig, indices: &[usize], ) -> Result> { - println!("************ \n proj is {:?} \n *********** \n", proj); let exprs_used = get_at_indices(&proj.expr, indices); - println!( - "************ \n exprs_used is {:?} \n *********** \n", - exprs_used - ); let required_indices = indices_referred_by_exprs(proj.input.schema(), exprs_used.iter())?; - println!( - "************ \n required_indices is {:?} \n *********** \n", - required_indices - ); return if let Some(input) = optimize_projections(&proj.input, config, &required_indices)? { if is_projection_unnecessary(&input, &exprs_used)? { Ok(Some(input)) } else { - println!( - "************ \n exprs_used is {:?} \n *********** \n", - exprs_used - ); - println!("************ \n input is {:?} \n *********** \n", input); let res = Projection::try_new(exprs_used, Arc::new(input)) .map(|proj| Some(LogicalPlan::Projection(proj))); - println!( - "************ \n new expressions is {:?} \n *********** \n", - res - ); res } } else if exprs_used.len() < proj.expr.len() { @@ -976,11 +931,6 @@ fn rewrite_projection_given_requirements( /// - input schema of the projection, output schema of the projection are same, and /// - all projection expressions are either Column or Literal fn is_projection_unnecessary(input: &LogicalPlan, proj_exprs: &[Expr]) -> Result { - println!( - "*************** \n projections schema is {:?} \n input schema is {:?} \n ********** \n", - &projection_schema(input, proj_exprs)?, - input.schema() - ); Ok(&projection_schema(input, proj_exprs)? == input.schema() && proj_exprs.iter().all(is_expr_trivial)) } From 71b5bd7fb384ade491b45dffb10ec7095d22b44c Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Thu, 21 Mar 2024 21:25:00 -0500 Subject: [PATCH 05/27] remove useless print --- datafusion/core/src/dataframe/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 2c58b54fa6bd9..eea5fc1127ceb 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -2013,7 +2013,6 @@ mod tests { .await? .select_columns(&["c1", "c3"])?; let left_rows = left.clone().collect().await?; - println!("hehehehehehehhehehehhehe"); let right_rows = right.clone().collect().await?; let join = left.join(right, JoinType::Inner, &["c1"], &["c1"], None)?; let join_rows = join.collect().await?; From 30d29a0ec92f982ace1a47bfa5038e0f2c2cc00b Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Thu, 21 Mar 2024 21:25:38 -0500 Subject: [PATCH 06/27] remove print --- datafusion/sql/src/select.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 7cabfa8661dc8..1bfd60a8ce1aa 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -567,8 +567,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Wrap a plan in a projection fn project(&self, input: LogicalPlan, expr: Vec) -> Result { self.validate_schema_satisfies_exprs(input.schema(), &expr)?; - println!("*********** \n input is {:?} \n *********** \n", input); - println!("*********** \n expr is {:?} \n *********** \n", expr); LogicalPlanBuilder::from(input).project(expr)?.build() } From 81523d184b13038b649a2478ed838cc6d85867ae Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Thu, 21 Mar 2024 21:32:11 -0500 Subject: [PATCH 07/27] optimize code --- datafusion/optimizer/src/common_subexpr_eliminate.rs | 9 ++++----- datafusion/optimizer/src/optimize_projections.rs | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index dd78ab5d0b593..b2a96e4515296 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -37,7 +37,6 @@ use datafusion_expr::expr::{Alias, WindowFunction}; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; use datafusion_expr::{col, BinaryExpr, Cast, ExprSchemable}; use hashbrown::HashSet; -use regex_syntax::ast::print; /// A map from expression's identifier to tuple including /// - the expression itself (cloned) @@ -903,14 +902,14 @@ impl ProjectionAdder { _ => {} } } - Expr::Cast(Cast { expr, data_type }) => { - let (expr_set, type_map) = + Expr::Cast(Cast { expr, data_type: _ }) => { + let (expr_set, type_data_map) = Self::get_complex_expressions(vec![*expr], schema.clone()); res.extend(expr_set); - expr_data_type.extend(type_map); + expr_data_type.extend(type_data_map); } - Expr::WindowFunction(WindowFunction { fun, args, .. }) => { + Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { let (expr_set, type_map) = Self::get_complex_expressions(args, schema.clone()); res.extend(expr_set); diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 64e5075b4ca03..3e77ff09f4d4a 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -32,7 +32,7 @@ use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::SchemaRef; use datafusion_common::{ - get_required_group_by_exprs_indices, Column, DFSchemaRef, JoinType, Result, + get_required_group_by_exprs_indices, Column, DFSchema, DFSchemaRef, JoinType, Result, }; use datafusion_expr::expr::{Alias, ScalarFunction}; From 1c4010be3aec3145b2456308e880a077c296e125 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Fri, 22 Mar 2024 09:48:39 -0500 Subject: [PATCH 08/27] optimize code --- .../optimizer/src/common_subexpr_eliminate.rs | 17 +++++++---------- .../optimizer/src/optimize_projections.rs | 7 +++---- .../test_files/project_complex_sub_query.slt | 10 ++++++++++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index b2a96e4515296..f86a5f14730e3 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -360,8 +360,7 @@ impl CommonSubexprEliminate { let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?; let (mut new_expr, new_input) = self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?; - let res_plan = plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]); - res_plan + plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]) } } impl CommonSubexprEliminate { @@ -422,15 +421,13 @@ impl CommonSubexprEliminate { /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the /// fn add_extra_projection(&self, plan: &LogicalPlan) -> Result> { - let res = plan - .clone() + plan.clone() .rewrite(&mut ProjectionAdder { data_type_map: HashMap::new(), depth_map: HashMap::new(), depth: 0, }) - .map(|transformed| Some(transformed.data)); - res + .map(|transformed| Some(transformed.data)) } } impl OptimizerRule for CommonSubexprEliminate { @@ -865,7 +862,7 @@ struct ProjectionAdder { pub fn is_not_complex(op: &Operator) -> bool { matches!( op, - &Operator::Eq | &Operator::NotEq | &Operator::And | &Operator::Lt | &Operator::Gt + &Operator::Eq | &Operator::NotEq | &Operator::Lt | &Operator::Gt | &Operator::And ) } impl ProjectionAdder { @@ -960,9 +957,9 @@ impl TreeNodeRewriter for ProjectionAdder { } match node { // do not add for Projections - LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) => { - Ok(Transformed::no(node)) - } + LogicalPlan::Projection(_) + | LogicalPlan::TableScan(_) + | LogicalPlan::Join(_) => Ok(Transformed::no(node)), _ => { // avoid recursive add projections if added_expr.iter().any(|expr| { diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 3e77ff09f4d4a..e46c937b3f2ef 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -699,7 +699,7 @@ fn indices_referred_by_expr( .flat_map(|col| input_schema.index_of_column(col)) .collect(); match expr { - Expr::BinaryExpr(BinaryExpr { op, .. }) if is_not_complex(op) => { + Expr::BinaryExpr(BinaryExpr { op, .. }) if !is_not_complex(op) => { if let Some(index) = input_schema.index_of_column_by_name(None, &expr.to_string())? { @@ -907,9 +907,8 @@ fn rewrite_projection_given_requirements( if is_projection_unnecessary(&input, &exprs_used)? { Ok(Some(input)) } else { - let res = Projection::try_new(exprs_used, Arc::new(input)) - .map(|proj| Some(LogicalPlan::Projection(proj))); - res + Projection::try_new(exprs_used, Arc::new(input)) + .map(|proj| Some(LogicalPlan::Projection(proj))) } } else if exprs_used.len() < proj.expr.len() { // Projection expression used is different than the existing projection. diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index e778d3d0cebbe..accc4b9abdbae 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -50,3 +50,13 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] ----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4 ------TableScan: t projection=[c3, c4] + +query TT +explain SELECT c3-c4, SUM(c3-c4) OVER(order by c3-c4) +FROM t; +---- +logical_plan +Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW +--WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4 +------TableScan: t projection=[c3, c4] From 4add9cb5540bbf80dd24ae2c06d8e4dfdcafa2ef Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Fri, 22 Mar 2024 09:50:32 -0500 Subject: [PATCH 09/27] adding test --- .../test_files/project_complex_sub_query.slt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index accc4b9abdbae..917d165abbb8c 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -60,3 +60,13 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] ----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4 ------TableScan: t projection=[c3, c4] + +query TT +explain SELECT c3-c4, SUM(c3-c4) OVER() +FROM t; +---- +logical_plan +Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +--WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4 +------TableScan: t projection=[c3, c4] From 251c48d55cc81297e1bc4b97caf623d91cabb9f6 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Fri, 22 Mar 2024 10:17:47 -0500 Subject: [PATCH 10/27] fix clippy --- .../optimizer/src/common_subexpr_eliminate.rs | 52 ++++++++----------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index f86a5f14730e3..57a3519c6b557 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -443,8 +443,7 @@ impl OptimizerRule for CommonSubexprEliminate { Some(plan) => plan, _ => plan.clone(), }; - let res = self.add_extra_projection(&plan); - res + self.add_extra_projection(&plan) } fn name(&self) -> &str { @@ -842,16 +841,14 @@ fn replace_common_expr( expr_set: &ExprSet, affected_id: &mut BTreeSet, ) -> Result { - let rewrited = expr - .rewrite(&mut CommonSubexprRewriter { - expr_set, - id_array, - affected_id, - max_series_number: 0, - curr_index: 0, - }) - .data(); - rewrited + expr.rewrite(&mut CommonSubexprRewriter { + expr_set, + id_array, + affected_id, + max_series_number: 0, + curr_index: 0, + }) + .data() } struct ProjectionAdder { @@ -880,23 +877,20 @@ impl ProjectionAdder { op, right: ref r_box, }) if !is_not_complex(&op) => { - match (&**l_box, &**r_box) { - (Expr::Column(l), Expr::Column(_r)) => { - let l_field = schema - .field_from_column(l) - .expect("Field not found for left column"); - - // res.insert(DFField::new_unqualified( - // &expr.to_string(), - // l_field.data_type().clone(), - // true, - // )); - expr_data_type - .entry(expr.clone()) - .or_insert(l_field.data_type().clone()); - res.insert(expr.clone()); - } - _ => {} + if let (Expr::Column(l), Expr::Column(_r)) = (&**l_box, &**r_box) { + let l_field = schema + .field_from_column(l) + .expect("Field not found for left column"); + + // res.insert(DFField::new_unqualified( + // &expr.to_string(), + // l_field.data_type().clone(), + // true, + // )); + expr_data_type + .entry(expr.clone()) + .or_insert(l_field.data_type().clone()); + res.insert(expr.clone()); } } Expr::Cast(Cast { expr, data_type: _ }) => { From 117e49067a8a734d7a671de4b3197b9e9d1896c0 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Sun, 24 Mar 2024 08:03:02 -0500 Subject: [PATCH 11/27] optimize code --- .../optimizer/src/optimize_projections.rs | 20 ------------------- .../test_files/project_complex_sub_query.slt | 2 +- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index e46c937b3f2ef..8ad773d89e13e 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -714,26 +714,6 @@ fn indices_referred_by_expr( _ => {} } Ok(res_vec) - // match expr { - // Expr::BinaryExpr(_) => { - // if let Some(index) = - // input_schema.index_of_column_by_name(None, &expr.to_string())? - // { - // return Ok(vec![index]); - // } else { - // return Ok(vec![]); - // } - // } - // _ => { - // let mut cols = expr.to_columns()?; - // outer_columns(expr, &mut cols); - // let res_vec: Vec = cols - // .iter() - // .flat_map(|col| input_schema.index_of_column(col)) - // .collect(); - // Ok(res_vec) - // } - // } } /// Gets all required indices for the input; i.e. those required by the parent diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index 917d165abbb8c..ffcb9cc3d70c8 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -29,7 +29,7 @@ c5 INT NOT NULL ) STORED AS CSV WITH HEADER ROW -LOCATION '/Users/xiangyanxin/personal/DATAFUSION/arrow-datafusion/a.csv'; +LOCATION '../core/tests/data/demo.csv'; query TT explain SELECT c3+c4, SUM(c3+c4) OVER() From 113e232ccf5f68a2f6535e84a384c900b2fadbb4 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Tue, 26 Mar 2024 14:48:15 -0500 Subject: [PATCH 12/27] optimize --- .../tests/data/project_complex_expression.csv | 18 ++++++ .../optimizer/src/common_subexpr_eliminate.rs | 62 ++++++++++++++++--- .../test_files/project_complex_sub_query.slt | 2 +- 3 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 datafusion/core/tests/data/project_complex_expression.csv diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv new file mode 100644 index 0000000000000..c002d86970769 --- /dev/null +++ b/datafusion/core/tests/data/project_complex_expression.csv @@ -0,0 +1,18 @@ +c1,c2,c3,c4,c5 +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +16,17,18,19,20 +21,22,23,24,25 +26,27,28,29,30 +31,32,33,34,35 +36,37,38,39,40 +41,42,43,44,45 +46,47,48,49,50 +51,52,53,54,55 +56,57,58,59,60 +61,62,63,64,65 +66,67,68,69,70 +71,72,73,74,75 +76,77,78,79,80 +81,82,83,84,85 \ No newline at end of file diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 57a3519c6b557..393bd799bdf9a 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -438,7 +438,6 @@ impl OptimizerRule for CommonSubexprEliminate { ) -> Result> { let optimized_plan_option = self.common_optimize(plan, config)?; - // println!("optimized plan option is {:?}", optimized_plan_option); let plan = match optimized_plan_option { Some(plan) => plan, _ => plan.clone(), @@ -882,11 +881,6 @@ impl ProjectionAdder { .field_from_column(l) .expect("Field not found for left column"); - // res.insert(DFField::new_unqualified( - // &expr.to_string(), - // l_field.data_type().clone(), - // true, - // )); expr_data_type .entry(expr.clone()) .or_insert(l_field.data_type().clone()); @@ -919,7 +913,7 @@ impl TreeNodeRewriter for ProjectionAdder { fn f_down(&mut self, node: Self::Node) -> Result> { // use depth to trace where we are in the LogicalPlan tree self.depth += 1; - // extract all expressions + check whether it contains in depth_sets + // extract all expressions and check whether it contains in depth_sets let exprs = node.expressions(); let depth_set = self.depth_map.entry(self.depth).or_default(); let mut schema = node.schema().deref().clone(); @@ -936,14 +930,58 @@ impl TreeNodeRewriter for ProjectionAdder { let current_depth_schema = self.depth_map.get(&self.depth).cloned().unwrap_or_default(); - // get the intersected part + // get the intersected part looking up let added_expr = self .depth_map .iter() .filter(|(&depth, _)| depth < self.depth) - .fold(current_depth_schema, |acc, (_, expr)| { + .fold(current_depth_schema.clone(), |acc, (_, expr)| { acc.intersection(expr).cloned().collect() }); + // in projection, we are trying to get intersect with lower one and if some column is not used in upper one, we just abandon them + match node.clone() { + LogicalPlan::Projection(_) => { + // we get the expressions that has intersect with deeper layer since those expressions could be rewrite + let mut cross_expr_deeper = HashSet::new(); + + self.depth_map + .iter() + .filter(|(&depth, _)| depth > self.depth) + .for_each(|(_, exprs)| { + // get intersection + let intersection = current_depth_schema + .intersection(exprs) + .cloned() + .collect::>(); + cross_expr_deeper = + cross_expr_deeper.union(&intersection).cloned().collect(); + }); + let mut project_exprs = vec![]; + if cross_expr_deeper.len() > 0 { + let current_expressions = node.expressions(); + for expr in current_expressions { + if cross_expr_deeper.contains(&expr) { + let f = DFField::new_unqualified( + &expr.to_string(), + self.data_type_map[&expr].clone(), + true, + ); + project_exprs.push(Expr::Column(f.qualified_column())); + cross_expr_deeper.remove(&expr); + } else { + project_exprs.push(expr); + } + } + let new_projection = LogicalPlan::Projection(Projection::try_new( + project_exprs, + Arc::new(node.inputs()[0].clone()), + )?); + self.depth -= 1; + return Ok(Transformed::yes(new_projection)); + } + } + _ => {} + } self.depth -= 1; // do not do extra things if added_expr.is_empty() { @@ -954,6 +992,7 @@ impl TreeNodeRewriter for ProjectionAdder { LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) | LogicalPlan::Join(_) => Ok(Transformed::no(node)), + _ => { // avoid recursive add projections if added_expr.iter().any(|expr| { @@ -966,6 +1005,7 @@ impl TreeNodeRewriter for ProjectionAdder { let mut field_set = HashSet::new(); let mut project_exprs = vec![]; + // here we can deduce that for expr in added_expr { let f = DFField::new_unqualified( &expr.to_string(), @@ -1005,7 +1045,7 @@ mod test { }; use datafusion_expr::{ grouping_set, AccumulatorFactoryFunction, AggregateUDF, Signature, - SimpleAggregateUDF, Volatility, + SimpleAggregateUDF, TableScan, Volatility, }; use crate::optimizer::OptimizerContext; @@ -1528,4 +1568,6 @@ mod test { assert!(result.len() == 1); Ok(()) } + #[test] + fn test_extra_projection_for_upper() -> Result<()> {} } diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index ffcb9cc3d70c8..c57bf88d2e4a4 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -29,7 +29,7 @@ c5 INT NOT NULL ) STORED AS CSV WITH HEADER ROW -LOCATION '../core/tests/data/demo.csv'; +LOCATION '../core/tests/data/project_complex_expressions.csv'; query TT explain SELECT c3+c4, SUM(c3+c4) OVER() From d4ac68fb14c10df4cfb92340c0964479ef662e7a Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Tue, 26 Mar 2024 14:58:06 -0500 Subject: [PATCH 13/27] optimize --- .../tests/data/project_complex_expression.csv | 14 +---- .../optimizer/src/common_subexpr_eliminate.rs | 2 - .../test_files/project_complex_sub_query.slt | 54 ++++++++++++++++++- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv index c002d86970769..c78f35a485cd0 100644 --- a/datafusion/core/tests/data/project_complex_expression.csv +++ b/datafusion/core/tests/data/project_complex_expression.csv @@ -3,16 +3,4 @@ c1,c2,c3,c4,c5 6,7,8,9,10 11,12,13,14,15 16,17,18,19,20 -21,22,23,24,25 -26,27,28,29,30 -31,32,33,34,35 -36,37,38,39,40 -41,42,43,44,45 -46,47,48,49,50 -51,52,53,54,55 -56,57,58,59,60 -61,62,63,64,65 -66,67,68,69,70 -71,72,73,74,75 -76,77,78,79,80 -81,82,83,84,85 \ No newline at end of file +21,22,23,24,25 \ No newline at end of file diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 393bd799bdf9a..4db43c3f3f61f 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1568,6 +1568,4 @@ mod test { assert!(result.len() == 1); Ok(()) } - #[test] - fn test_extra_projection_for_upper() -> Result<()> {} } diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index c57bf88d2e4a4..5ba1f493fa379 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -29,7 +29,7 @@ c5 INT NOT NULL ) STORED AS CSV WITH HEADER ROW -LOCATION '../core/tests/data/project_complex_expressions.csv'; +LOCATION '../core/tests/data/project_complex_expression.csv'; query TT explain SELECT c3+c4, SUM(c3+c4) OVER() @@ -40,6 +40,13 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND U --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ----Projection: t.c3 + t.c4 AS t.c3 + t.c4, t.c3, t.c4 ------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[t.c3 + t.c4@0 as t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] +--WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +----CoalescePartitionsExec +------ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4, c3@0 as c3, c4@1 as c4] +--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query TT explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) @@ -50,6 +57,15 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] ----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4 ------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[c3@1 + c4@2 as t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] +--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----BoundedWindowAggExec: wdw=[SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +------SortPreservingMergeExec: [t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST] +--------SortExec: expr=[t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST] +----------ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4] +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query TT explain SELECT c3-c4, SUM(c3-c4) OVER(order by c3-c4) @@ -60,6 +76,15 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] ----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4 ------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[c3@1 - c4@2 as t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] +--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----BoundedWindowAggExec: wdw=[SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +------SortPreservingMergeExec: [t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST] +--------SortExec: expr=[t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST] +----------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4] +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query TT explain SELECT c3-c4, SUM(c3-c4) OVER() @@ -70,3 +95,30 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND U --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4 ------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[t.c3 - t.c4@0 as t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] +--WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +----CoalescePartitionsExec +------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4, c3@0 as c3, c4@1 as c4] +--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true + +query II +SELECT c3+c4, SUM(c3+c4) OVER() +FROM t; +---- +7 135 +17 135 +27 135 +37 135 +47 135 + +query II +SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) +FROM t +---- +7 7 +17 24 +27 51 +37 88 +47 135 From 01b23a3282718562a3ae40aad1c789a2f733c10d Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Tue, 26 Mar 2024 15:25:01 -0500 Subject: [PATCH 14/27] fix clippy --- .../optimizer/src/common_subexpr_eliminate.rs | 73 +++++++++---------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 4db43c3f3f61f..c3790da6162a9 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -939,48 +939,45 @@ impl TreeNodeRewriter for ProjectionAdder { acc.intersection(expr).cloned().collect() }); // in projection, we are trying to get intersect with lower one and if some column is not used in upper one, we just abandon them - match node.clone() { - LogicalPlan::Projection(_) => { - // we get the expressions that has intersect with deeper layer since those expressions could be rewrite - let mut cross_expr_deeper = HashSet::new(); + if let LogicalPlan::Projection(_) = node.clone() { + // we get the expressions that has intersect with deeper layer since those expressions could be rewrite + let mut cross_expr_deeper = HashSet::new(); - self.depth_map - .iter() - .filter(|(&depth, _)| depth > self.depth) - .for_each(|(_, exprs)| { - // get intersection - let intersection = current_depth_schema - .intersection(exprs) - .cloned() - .collect::>(); - cross_expr_deeper = - cross_expr_deeper.union(&intersection).cloned().collect(); - }); - let mut project_exprs = vec![]; - if cross_expr_deeper.len() > 0 { - let current_expressions = node.expressions(); - for expr in current_expressions { - if cross_expr_deeper.contains(&expr) { - let f = DFField::new_unqualified( - &expr.to_string(), - self.data_type_map[&expr].clone(), - true, - ); - project_exprs.push(Expr::Column(f.qualified_column())); - cross_expr_deeper.remove(&expr); - } else { - project_exprs.push(expr); - } + self.depth_map + .iter() + .filter(|(&depth, _)| depth > self.depth) + .for_each(|(_, exprs)| { + // get intersection + let intersection = current_depth_schema + .intersection(exprs) + .cloned() + .collect::>(); + cross_expr_deeper = + cross_expr_deeper.union(&intersection).cloned().collect(); + }); + let mut project_exprs = vec![]; + if !cross_expr_deeper.is_empty() { + let current_expressions = node.expressions(); + for expr in current_expressions { + if cross_expr_deeper.contains(&expr) { + let f = DFField::new_unqualified( + &expr.to_string(), + self.data_type_map[&expr].clone(), + true, + ); + project_exprs.push(Expr::Column(f.qualified_column())); + cross_expr_deeper.remove(&expr); + } else { + project_exprs.push(expr); } - let new_projection = LogicalPlan::Projection(Projection::try_new( - project_exprs, - Arc::new(node.inputs()[0].clone()), - )?); - self.depth -= 1; - return Ok(Transformed::yes(new_projection)); } + let new_projection = LogicalPlan::Projection(Projection::try_new( + project_exprs, + Arc::new(node.inputs()[0].clone()), + )?); + self.depth -= 1; + return Ok(Transformed::yes(new_projection)); } - _ => {} } self.depth -= 1; // do not do extra things From ee962f14dcbfa11680b941fe6cb9fe13b1ea3b67 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Tue, 26 Mar 2024 18:32:43 -0500 Subject: [PATCH 15/27] fix clippy --- datafusion/optimizer/src/common_subexpr_eliminate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index c3790da6162a9..75510bf2a2b92 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1042,7 +1042,7 @@ mod test { }; use datafusion_expr::{ grouping_set, AccumulatorFactoryFunction, AggregateUDF, Signature, - SimpleAggregateUDF, TableScan, Volatility, + SimpleAggregateUDF, Volatility, }; use crate::optimizer::OptimizerContext; From 40869a4d394af27b0a99c4061398d86e8005adec Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 26 Mar 2024 11:22:07 +0300 Subject: [PATCH 16/27] Replace with cached exprs --- .../src/physical_optimizer/enforce_sorting.rs | 77 ++++++ datafusion/core/src/physical_planner.rs | 5 + datafusion/core/tests/data/demo.csv | 2 + .../optimizer/src/common_subexpr_eliminate.rs | 232 ++++++++++++------ .../optimizer/src/optimize_projections.rs | 23 +- .../test_files/project_complex_sub_query.slt | 17 +- 6 files changed, 253 insertions(+), 103 deletions(-) create mode 100644 datafusion/core/tests/data/demo.csv diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 5bf21c3dfab5a..95e4a06b516b3 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -2347,3 +2347,80 @@ mod tests { Ok(()) } } + + +#[cfg(test)] +mod tmp_tests { + use crate::assert_batches_eq; + use crate::physical_plan::{collect, displayable, ExecutionPlan}; + use crate::prelude::SessionContext; + use arrow::util::pretty::print_batches; + use datafusion_common::Result; + use datafusion_execution::config::SessionConfig; + use datafusion_physical_plan::get_plan_string; + use std::sync::Arc; + + fn print_plan(plan: &Arc) -> Result<()> { + let formatted = displayable(plan.as_ref()).indent(true).to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + println!("{:#?}", actual); + Ok(()) + } + + const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE multiple_ordered_table ( + a0 INTEGER, + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER + ) + STORED AS CSV + WITH HEADER ROW + WITH ORDER (a ASC, b ASC) + WITH ORDER (c ASC) + LOCATION '../core/tests/data/window_2.csv'"; + + #[tokio::test] + async fn test_query() -> Result<()> { + let config = SessionConfig::new().with_target_partitions(1); + let ctx = SessionContext::new_with_config(config); + + ctx.sql(MULTIPLE_ORDERED_TABLE).await?; + + let sql = "SELECT a+b, SUM(a+b) OVER() FROM multiple_ordered_table"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let physical_plan = dataframe.create_physical_plan().await?; + print_plan(&physical_plan)?; + let batches = collect(physical_plan.clone(), ctx.task_ctx()).await?; + print_batches(&batches)?; + + let expected = vec![ + "ProjectionExec: expr=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as nth_value1]", + " GlobalLimitExec: skip=0, fetch=5", + " BoundedWindowAggExec: wdw=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: \"NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Sorted]", + " CsvExec: file_groups={1 group: [[Users/akurmustafa/projects/synnada/arrow-datafusion-synnada/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true", + ]; + // Get string representation of the plan + let actual = get_plan_string(&physical_plan); + assert_eq!( + expected, actual, + "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + let expected = [ + "+------------+", + "| nth_value1 |", + "+------------+", + "| 2 |", + "| 2 |", + "| 2 |", + "| 2 |", + "| 2 |", + "+------------+", + ]; + assert_batches_eq!(expected, &batches); + Ok(()) + } +} diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index ca708b05823e7..4334bb617f78c 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -855,6 +855,11 @@ impl DefaultPhysicalPlanner { )?)) } LogicalPlan::Projection(Projection { input, expr, .. }) => { + println!("---------------"); + for expr in expr{ + println!("expr:{:?}", expr); + } + println!("---------------"); let input_exec = self.create_initial_plan(input, session_state).await?; let input_schema = input.as_ref().schema(); diff --git a/datafusion/core/tests/data/demo.csv b/datafusion/core/tests/data/demo.csv new file mode 100644 index 0000000000000..d1492228246f0 --- /dev/null +++ b/datafusion/core/tests/data/demo.csv @@ -0,0 +1,2 @@ +c1,c2,c3,c4,c5 +1,2,3,4,5 diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 57a3519c6b557..ef6b15fca3131 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -123,15 +123,14 @@ impl CommonSubexprEliminate { .iter() .zip(arrays_list.iter()) .map(|(exprs, arrays)| { - let res = exprs + exprs .iter() .cloned() .zip(arrays.iter()) .map(|(expr, id_array)| { replace_common_expr(expr, id_array, expr_set, affected_id) }) - .collect::>>(); - res + .collect::>>() }) .collect::>>() } @@ -424,8 +423,10 @@ impl CommonSubexprEliminate { plan.clone() .rewrite(&mut ProjectionAdder { data_type_map: HashMap::new(), - depth_map: HashMap::new(), + cumulative_expr_map: HashMap::new(), + cumulative_map: HashSet::new(), depth: 0, + should_update_parents: false, }) .map(|transformed| Some(transformed.data)) } @@ -438,7 +439,6 @@ impl OptimizerRule for CommonSubexprEliminate { ) -> Result> { let optimized_plan_option = self.common_optimize(plan, config)?; - // println!("optimized plan option is {:?}", optimized_plan_option); let plan = match optimized_plan_option { Some(plan) => plan, _ => plan.clone(), @@ -476,8 +476,7 @@ fn to_arrays( expr_set: &mut ExprSet, expr_mask: ExprMask, ) -> Result>> { - let res = expr - .iter() + expr.iter() .map(|e| { let mut id_array = vec![]; expr_to_identifier( @@ -490,8 +489,7 @@ fn to_arrays( Ok(id_array) }) - .collect::>>(); - res + .collect::>>() } /// Build the "intermediate" projection plan that evaluates the extracted common expressions. @@ -852,9 +850,11 @@ fn replace_common_expr( } struct ProjectionAdder { - depth_map: HashMap>, + cumulative_expr_map: HashMap>, + cumulative_map: HashSet, depth: u128, data_type_map: HashMap, + should_update_parents: bool, } pub fn is_not_complex(op: &Operator) -> bool { matches!( @@ -882,11 +882,6 @@ impl ProjectionAdder { .field_from_column(l) .expect("Field not found for left column"); - // res.insert(DFField::new_unqualified( - // &expr.to_string(), - // l_field.data_type().clone(), - // true, - // )); expr_data_type .entry(expr.clone()) .or_insert(l_field.data_type().clone()); @@ -917,81 +912,166 @@ impl TreeNodeRewriter for ProjectionAdder { /// currently we just collect the complex bianryOP fn f_down(&mut self, node: Self::Node) -> Result> { - // use depth to trace where we are in the LogicalPlan tree - self.depth += 1; - // extract all expressions + check whether it contains in depth_sets - let exprs = node.expressions(); - let depth_set = self.depth_map.entry(self.depth).or_default(); - let mut schema = node.schema().deref().clone(); - for ip in node.inputs() { - schema.merge(ip.schema()); + let depth_set = &mut self.cumulative_map; + // Insert for other end points + if let LogicalPlan::TableScan(_) = node{ + self.cumulative_expr_map.insert(self.depth, depth_set.clone()); + self.depth += 1; + return Ok(Transformed::no(node)); + } else { + // TODO: Assumes operators doesn't modify name of the fields. + // use depth to trace where we are in the LogicalPlan tree + self.depth += 1; + // extract all expressions + check whether it contains in depth_sets + let exprs = node.expressions(); + // let depth_set = self.cumulative_expr_map.entry(self.depth).or_default(); + let mut schema = node.schema().deref().clone(); + for ip in node.inputs() { + schema.merge(ip.schema()); + } + let (extended_set, data_map) = + Self::get_complex_expressions(exprs, Arc::new(schema)); + depth_set.extend(extended_set); + self.data_type_map.extend(data_map); + Ok(Transformed::no(node)) } - let (extended_set, data_map) = - Self::get_complex_expressions(exprs, Arc::new(schema)); - depth_set.extend(extended_set); - self.data_type_map.extend(data_map); - Ok(Transformed::no(node)) } - fn f_up(&mut self, node: Self::Node) -> Result> { - let current_depth_schema = - self.depth_map.get(&self.depth).cloned().unwrap_or_default(); - // get the intersected part - let added_expr = self - .depth_map - .iter() - .filter(|(&depth, _)| depth < self.depth) - .fold(current_depth_schema, |acc, (_, expr)| { - acc.intersection(expr).cloned().collect() - }); + fn f_up(&mut self, node: Self::Node) -> Result> { + let added_expr = + self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default(); + println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map); + println!("self.cumulative_map: {:?}", self.cumulative_map); + println!("node:{:?}", node); self.depth -= 1; // do not do extra things - if added_expr.is_empty() { + let should_add_projection = !added_expr.is_empty(); + if !should_add_projection && !self.should_update_parents { + println!("not updating node"); return Ok(Transformed::no(node)); } - match node { - // do not add for Projections - LogicalPlan::Projection(_) - | LogicalPlan::TableScan(_) - | LogicalPlan::Join(_) => Ok(Transformed::no(node)), - _ => { - // avoid recursive add projections - if added_expr.iter().any(|expr| { - node.inputs()[0] - .schema() - .has_column_with_unqualified_name(&expr.to_string()) - }) { - return Ok(Transformed::no(node)); + // if added_expr.is_empty() { + // println!("not updating node"); + // return Ok(Transformed::no(node)); + // } + + let child = node.inputs()[0]; + let child = if should_add_projection { + self.should_update_parents = true; + let mut field_set = HashSet::new(); + let mut project_exprs = vec![]; + for expr in &added_expr { + let f = DFField::new_unqualified( + &expr.to_string(), + self.data_type_map[expr].clone(), + true, + ); + field_set.insert(f.name().to_owned()); + project_exprs.push(expr.clone().alias(expr.to_string())); + } + // Do not lose fields in the child. + for field in child.schema().fields() { + if field_set.insert(field.qualified_name()) { + project_exprs.push(Expr::Column(field.qualified_column())); } + } - let mut field_set = HashSet::new(); - let mut project_exprs = vec![]; - for expr in added_expr { - let f = DFField::new_unqualified( - &expr.to_string(), - self.data_type_map[&expr].clone(), - true, - ); - field_set.insert(f.name().to_owned()); - project_exprs.push(expr.clone().alias(expr.to_string())); - } - for field in node.inputs()[0].schema().fields() { - if field_set.insert(field.qualified_name()) { - project_exprs.push(Expr::Column(field.qualified_column())); - } + // adding new plan here + let new_child = LogicalPlan::Projection(Projection::try_new( + project_exprs, + Arc::new(node.inputs()[0].clone()), + )?); + new_child + } else { + child.clone() + }; + let mut expressions = node.expressions(); + // for expr in &added_expr{ + // println!("expr.display_name() {:?}", expr.display_name()?); + // } + println!(" original expressions: {:?}", expressions); + let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::>(); + // Replace expressions with its pre-computed variant if available. + expressions.iter_mut().try_for_each(|expr|{ + update_expr_with_available_columns(expr, &available_columns) + })?; + println!("after update expressions: {:?}", expressions); + + let new_node = node.with_new_exprs(expressions, [child].to_vec())?; + println!("new node: {:?}", new_node); + Ok(Transformed::yes( + new_node, + )) + + // match node { + // // do not add for Projections + // LogicalPlan::Projection(_) + // | LogicalPlan::TableScan(_) + // | LogicalPlan::Join(_) => Ok(Transformed::no(node)), + // _ => { + // // avoid recursive add projections + // if added_expr.iter().any(|expr| { + // node.inputs()[0] + // .schema() + // .has_column_with_unqualified_name(&expr.to_string()) + // }) { + // return Ok(Transformed::no(node)); + // } + // + // let mut field_set = HashSet::new(); + // let mut project_exprs = vec![]; + // for expr in added_expr { + // let f = DFField::new_unqualified( + // &expr.to_string(), + // self.data_type_map[&expr].clone(), + // true, + // ); + // field_set.insert(f.name().to_owned()); + // project_exprs.push(expr.clone().alias(expr.to_string())); + // } + // for field in node.inputs()[0].schema().fields() { + // if field_set.insert(field.qualified_name()) { + // project_exprs.push(Expr::Column(field.qualified_column())); + // } + // } + // // adding new plan here + // let new_plan = LogicalPlan::Projection(Projection::try_new( + // project_exprs, + // Arc::new(node.inputs()[0].clone()), + // )?); + // Ok(Transformed::yes( + // node.with_new_exprs(node.expressions(), [new_plan].to_vec())?, + // )) + // } + // } + } +} + +fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Column]) -> Result<()> { + match expr { + Expr::BinaryExpr(_) => { + for available_col in available_columns{ + // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?); + if available_col.flat_name() == expr.display_name()?{ + println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); + *expr = Expr::Column(available_col.clone()); } - // adding new plan here - let new_plan = LogicalPlan::Projection(Projection::try_new( - project_exprs, - Arc::new(node.inputs()[0].clone()), - )?); - Ok(Transformed::yes( - node.with_new_exprs(node.expressions(), [new_plan].to_vec())?, - )) } + }, + Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { + args.iter_mut().try_for_each(|arg| update_expr_with_available_columns(arg, available_columns))? + }, + Expr::Cast(Cast{ expr, .. }) => { + update_expr_with_available_columns(expr, available_columns)? + } + _ => { + println!("unsupported expression : {:?}", expr); + // cannot rewrite } } + Ok(()) } + #[cfg(test)] mod test { use std::iter; diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 8ad773d89e13e..284ea6673f8b2 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -26,7 +26,6 @@ use std::collections::HashSet; use std::sync::Arc; -use crate::common_subexpr_eliminate::is_not_complex; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; @@ -70,6 +69,8 @@ impl OptimizerRule for OptimizeProjections { plan: &LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { + println!("plan at the start:"); + println!("{:?}", plan); // All output fields are necessary: let indices = (0..plan.schema().fields().len()).collect::>(); optimize_projections(plan, config, &indices) @@ -694,26 +695,10 @@ fn indices_referred_by_expr( // TODO: Support more Expressions let mut cols = expr.to_columns()?; outer_columns(expr, &mut cols); - let mut res_vec: Vec = cols + Ok(cols .iter() .flat_map(|col| input_schema.index_of_column(col)) - .collect(); - match expr { - Expr::BinaryExpr(BinaryExpr { op, .. }) if !is_not_complex(op) => { - if let Some(index) = - input_schema.index_of_column_by_name(None, &expr.to_string())? - { - match res_vec.binary_search(&index) { - Ok(_) => {} - Err(pos) => { - res_vec.insert(pos, index); - } - } - } - } - _ => {} - } - Ok(res_vec) + .collect()) } /// Gets all required indices for the input; i.e. those required by the parent diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index ffcb9cc3d70c8..30a84f1fc397f 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -31,15 +31,17 @@ STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/demo.csv'; +statement ok +set datafusion.explain.logical_plan_only = true; + query TT explain SELECT c3+c4, SUM(c3+c4) OVER() FROM t; ---- logical_plan -Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ---WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -----Projection: t.c3 + t.c4 AS t.c3 + t.c4, t.c3, t.c4 -------TableScan: t projection=[c3, c4] +WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +--Projection: t.c3 + t.c4 AS t.c3 + t.c4 +----TableScan: t projection=[c3, c4] query TT explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) @@ -66,7 +68,6 @@ explain SELECT c3-c4, SUM(c3-c4) OVER() FROM t; ---- logical_plan -Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ---WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4 -------TableScan: t projection=[c3, c4] +WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +--Projection: t.c3 - t.c4 AS t.c3 - t.c4 +----TableScan: t projection=[c3, c4] From 4396682fa15ca4ef111f1d300562af9504d5cdba Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 27 Mar 2024 15:21:41 +0300 Subject: [PATCH 17/27] Minor changes --- datafusion/core/src/physical_planner.rs | 5 ----- .../optimizer/src/common_subexpr_eliminate.rs | 18 +++++++++--------- .../optimizer/src/optimize_projections.rs | 2 -- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 4334bb617f78c..ca708b05823e7 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -855,11 +855,6 @@ impl DefaultPhysicalPlanner { )?)) } LogicalPlan::Projection(Projection { input, expr, .. }) => { - println!("---------------"); - for expr in expr{ - println!("expr:{:?}", expr); - } - println!("---------------"); let input_exec = self.create_initial_plan(input, session_state).await?; let input_schema = input.as_ref().schema(); diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index ef6b15fca3131..3aef3cf703c17 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -940,14 +940,14 @@ impl TreeNodeRewriter for ProjectionAdder { fn f_up(&mut self, node: Self::Node) -> Result> { let added_expr = self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default(); - println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map); - println!("self.cumulative_map: {:?}", self.cumulative_map); - println!("node:{:?}", node); + // println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map); + // println!("self.cumulative_map: {:?}", self.cumulative_map); + // println!("node:{:?}", node); self.depth -= 1; // do not do extra things let should_add_projection = !added_expr.is_empty(); if !should_add_projection && !self.should_update_parents { - println!("not updating node"); + // println!("not updating node"); return Ok(Transformed::no(node)); } // if added_expr.is_empty() { @@ -989,16 +989,16 @@ impl TreeNodeRewriter for ProjectionAdder { // for expr in &added_expr{ // println!("expr.display_name() {:?}", expr.display_name()?); // } - println!(" original expressions: {:?}", expressions); + // println!(" original expressions: {:?}", expressions); let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::>(); // Replace expressions with its pre-computed variant if available. expressions.iter_mut().try_for_each(|expr|{ update_expr_with_available_columns(expr, &available_columns) })?; - println!("after update expressions: {:?}", expressions); + // println!("after update expressions: {:?}", expressions); let new_node = node.with_new_exprs(expressions, [child].to_vec())?; - println!("new node: {:?}", new_node); + // println!("new node: {:?}", new_node); Ok(Transformed::yes( new_node, )) @@ -1053,7 +1053,7 @@ fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Colu for available_col in available_columns{ // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?); if available_col.flat_name() == expr.display_name()?{ - println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); + // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); *expr = Expr::Column(available_col.clone()); } } @@ -1065,7 +1065,7 @@ fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Colu update_expr_with_available_columns(expr, available_columns)? } _ => { - println!("unsupported expression : {:?}", expr); + // println!("unsupported expression : {:?}", expr); // cannot rewrite } } diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 284ea6673f8b2..39dc56d2af93f 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -69,8 +69,6 @@ impl OptimizerRule for OptimizeProjections { plan: &LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { - println!("plan at the start:"); - println!("{:?}", plan); // All output fields are necessary: let indices = (0..plan.schema().fields().len()).collect::>(); optimize_projections(plan, config, &indices) From 83002cefa8f8f19407dcf49f236e74b58d2eb010 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 27 Mar 2024 16:24:32 +0300 Subject: [PATCH 18/27] Minor changes --- .../optimizer/src/common_subexpr_eliminate.rs | 165 +++++++----------- 1 file changed, 64 insertions(+), 101 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 3aef3cf703c17..bab58b5880e70 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -906,6 +906,47 @@ impl ProjectionAdder { } (res, expr_data_type) } + + fn update_expr_with_available_columns(&self, expr: &mut Expr, available_columns: &[Column]) -> Result<()> { + match expr { + Expr::BinaryExpr(_) => { + for available_col in available_columns{ + // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?); + if available_col.flat_name() == expr.display_name()?{ + // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); + *expr = Expr::Column(available_col.clone()); + } + } + }, + Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { + args.iter_mut().try_for_each(|arg| self.update_expr_with_available_columns(arg, available_columns))? + }, + Expr::Cast(Cast{ expr, .. }) => { + self.update_expr_with_available_columns(expr, available_columns)? + } + _ => { + // cannot rewrite + } + } + Ok(()) + } + + + // Assumes operators doesn't modify name of the fields. + // Otherwise this operation is not safe. + fn extend_with_exprs(&mut self, node: &LogicalPlan) { + // use depth to trace where we are in the LogicalPlan tree + // extract all expressions + check whether it contains in depth_sets + let exprs = node.expressions(); + let mut schema = node.schema().deref().clone(); + for ip in node.inputs() { + schema.merge(ip.schema()); + } + let (extended_set, data_map) = + Self::get_complex_expressions(exprs, Arc::new(schema)); + self.cumulative_map.extend(extended_set); + self.data_type_map.extend(data_map); + } } impl TreeNodeRewriter for ProjectionAdder { type Node = LogicalPlan; @@ -914,35 +955,35 @@ impl TreeNodeRewriter for ProjectionAdder { fn f_down(&mut self, node: Self::Node) -> Result> { let depth_set = &mut self.cumulative_map; // Insert for other end points - if let LogicalPlan::TableScan(_) = node{ - self.cumulative_expr_map.insert(self.depth, depth_set.clone()); - self.depth += 1; - return Ok(Transformed::no(node)); - } else { - // TODO: Assumes operators doesn't modify name of the fields. - // use depth to trace where we are in the LogicalPlan tree - self.depth += 1; - // extract all expressions + check whether it contains in depth_sets - let exprs = node.expressions(); - // let depth_set = self.cumulative_expr_map.entry(self.depth).or_default(); - let mut schema = node.schema().deref().clone(); - for ip in node.inputs() { - schema.merge(ip.schema()); + self.depth += 1; + match node { + LogicalPlan::TableScan(_) => { + self.cumulative_expr_map.insert(self.depth-1, depth_set.clone()); + return Ok(Transformed::no(node)); + }, + LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => { + self.extend_with_exprs(&node); + Ok(Transformed::no(node)) + } + LogicalPlan::Projection(_) => { + if depth_set.is_empty(){ + self.extend_with_exprs(&node); + } else { + self.cumulative_expr_map.insert(self.depth-1, depth_set.clone()); + } + Ok(Transformed::no(node)) + }, + _ => { + // Unsupported operators + self.cumulative_map.clear(); + Ok(Transformed::no(node)) } - let (extended_set, data_map) = - Self::get_complex_expressions(exprs, Arc::new(schema)); - depth_set.extend(extended_set); - self.data_type_map.extend(data_map); - Ok(Transformed::no(node)) } } fn f_up(&mut self, node: Self::Node) -> Result> { let added_expr = self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default(); - // println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map); - // println!("self.cumulative_map: {:?}", self.cumulative_map); - // println!("node:{:?}", node); self.depth -= 1; // do not do extra things let should_add_projection = !added_expr.is_empty(); @@ -950,10 +991,6 @@ impl TreeNodeRewriter for ProjectionAdder { // println!("not updating node"); return Ok(Transformed::no(node)); } - // if added_expr.is_empty() { - // println!("not updating node"); - // return Ok(Transformed::no(node)); - // } let child = node.inputs()[0]; let child = if should_add_projection { @@ -986,92 +1023,18 @@ impl TreeNodeRewriter for ProjectionAdder { child.clone() }; let mut expressions = node.expressions(); - // for expr in &added_expr{ - // println!("expr.display_name() {:?}", expr.display_name()?); - // } - // println!(" original expressions: {:?}", expressions); let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::>(); // Replace expressions with its pre-computed variant if available. expressions.iter_mut().try_for_each(|expr|{ - update_expr_with_available_columns(expr, &available_columns) + self.update_expr_with_available_columns(expr, &available_columns) })?; - // println!("after update expressions: {:?}", expressions); let new_node = node.with_new_exprs(expressions, [child].to_vec())?; - // println!("new node: {:?}", new_node); Ok(Transformed::yes( new_node, )) - - // match node { - // // do not add for Projections - // LogicalPlan::Projection(_) - // | LogicalPlan::TableScan(_) - // | LogicalPlan::Join(_) => Ok(Transformed::no(node)), - // _ => { - // // avoid recursive add projections - // if added_expr.iter().any(|expr| { - // node.inputs()[0] - // .schema() - // .has_column_with_unqualified_name(&expr.to_string()) - // }) { - // return Ok(Transformed::no(node)); - // } - // - // let mut field_set = HashSet::new(); - // let mut project_exprs = vec![]; - // for expr in added_expr { - // let f = DFField::new_unqualified( - // &expr.to_string(), - // self.data_type_map[&expr].clone(), - // true, - // ); - // field_set.insert(f.name().to_owned()); - // project_exprs.push(expr.clone().alias(expr.to_string())); - // } - // for field in node.inputs()[0].schema().fields() { - // if field_set.insert(field.qualified_name()) { - // project_exprs.push(Expr::Column(field.qualified_column())); - // } - // } - // // adding new plan here - // let new_plan = LogicalPlan::Projection(Projection::try_new( - // project_exprs, - // Arc::new(node.inputs()[0].clone()), - // )?); - // Ok(Transformed::yes( - // node.with_new_exprs(node.expressions(), [new_plan].to_vec())?, - // )) - // } - // } } } - -fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Column]) -> Result<()> { - match expr { - Expr::BinaryExpr(_) => { - for available_col in available_columns{ - // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?); - if available_col.flat_name() == expr.display_name()?{ - // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); - *expr = Expr::Column(available_col.clone()); - } - } - }, - Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { - args.iter_mut().try_for_each(|arg| update_expr_with_available_columns(arg, available_columns))? - }, - Expr::Cast(Cast{ expr, .. }) => { - update_expr_with_available_columns(expr, available_columns)? - } - _ => { - // println!("unsupported expression : {:?}", expr); - // cannot rewrite - } - } - Ok(()) -} - #[cfg(test)] mod test { use std::iter; From b77efabd7ddba2540e2b22e6549686175c933a73 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 11:24:07 +0300 Subject: [PATCH 19/27] Tmp --- .../src/physical_optimizer/enforce_sorting.rs | 25 ++++---- .../tests/data/project_complex_expression.csv | 6 ++ .../optimizer/src/common_subexpr_eliminate.rs | 27 +++++++- .../optimizer/src/optimize_projections.rs | 1 + .../test_files/project_complex_sub_query.slt | 63 ++++++++++++++++--- 5 files changed, 100 insertions(+), 22 deletions(-) create mode 100644 datafusion/core/tests/data/project_complex_expression.csv diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 95e4a06b516b3..1c4a453d2c252 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -2367,18 +2367,16 @@ mod tmp_tests { Ok(()) } - const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE multiple_ordered_table ( - a0 INTEGER, - a INTEGER, - b INTEGER, - c INTEGER, - d INTEGER - ) - STORED AS CSV - WITH HEADER ROW - WITH ORDER (a ASC, b ASC) - WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv'"; + const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE t ( +c1 INT NOT NULL, +c2 INT NOT NULL, +c3 INT NOT NULL, +c4 INT NOT NULL, +c5 INT NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '../core/tests/data/project_complex_expression.csv'"; #[tokio::test] async fn test_query() -> Result<()> { @@ -2387,7 +2385,8 @@ mod tmp_tests { ctx.sql(MULTIPLE_ORDERED_TABLE).await?; - let sql = "SELECT a+b, SUM(a+b) OVER() FROM multiple_ordered_table"; + let sql = "SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) +FROM t"; let msg = format!("Creating logical plan for '{sql}'"); let dataframe = ctx.sql(sql).await.expect(&msg); diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv new file mode 100644 index 0000000000000..f37f11cc7f1ca --- /dev/null +++ b/datafusion/core/tests/data/project_complex_expression.csv @@ -0,0 +1,6 @@ +c1,c2,c3,c4,c5 +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +16,17,18,19,20 +21,22,23,24,25 diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index bab58b5880e70..987618513642c 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -893,8 +893,13 @@ impl ProjectionAdder { Self::get_complex_expressions(vec![*expr], schema.clone()); res.extend(expr_set); expr_data_type.extend(type_data_map); + }, + Expr::Alias(Alias{expr, relation: _, name: _}) => { + let (expr_set, type_data_map) = + Self::get_complex_expressions(vec![*expr], schema.clone()); + res.extend(expr_set); + expr_data_type.extend(type_data_map); } - Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { let (expr_set, type_map) = Self::get_complex_expressions(args, schema.clone()); @@ -923,14 +928,31 @@ impl ProjectionAdder { }, Expr::Cast(Cast{ expr, .. }) => { self.update_expr_with_available_columns(expr, available_columns)? + }, + Expr::Alias(alias) => { + self.update_expr_with_available_columns(&mut alias.expr, available_columns)?; } _ => { // cannot rewrite } } + Self::trim_expr(expr)?; Ok(()) } + fn trim_expr(expr: &mut Expr) -> Result<()> { + let orig_name = expr.display_name()?; + match expr { + Expr::Alias(alias) => { + Self::trim_expr(&mut alias.expr)?; + if orig_name == alias.expr.display_name()?{ + *expr = *alias.expr.clone(); + } + Ok(()) + } + _ => Ok(()), + } + } // Assumes operators doesn't modify name of the fields. // Otherwise this operation is not safe. @@ -966,6 +988,7 @@ impl TreeNodeRewriter for ProjectionAdder { Ok(Transformed::no(node)) } LogicalPlan::Projection(_) => { + // println!("proj node exprs: {:?}", node.expressions()); if depth_set.is_empty(){ self.extend_with_exprs(&node); } else { @@ -988,7 +1011,7 @@ impl TreeNodeRewriter for ProjectionAdder { // do not do extra things let should_add_projection = !added_expr.is_empty(); if !should_add_projection && !self.should_update_parents { - // println!("not updating node"); + // clearln!("not updating node"); return Ok(Transformed::no(node)); } diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 39dc56d2af93f..9201d0a5e9b39 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -174,6 +174,7 @@ fn optimize_projections( return Ok(None); } LogicalPlan::Projection(proj) => { + // println!("expr:{:?}", proj.expr); return if let Some(proj) = merge_consecutive_projections(proj)? { Ok(Some( rewrite_projection_given_requirements(&proj, config, indices)? diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index 30a84f1fc397f..06bfaf5289219 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -29,10 +29,7 @@ c5 INT NOT NULL ) STORED AS CSV WITH HEADER ROW -LOCATION '../core/tests/data/demo.csv'; - -statement ok -set datafusion.explain.logical_plan_only = true; +LOCATION '../core/tests/data/project_complex_expression.csv'; query TT explain SELECT c3+c4, SUM(c3+c4) OVER() @@ -42,6 +39,12 @@ logical_plan WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] --Projection: t.c3 + t.c4 AS t.c3 + t.c4 ----TableScan: t projection=[c3, c4] +physical_plan +WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +--CoalescePartitionsExec +----ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4] +------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query TT explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) @@ -52,6 +55,15 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] ----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4 ------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[c3@1 + c4@2 as t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] +--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----BoundedWindowAggExec: wdw=[SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +------SortPreservingMergeExec: [t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST] +--------SortExec: expr=[t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST] +----------ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4] +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query TT explain SELECT c3-c4, SUM(c3-c4) OVER(order by c3-c4) @@ -62,12 +74,49 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] ----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4 ------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[c3@1 - c4@2 as t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] +--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----BoundedWindowAggExec: wdw=[SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +------SortPreservingMergeExec: [t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST] +--------SortExec: expr=[t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST] +----------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4] +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query TT explain SELECT c3-c4, SUM(c3-c4) OVER() FROM t; ---- logical_plan -WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ---Projection: t.c3 - t.c4 AS t.c3 - t.c4 -----TableScan: t projection=[c3, c4] +Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +--WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4 +------TableScan: t projection=[c3, c4] +physical_plan +ProjectionExec: expr=[t.c3 - t.c4@0 as t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] +--WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +----CoalescePartitionsExec +------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4, c3@0 as c3, c4@1 as c4] +--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true + +query II +SELECT c3+c4, SUM(c3+c4) OVER() +FROM t; +---- +7 135 +17 135 +27 135 +37 135 +47 135 + +query II +SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) +FROM t +---- +7 7 +17 24 +27 51 +37 88 +47 135 From 59e0385dc0f8ac47d6a2a0e6d9205eb85e5b75ca Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 13:25:53 +0300 Subject: [PATCH 20/27] slt tests pass --- .../src/physical_optimizer/enforce_sorting.rs | 3 +- datafusion/expr/src/expr.rs | 15 +- .../optimizer/src/common_subexpr_eliminate.rs | 186 +++++++++++------- datafusion/sqllogictest/test_files/insert.slt | 2 +- .../test_files/project_complex_sub_query.slt | 20 +- .../sqllogictest/test_files/subquery.slt | 2 +- 6 files changed, 137 insertions(+), 91 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 1c4a453d2c252..63f35b70b4bfc 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -2348,7 +2348,6 @@ mod tests { } } - #[cfg(test)] mod tmp_tests { use crate::assert_batches_eq; @@ -2385,7 +2384,7 @@ LOCATION '../core/tests/data/project_complex_expression.csv'"; ctx.sql(MULTIPLE_ORDERED_TABLE).await?; - let sql = "SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4) + let sql = "SELECT c3+c4, SUM(c3+c4) OVER() FROM t"; let msg = format!("Creating logical plan for '{sql}'"); diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 7ede4cd8ffc9e..2e4930a1a0cb3 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1866,8 +1866,19 @@ fn create_name(e: &Expr) -> Result { Ok(format!("{expr} BETWEEN {low} AND {high}")) } } - Expr::Sort { .. } => { - internal_err!("Create name does not support sort expression") + Expr::Sort(Sort { + expr, + asc, + nulls_first, + }) => { + let dir = if *asc { "ASC" } else { "DESC" }; + let nulls = if *nulls_first { + "NULLS_FIRST" + } else { + "NULLS_LAST" + }; + Ok(format!("{expr} {dir} {nulls}")) + // internal_err!("Create name does not support sort expression") } Expr::Wildcard { qualifier } => match qualifier { Some(qualifier) => internal_err!( diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 987618513642c..51e704d30ecc9 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -420,15 +420,18 @@ impl CommonSubexprEliminate { /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the /// fn add_extra_projection(&self, plan: &LogicalPlan) -> Result> { - plan.clone() + // println!("plan at the start: {:?}", plan); + let res = plan + .clone() .rewrite(&mut ProjectionAdder { - data_type_map: HashMap::new(), - cumulative_expr_map: HashMap::new(), - cumulative_map: HashSet::new(), + insertion_point_map: HashMap::new(), depth: 0, - should_update_parents: false, - }) - .map(|transformed| Some(transformed.data)) + complex_exprs: HashMap::new(), + })? + .data; + // .map(|transformed| Some(transformed.data)) + // println!("plan at the end: {:?}", res); + Ok(Some(res)) } } impl OptimizerRule for CommonSubexprEliminate { @@ -850,11 +853,10 @@ fn replace_common_expr( } struct ProjectionAdder { - cumulative_expr_map: HashMap>, - cumulative_map: HashSet, - depth: u128, - data_type_map: HashMap, - should_update_parents: bool, + insertion_point_map: HashMap>, + depth: usize, + complex_exprs: HashMap, + // should_update_parents: bool, } pub fn is_not_complex(op: &Operator) -> bool { matches!( @@ -862,14 +864,14 @@ pub fn is_not_complex(op: &Operator) -> bool { &Operator::Eq | &Operator::NotEq | &Operator::Lt | &Operator::Gt | &Operator::And ) } + impl ProjectionAdder { // TODO: adding more expressions for sub query, currently only support for Simple Binary Expressions fn get_complex_expressions( exprs: Vec, schema: DFSchemaRef, - ) -> (HashSet, HashMap) { + ) -> HashSet<(Expr, DataType)> { let mut res = HashSet::new(); - let mut expr_data_type: HashMap = HashMap::new(); for expr in exprs { match expr { Expr::BinaryExpr(BinaryExpr { @@ -881,56 +883,62 @@ impl ProjectionAdder { let l_field = schema .field_from_column(l) .expect("Field not found for left column"); - - expr_data_type - .entry(expr.clone()) - .or_insert(l_field.data_type().clone()); - res.insert(expr.clone()); + res.insert((expr.clone(), l_field.data_type().clone())); } } Expr::Cast(Cast { expr, data_type: _ }) => { - let (expr_set, type_data_map) = + let exprs_with_type = Self::get_complex_expressions(vec![*expr], schema.clone()); - res.extend(expr_set); - expr_data_type.extend(type_data_map); - }, - Expr::Alias(Alias{expr, relation: _, name: _}) => { - let (expr_set, type_data_map) = + res.extend(exprs_with_type); + } + Expr::Alias(Alias { + expr, + relation: _, + name: _, + }) => { + let exprs_with_type = Self::get_complex_expressions(vec![*expr], schema.clone()); - res.extend(expr_set); - expr_data_type.extend(type_data_map); + res.extend(exprs_with_type); } Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { - let (expr_set, type_map) = + let exprs_with_type = Self::get_complex_expressions(args, schema.clone()); - res.extend(expr_set); - expr_data_type.extend(type_map); + res.extend(exprs_with_type); } _ => {} } } - (res, expr_data_type) + res } - fn update_expr_with_available_columns(&self, expr: &mut Expr, available_columns: &[Column]) -> Result<()> { + fn update_expr_with_available_columns( + &self, + expr: &mut Expr, + available_columns: &[Column], + ) -> Result<()> { match expr { Expr::BinaryExpr(_) => { - for available_col in available_columns{ + for available_col in available_columns { // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?); - if available_col.flat_name() == expr.display_name()?{ + if available_col.flat_name() == expr.display_name()? { // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); *expr = Expr::Column(available_col.clone()); } } - }, + } Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { - args.iter_mut().try_for_each(|arg| self.update_expr_with_available_columns(arg, available_columns))? - }, - Expr::Cast(Cast{ expr, .. }) => { + args.iter_mut().try_for_each(|arg| { + self.update_expr_with_available_columns(arg, available_columns) + })? + } + Expr::Cast(Cast { expr, .. }) => { self.update_expr_with_available_columns(expr, available_columns)? - }, + } Expr::Alias(alias) => { - self.update_expr_with_available_columns(&mut alias.expr, available_columns)?; + self.update_expr_with_available_columns( + &mut alias.expr, + available_columns, + )?; } _ => { // cannot rewrite @@ -945,7 +953,7 @@ impl ProjectionAdder { match expr { Expr::Alias(alias) => { Self::trim_expr(&mut alias.expr)?; - if orig_name == alias.expr.display_name()?{ + if orig_name == alias.expr.display_name()? { *expr = *alias.expr.clone(); } Ok(()) @@ -960,14 +968,18 @@ impl ProjectionAdder { // use depth to trace where we are in the LogicalPlan tree // extract all expressions + check whether it contains in depth_sets let exprs = node.expressions(); + // println!("extended exprs: {:?}", exprs); let mut schema = node.schema().deref().clone(); for ip in node.inputs() { schema.merge(ip.schema()); } - let (extended_set, data_map) = - Self::get_complex_expressions(exprs, Arc::new(schema)); - self.cumulative_map.extend(extended_set); - self.data_type_map.extend(data_map); + let expr_with_type = Self::get_complex_expressions(exprs, Arc::new(schema)); + for (expr, dtype) in expr_with_type { + let (_, count) = self.complex_exprs.entry(expr).or_insert_with(|| (dtype, 0)); + *count += 1; + } + // self.cumulative_map.extend(extended_set); + // self.data_type_map.extend(data_map); } } impl TreeNodeRewriter for ProjectionAdder { @@ -975,59 +987,72 @@ impl TreeNodeRewriter for ProjectionAdder { /// currently we just collect the complex bianryOP fn f_down(&mut self, node: Self::Node) -> Result> { - let depth_set = &mut self.cumulative_map; // Insert for other end points self.depth += 1; match node { LogicalPlan::TableScan(_) => { - self.cumulative_expr_map.insert(self.depth-1, depth_set.clone()); + self.insertion_point_map + .insert(self.depth - 1, self.complex_exprs.clone()); return Ok(Transformed::no(node)); - }, + } LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => { self.extend_with_exprs(&node); Ok(Transformed::no(node)) } LogicalPlan::Projection(_) => { // println!("proj node exprs: {:?}", node.expressions()); - if depth_set.is_empty(){ + if self.complex_exprs.is_empty() { self.extend_with_exprs(&node); } else { - self.cumulative_expr_map.insert(self.depth-1, depth_set.clone()); + self.insertion_point_map + .insert(self.depth - 1, self.complex_exprs.clone()); } Ok(Transformed::no(node)) - }, + } _ => { // Unsupported operators - self.cumulative_map.clear(); + self.complex_exprs.clear(); Ok(Transformed::no(node)) } } } fn f_up(&mut self, node: Self::Node) -> Result> { - let added_expr = - self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default(); + let cached_exprs = self + .insertion_point_map + .get(&self.depth) + .cloned() + .unwrap_or_default(); self.depth -= 1; // do not do extra things - let should_add_projection = !added_expr.is_empty(); - if !should_add_projection && !self.should_update_parents { - // clearln!("not updating node"); + // println!("---------------------"); + // for (depth, complex_exprs) in &self.insertion_point_map{ + // println!("depth: {:?}, complex_exprs: {:?}", depth, complex_exprs); + // } + // println!("---------------------"); + // let should_add_projection = !cached_exprs.is_empty(); + let should_add_projection = + cached_exprs.iter().any(|(_expr, (_, count))| *count > 1); + // if let LogicalPlan::Projection(projection) = node { + // + // } + + let children = node.inputs(); + if children.len() != 1 { + // Only can rewrite node with single child return Ok(Transformed::no(node)); } - - let child = node.inputs()[0]; + let child = children[0].clone(); let child = if should_add_projection { - self.should_update_parents = true; let mut field_set = HashSet::new(); let mut project_exprs = vec![]; - for expr in &added_expr { - let f = DFField::new_unqualified( - &expr.to_string(), - self.data_type_map[expr].clone(), - true, - ); - field_set.insert(f.name().to_owned()); - project_exprs.push(expr.clone().alias(expr.to_string())); + for (expr, (dtype, count)) in &cached_exprs { + if *count > 1 { + let f = + DFField::new_unqualified(&expr.to_string(), dtype.clone(), true); + field_set.insert(f.name().to_owned()); + project_exprs.push(expr.clone().alias(expr.to_string())); + } } // Do not lose fields in the child. for field in child.schema().fields() { @@ -1043,19 +1068,32 @@ impl TreeNodeRewriter for ProjectionAdder { )?); new_child } else { - child.clone() + child }; let mut expressions = node.expressions(); - let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::>(); + let available_columns = child + .schema() + .fields() + .iter() + .map(|field| field.qualified_column()) + .collect::>(); // Replace expressions with its pre-computed variant if available. - expressions.iter_mut().try_for_each(|expr|{ + // println!("-----------------------"); + // for expr in &expressions{ + // println!("old expr: {:?}", expr); + // } + expressions.iter_mut().try_for_each(|expr| { self.update_expr_with_available_columns(expr, &available_columns) })?; + // for expr in &expressions{ + // println!("new expr: {:?}", expr); + // } + // println!("-----------------------"); + // println!("old node:{:?}", node); let new_node = node.with_new_exprs(expressions, [child].to_vec())?; - Ok(Transformed::yes( - new_node, - )) + // println!("new node:{:?}", new_node); + Ok(Transformed::yes(new_node)) } } #[cfg(test)] diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index fc272326973b1..113e544ad0636 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -169,7 +169,7 @@ ORDER BY c1 ---- logical_plan Dml: op=[Insert Into] table=[table_without_values] ---Projection: a1 AS a1, a2 AS a2 +--Projection: a1, a2 ----Sort: aggregate_test_100.c1 ASC NULLS LAST ------Projection: SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 --------WindowAggr: windowExpr=[[SUM(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, COUNT(UInt8(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index 06bfaf5289219..9f88de66ea3bd 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -37,7 +37,7 @@ FROM t; ---- logical_plan WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ---Projection: t.c3 + t.c4 AS t.c3 + t.c4 +--Projection: t.c3 + t.c4 ----TableScan: t projection=[c3, c4] physical_plan WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] @@ -89,17 +89,15 @@ explain SELECT c3-c4, SUM(c3-c4) OVER() FROM t; ---- logical_plan -Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ---WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] -----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4 -------TableScan: t projection=[c3, c4] +WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +--Projection: t.c3 - t.c4 +----TableScan: t projection=[c3, c4] physical_plan -ProjectionExec: expr=[t.c3 - t.c4@0 as t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] ---WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] -----CoalescePartitionsExec -------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4, c3@0 as c3, c4@1 as c4] ---------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true +WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +--CoalescePartitionsExec +----ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4] +------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true query II SELECT c3+c4, SUM(c3+c4) OVER() diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 4fb94cfab5233..aac64b55b535a 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -449,7 +449,7 @@ query TT explain SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 1) as t2_int from t1 ---- logical_plan -Projection: t1.t1_id, () AS t2_int +Projection: t1.t1_id, () --Subquery: ----Limit: skip=0, fetch=1 ------Projection: t2.t2_int From c1a16f9018af639be57db04e05f766718c5ce134 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 13:29:15 +0300 Subject: [PATCH 21/27] Remove unnecessary test --- .../src/physical_optimizer/enforce_sorting.rs | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 63f35b70b4bfc..5bf21c3dfab5a 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -2347,78 +2347,3 @@ mod tests { Ok(()) } } - -#[cfg(test)] -mod tmp_tests { - use crate::assert_batches_eq; - use crate::physical_plan::{collect, displayable, ExecutionPlan}; - use crate::prelude::SessionContext; - use arrow::util::pretty::print_batches; - use datafusion_common::Result; - use datafusion_execution::config::SessionConfig; - use datafusion_physical_plan::get_plan_string; - use std::sync::Arc; - - fn print_plan(plan: &Arc) -> Result<()> { - let formatted = displayable(plan.as_ref()).indent(true).to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - println!("{:#?}", actual); - Ok(()) - } - - const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE t ( -c1 INT NOT NULL, -c2 INT NOT NULL, -c3 INT NOT NULL, -c4 INT NOT NULL, -c5 INT NOT NULL -) -STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/project_complex_expression.csv'"; - - #[tokio::test] - async fn test_query() -> Result<()> { - let config = SessionConfig::new().with_target_partitions(1); - let ctx = SessionContext::new_with_config(config); - - ctx.sql(MULTIPLE_ORDERED_TABLE).await?; - - let sql = "SELECT c3+c4, SUM(c3+c4) OVER() -FROM t"; - - let msg = format!("Creating logical plan for '{sql}'"); - let dataframe = ctx.sql(sql).await.expect(&msg); - let physical_plan = dataframe.create_physical_plan().await?; - print_plan(&physical_plan)?; - let batches = collect(physical_plan.clone(), ctx.task_ctx()).await?; - print_batches(&batches)?; - - let expected = vec![ - "ProjectionExec: expr=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as nth_value1]", - " GlobalLimitExec: skip=0, fetch=5", - " BoundedWindowAggExec: wdw=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: \"NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Sorted]", - " CsvExec: file_groups={1 group: [[Users/akurmustafa/projects/synnada/arrow-datafusion-synnada/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true", - ]; - // Get string representation of the plan - let actual = get_plan_string(&physical_plan); - assert_eq!( - expected, actual, - "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" - ); - - let expected = [ - "+------------+", - "| nth_value1 |", - "+------------+", - "| 2 |", - "| 2 |", - "| 2 |", - "| 2 |", - "| 2 |", - "+------------+", - ]; - assert_batches_eq!(expected, &batches); - Ok(()) - } -} From fb9e4d68e0167f0a7845fc3ceb423f83e2c960ec Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 13:58:20 +0300 Subject: [PATCH 22/27] Remove unnecessary changes --- datafusion/optimizer/src/common_subexpr_eliminate.rs | 2 +- datafusion/sqllogictest/test_files/insert.slt | 2 +- .../sqllogictest/test_files/project_complex_sub_query.slt | 4 ++-- datafusion/sqllogictest/test_files/subquery.slt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 51e704d30ecc9..dcb6f23e237f3 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -944,7 +944,7 @@ impl ProjectionAdder { // cannot rewrite } } - Self::trim_expr(expr)?; + // Self::trim_expr(expr)?; Ok(()) } diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 113e544ad0636..fc272326973b1 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -169,7 +169,7 @@ ORDER BY c1 ---- logical_plan Dml: op=[Insert Into] table=[table_without_values] ---Projection: a1, a2 +--Projection: a1 AS a1, a2 AS a2 ----Sort: aggregate_test_100.c1 ASC NULLS LAST ------Projection: SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 --------WindowAggr: windowExpr=[[SUM(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, COUNT(UInt8(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt index 9f88de66ea3bd..56e728325b790 100644 --- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt +++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt @@ -37,7 +37,7 @@ FROM t; ---- logical_plan WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ---Projection: t.c3 + t.c4 +--Projection: t.c3 + t.c4 AS t.c3 + t.c4 ----TableScan: t projection=[c3, c4] physical_plan WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] @@ -90,7 +90,7 @@ FROM t; ---- logical_plan WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ---Projection: t.c3 - t.c4 +--Projection: t.c3 - t.c4 AS t.c3 - t.c4 ----TableScan: t projection=[c3, c4] physical_plan WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index aac64b55b535a..4fb94cfab5233 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -449,7 +449,7 @@ query TT explain SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 1) as t2_int from t1 ---- logical_plan -Projection: t1.t1_id, () +Projection: t1.t1_id, () AS t2_int --Subquery: ----Limit: skip=0, fetch=1 ------Projection: t2.t2_int From bdf626ad2f11973b7ae1a3a06ed121ad630c39a1 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 14:03:46 +0300 Subject: [PATCH 23/27] Resolve linter errors --- .../optimizer/src/common_subexpr_eliminate.rs | 64 +++---------------- 1 file changed, 9 insertions(+), 55 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index dcb6f23e237f3..b6c51b8f8a4c4 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -420,8 +420,7 @@ impl CommonSubexprEliminate { /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the /// fn add_extra_projection(&self, plan: &LogicalPlan) -> Result> { - // println!("plan at the start: {:?}", plan); - let res = plan + let result = plan .clone() .rewrite(&mut ProjectionAdder { insertion_point_map: HashMap::new(), @@ -429,9 +428,7 @@ impl CommonSubexprEliminate { complex_exprs: HashMap::new(), })? .data; - // .map(|transformed| Some(transformed.data)) - // println!("plan at the end: {:?}", res); - Ok(Some(res)) + Ok(Some(result)) } } impl OptimizerRule for CommonSubexprEliminate { @@ -912,30 +909,27 @@ impl ProjectionAdder { } fn update_expr_with_available_columns( - &self, expr: &mut Expr, available_columns: &[Column], ) -> Result<()> { match expr { Expr::BinaryExpr(_) => { for available_col in available_columns { - // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?); if available_col.flat_name() == expr.display_name()? { - // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col); *expr = Expr::Column(available_col.clone()); } } } Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => { args.iter_mut().try_for_each(|arg| { - self.update_expr_with_available_columns(arg, available_columns) + Self::update_expr_with_available_columns(arg, available_columns) })? } Expr::Cast(Cast { expr, .. }) => { - self.update_expr_with_available_columns(expr, available_columns)? + Self::update_expr_with_available_columns(expr, available_columns)? } Expr::Alias(alias) => { - self.update_expr_with_available_columns( + Self::update_expr_with_available_columns( &mut alias.expr, available_columns, )?; @@ -944,31 +938,15 @@ impl ProjectionAdder { // cannot rewrite } } - // Self::trim_expr(expr)?; Ok(()) } - fn trim_expr(expr: &mut Expr) -> Result<()> { - let orig_name = expr.display_name()?; - match expr { - Expr::Alias(alias) => { - Self::trim_expr(&mut alias.expr)?; - if orig_name == alias.expr.display_name()? { - *expr = *alias.expr.clone(); - } - Ok(()) - } - _ => Ok(()), - } - } - // Assumes operators doesn't modify name of the fields. // Otherwise this operation is not safe. fn extend_with_exprs(&mut self, node: &LogicalPlan) { // use depth to trace where we are in the LogicalPlan tree // extract all expressions + check whether it contains in depth_sets let exprs = node.expressions(); - // println!("extended exprs: {:?}", exprs); let mut schema = node.schema().deref().clone(); for ip in node.inputs() { schema.merge(ip.schema()); @@ -978,8 +956,6 @@ impl ProjectionAdder { let (_, count) = self.complex_exprs.entry(expr).or_insert_with(|| (dtype, 0)); *count += 1; } - // self.cumulative_map.extend(extended_set); - // self.data_type_map.extend(data_map); } } impl TreeNodeRewriter for ProjectionAdder { @@ -993,14 +969,13 @@ impl TreeNodeRewriter for ProjectionAdder { LogicalPlan::TableScan(_) => { self.insertion_point_map .insert(self.depth - 1, self.complex_exprs.clone()); - return Ok(Transformed::no(node)); + Ok(Transformed::no(node)) } LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => { self.extend_with_exprs(&node); Ok(Transformed::no(node)) } LogicalPlan::Projection(_) => { - // println!("proj node exprs: {:?}", node.expressions()); if self.complex_exprs.is_empty() { self.extend_with_exprs(&node); } else { @@ -1025,17 +1000,8 @@ impl TreeNodeRewriter for ProjectionAdder { .unwrap_or_default(); self.depth -= 1; // do not do extra things - // println!("---------------------"); - // for (depth, complex_exprs) in &self.insertion_point_map{ - // println!("depth: {:?}, complex_exprs: {:?}", depth, complex_exprs); - // } - // println!("---------------------"); - // let should_add_projection = !cached_exprs.is_empty(); let should_add_projection = cached_exprs.iter().any(|(_expr, (_, count))| *count > 1); - // if let LogicalPlan::Projection(projection) = node { - // - // } let children = node.inputs(); if children.len() != 1 { @@ -1062,11 +1028,10 @@ impl TreeNodeRewriter for ProjectionAdder { } // adding new plan here - let new_child = LogicalPlan::Projection(Projection::try_new( + LogicalPlan::Projection(Projection::try_new( project_exprs, Arc::new(node.inputs()[0].clone()), - )?); - new_child + )?) } else { child }; @@ -1078,21 +1043,10 @@ impl TreeNodeRewriter for ProjectionAdder { .map(|field| field.qualified_column()) .collect::>(); // Replace expressions with its pre-computed variant if available. - // println!("-----------------------"); - // for expr in &expressions{ - // println!("old expr: {:?}", expr); - // } expressions.iter_mut().try_for_each(|expr| { - self.update_expr_with_available_columns(expr, &available_columns) + Self::update_expr_with_available_columns(expr, &available_columns) })?; - // for expr in &expressions{ - // println!("new expr: {:?}", expr); - // } - // println!("-----------------------"); - - // println!("old node:{:?}", node); let new_node = node.with_new_exprs(expressions, [child].to_vec())?; - // println!("new node:{:?}", new_node); Ok(Transformed::yes(new_node)) } } From 31cc2d7d0052418b06389d2cf6f105c284267766 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 14:20:37 +0300 Subject: [PATCH 24/27] Remove left over file --- datafusion/core/tests/data/demo.csv | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 datafusion/core/tests/data/demo.csv diff --git a/datafusion/core/tests/data/demo.csv b/datafusion/core/tests/data/demo.csv deleted file mode 100644 index d1492228246f0..0000000000000 --- a/datafusion/core/tests/data/demo.csv +++ /dev/null @@ -1,2 +0,0 @@ -c1,c2,c3,c4,c5 -1,2,3,4,5 From e7f4b49694803bb41be2500c8003cc65a2b91363 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 14:23:17 +0300 Subject: [PATCH 25/27] Minor changes --- datafusion/core/tests/data/project_complex_expression.csv | 2 +- datafusion/optimizer/src/optimize_projections.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv index f37f11cc7f1ca..c78f35a485cd0 100644 --- a/datafusion/core/tests/data/project_complex_expression.csv +++ b/datafusion/core/tests/data/project_complex_expression.csv @@ -3,4 +3,4 @@ c1,c2,c3,c4,c5 6,7,8,9,10 11,12,13,14,15 16,17,18,19,20 -21,22,23,24,25 +21,22,23,24,25 \ No newline at end of file diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 9201d0a5e9b39..39dc56d2af93f 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -174,7 +174,6 @@ fn optimize_projections( return Ok(None); } LogicalPlan::Projection(proj) => { - // println!("expr:{:?}", proj.expr); return if let Some(proj) = merge_consecutive_projections(proj)? { Ok(Some( rewrite_projection_given_requirements(&proj, config, indices)? From 313cccd78cc782217daed56f6a27baf12bb101ed Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 14:27:05 +0300 Subject: [PATCH 26/27] Remove unnecessary changes --- datafusion/expr/src/expr.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 2e4930a1a0cb3..7ede4cd8ffc9e 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1866,19 +1866,8 @@ fn create_name(e: &Expr) -> Result { Ok(format!("{expr} BETWEEN {low} AND {high}")) } } - Expr::Sort(Sort { - expr, - asc, - nulls_first, - }) => { - let dir = if *asc { "ASC" } else { "DESC" }; - let nulls = if *nulls_first { - "NULLS_FIRST" - } else { - "NULLS_LAST" - }; - Ok(format!("{expr} {dir} {nulls}")) - // internal_err!("Create name does not support sort expression") + Expr::Sort { .. } => { + internal_err!("Create name does not support sort expression") } Expr::Wildcard { qualifier } => match qualifier { Some(qualifier) => internal_err!( From 759226be6c631134da3bd207400323b3f9f47321 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 28 Mar 2024 14:37:38 +0300 Subject: [PATCH 27/27] Update comment --- .../optimizer/src/common_subexpr_eliminate.rs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index b6c51b8f8a4c4..e0638c2b6c010 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -850,10 +850,13 @@ fn replace_common_expr( } struct ProjectionAdder { + // Keeps track of cumulative usage of common expressions with its corresponding data type. + // accross plan where key is unsafe nodes that cumulative tracking is invalidated. insertion_point_map: HashMap>, depth: usize, + // Keeps track of cumulative usage of the common expressions with its corresponding data type. + // between safe nodes. complex_exprs: HashMap, - // should_update_parents: bool, } pub fn is_not_complex(op: &Operator) -> bool { matches!( @@ -967,21 +970,24 @@ impl TreeNodeRewriter for ProjectionAdder { self.depth += 1; match node { LogicalPlan::TableScan(_) => { + // Stop tracking cumulative usage at the source. + let complex_exprs = std::mem::take(&mut self.complex_exprs); self.insertion_point_map - .insert(self.depth - 1, self.complex_exprs.clone()); + .insert(self.depth - 1, complex_exprs); Ok(Transformed::no(node)) } LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => { + // These are safe operators where, expression identity is preserved during operation. self.extend_with_exprs(&node); Ok(Transformed::no(node)) } LogicalPlan::Projection(_) => { - if self.complex_exprs.is_empty() { - self.extend_with_exprs(&node); - } else { - self.insertion_point_map - .insert(self.depth - 1, self.complex_exprs.clone()); - } + // Stop tracking cumulative usage at the projection since it may invalidate expression identity. + let complex_exprs = std::mem::take(&mut self.complex_exprs); + self.insertion_point_map + .insert(self.depth - 1, complex_exprs); + // Start tracking common expressions from now on including projection. + self.extend_with_exprs(&node); Ok(Transformed::no(node)) } _ => {