From f4959210fbe66a48603ccdf91d49c2b76a6d2b3e Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Tue, 19 Mar 2024 18:53:01 -0500
Subject: [PATCH 01/27] for debug

---
 datafusion/expr/src/tree_node/expr.rs         |   2 +
 .../optimizer/src/common_subexpr_eliminate.rs | 132 +++++++++++++++---
 2 files changed, 113 insertions(+), 21 deletions(-)
diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs
index 1c672851e9b5b..64967b509cca2 100644
--- a/datafusion/expr/src/tree_node/expr.rs
+++ b/datafusion/expr/src/tree_node/expr.rs
@@ -79,6 +79,7 @@ impl TreeNode for Expr {
             | Expr::Wildcard {..}
             | Expr::Placeholder (_) => vec![],
             Expr::BinaryExpr(BinaryExpr { left, right, .. }) => {
+                println!("left is {:?}, right is {:?}", left, right);
                 vec![left.as_ref(), right.as_ref()]
             }
             Expr::Like(Like { expr, pattern, .. })
@@ -122,6 +123,7 @@ impl TreeNode for Expr {
                 let mut expr_vec = args.iter().collect::<Vec<_>>();
                 expr_vec.extend(partition_by);
                 expr_vec.extend(order_by);
+                println!("expressions vector is {:?}", expr_vec);
                 expr_vec
             }
             Expr::InList(InList { expr, list, .. }) => {
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 0c9064d0641fc..5a139b818cefd 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -23,6 +23,7 @@ use std::sync::Arc;
 use crate::utils::is_volatile_expression;
 use crate::{utils, OptimizerConfig, OptimizerRule};
 
+use arrow::array;
 use arrow::datatypes::DataType;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
@@ -34,6 +35,7 @@ use datafusion_common::{
 use datafusion_expr::expr::Alias;
 use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window};
 use datafusion_expr::{col, Expr, ExprSchemable};
+use regex_syntax::ast::print;
 
 /// A map from expression's identifier to tuple including
 /// - the expression itself (cloned)
@@ -120,14 +122,25 @@ impl CommonSubexprEliminate {
             .iter()
             .zip(arrays_list.iter())
             .map(|(exprs, arrays)| {
-                exprs
+                println!("************ \n expr is {:?} \n ************ \n", exprs);
+                let res = exprs
                     .iter()
                     .cloned()
                     .zip(arrays.iter())
                     .map(|(expr, id_array)| {
+                        println!(
+                            "************ \n current expr is {:?} \n ************ \n",
+                            expr
+                        );
+                        println!(
+                            "************ \n current id_array is {:?} \n ************ \n",
+                            id_array
+                        );
                         replace_common_expr(expr, id_array, expr_set, affected_id)
                     })
-                    .collect::<Result<Vec<_>>>()
+                    .collect::<Result<Vec<_>>>();
+                println!("************ \n expr is {:?} \n ************ \n", res);
+                res
             })
             .collect::<Result<Vec<_>>>()
     }
@@ -140,6 +153,18 @@ impl CommonSubexprEliminate {
         expr_set: &ExprSet,
         config: &dyn OptimizerConfig,
     ) -> Result<(Vec<Vec<Expr>>, LogicalPlan)> {
+        println!(
+            "************ \n current plan is {:?} \n *********** \n",
+            input
+        );
+        println!(
+            "************ \n exprs list are {:?} \n *********** \n",
+            exprs_list
+        );
+        println!(
+            "************ \n array list are {:?} \n *********** \n",
+            arrays_list
+        );
         let mut affected_id = BTreeSet::<Identifier>::new();
 
         let rewrite_exprs =
@@ -151,7 +176,14 @@ impl CommonSubexprEliminate {
         if !affected_id.is_empty() {
             new_input = build_common_expr_project_plan(new_input, affected_id, expr_set)?;
         }
-
+        println!(
+            "\n *********** \n rewrite expressions are {:?} \n *********** \n",
+            rewrite_exprs
+        );
+        println!(
+            "\n *********** \n new input is {:?} \n *********** \n",
+            new_input
+        );
         Ok((rewrite_exprs, new_input))
     }
 
@@ -187,7 +219,10 @@ impl CommonSubexprEliminate {
             window_exprs.push(window_expr);
             arrays_per_window.push(arrays);
         }
-
+        println!(
+            "\n *********** \n window exprs are {:?} \n *********** \n",
+            window_exprs
+        );
         let mut window_exprs = window_exprs
             .iter()
             .map(|expr| expr.as_slice())
@@ -196,7 +231,10 @@ impl CommonSubexprEliminate {
             .iter()
             .map(|arrays| arrays.as_slice())
             .collect::<Vec<_>>();
-
+        println!(
+            "\n *********** \n arrays per windows are  {:?} \n *********** \n",
+            arrays_per_window
+        );
         assert_eq!(window_exprs.len(), arrays_per_window.len());
         let (mut new_expr, new_input) = self.rewrite_expr(
             &window_exprs,
@@ -205,8 +243,19 @@ impl CommonSubexprEliminate {
             &expr_set,
             config,
         )?;
+        println!(
+            "\n *********** \n new exprs are  {:?} \n *********** \n",
+            new_expr
+        );
+        println!(
+            "\n *********** \n new input are  {:?} \n *********** \n",
+            new_input
+        );
         assert_eq!(window_exprs.len(), new_expr.len());
-
+        println!(
+            "\n *********** \n before plan are  {:?} \n *********** \n",
+            plan
+        );
         // Construct consecutive window operator, with their corresponding new window expressions.
         plan = new_input;
         while let Some(new_window_expr) = new_expr.pop() {
@@ -220,13 +269,17 @@ impl CommonSubexprEliminate {
                 .into_iter()
                 .zip(orig_window_expr.iter())
                 .map(|(new_window_expr, window_expr)| {
+                    println!("new_window_expr is {:?}", new_window_expr);
                     let original_name = window_expr.name_for_alias()?;
                     new_window_expr.alias_if_changed(original_name)
                 })
                 .collect::<Result<Vec<_>>>()?;
             plan = LogicalPlan::Window(Window::try_new(new_window_expr, Arc::new(plan))?);
         }
-
+        println!(
+            "\n *********** \n final plan are  {:?} \n *********** \n",
+            plan
+        );
         Ok(plan)
     }
 
@@ -353,6 +406,8 @@ impl CommonSubexprEliminate {
         config: &dyn OptimizerConfig,
     ) -> Result<LogicalPlan> {
         let expr = plan.expressions();
+        println!("********** \n expressions are {:?} \n ********* \n", expr);
+        println!("********** \n ur plan is {:?} \n ********* \n", plan);
         let inputs = plan.inputs();
         let input = inputs[0];
         let input_schema = Arc::clone(input.schema());
@@ -360,11 +415,17 @@ impl CommonSubexprEliminate {
 
         // Visit expr list and build expr identifier to occuring count map (`expr_set`).
         let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?;
-
+        println!("********* \n arrays {:?} \n ********* \n", arrays);
+        println!("********* \n expr_set {:?} \n ********* \n", expr_set);
         let (mut new_expr, new_input) =
             self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?;
-
-        plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input])
+        println!(
+            "********* \n new expr is {:?} \n and new input is {:?} ********* \n",
+            new_expr, new_input
+        );
+        let res_plan = plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]);
+        println!("********* \n res plan is {:?} \n ********* \n", res_plan);
+        res_plan
     }
 }
 
@@ -412,9 +473,17 @@ impl OptimizerRule for CommonSubexprEliminate {
         };
 
         let original_schema = plan.schema().clone();
+        println!(
+            "***************** \n original schema is {:?} \n ***************",
+            original_schema
+        );
         match optimized_plan {
             Some(optimized_plan) if optimized_plan.schema() != &original_schema => {
                 // add an additional projection if the output schema changed.
+                println!(
+                    "************** \n optimized schema {:?} \n ************",
+                    optimized_plan.schema(),
+                );
                 Ok(Some(build_recover_project_plan(
                     &original_schema,
                     optimized_plan,
@@ -454,9 +523,12 @@ fn to_arrays(
     expr_set: &mut ExprSet,
     expr_mask: ExprMask,
 ) -> Result<Vec<Vec<(usize, String)>>> {
-    expr.iter()
+    println!("schema is {:?}", input_schema);
+    let res = expr
+        .iter()
         .map(|e| {
             let mut id_array = vec![];
+            println!("current expression is {:?}", e);
             expr_to_identifier(
                 e,
                 expr_set,
@@ -467,7 +539,9 @@ fn to_arrays(
 
             Ok(id_array)
         })
-        .collect::<Result<Vec<_>>>()
+        .collect::<Result<Vec<_>>>();
+    println!("res is {:?}", res);
+    res
 }
 
 /// Build the "intermediate" projection plan that evaluates the extracted common expressions.
@@ -674,6 +748,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> {
     }
 
     fn f_up(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> {
+        println!("********* \n expr is {:?} \n *********", expr);
         self.series_number += 1;
 
         let (idx, sub_expr_identifier) = self.pop_enter_mark();
@@ -693,7 +768,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> {
         self.visit_stack.push(VisitRecord::ExprItem(desc.clone()));
 
         let data_type = expr.get_type(&self.input_schema)?;
-
+        print!("data type is {:?}", data_type);
         self.expr_set
             .entry(desc)
             .or_insert_with(|| (expr.clone(), 0, data_type))
@@ -747,6 +822,10 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> {
         // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate
         // the `id_array`, which records the expr's identifier used to rewrite expr. So if we
         // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too.
+        println!(
+            "****************** \n expr is {:?} \n ****************** \n ",
+            expr
+        );
         if expr.short_circuits() || is_volatile_expression(&expr)? {
             return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump));
         }
@@ -819,14 +898,25 @@ fn replace_common_expr(
     expr_set: &ExprSet,
     affected_id: &mut BTreeSet<Identifier>,
 ) -> Result<Expr> {
-    expr.rewrite(&mut CommonSubexprRewriter {
-        expr_set,
-        id_array,
-        affected_id,
-        max_series_number: 0,
-        curr_index: 0,
-    })
-    .data()
+    println!("************ \n expr are {:?} \n ************", expr);
+    println!(
+        "************ \n id_array are {:?} \n ************",
+        id_array
+    );
+    let rewrited = expr
+        .rewrite(&mut CommonSubexprRewriter {
+            expr_set,
+            id_array,
+            affected_id,
+            max_series_number: 0,
+            curr_index: 0,
+        })
+        .data();
+    println!(
+        "************ \n rewrited values are {:?} \n ************",
+        rewrited
+    );
+    rewrited
 }
 
 #[cfg(test)]

From cc1d4125688f98bace96adac7ab80046a3550898 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Wed, 20 Mar 2024 20:10:46 -0500
Subject: [PATCH 02/27] adding projections for subquery

---
 .../optimizer/src/common_subexpr_eliminate.rs | 204 ++++++++++++++++--
 1 file changed, 189 insertions(+), 15 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 5a139b818cefd..d2a6c3bc270d2 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -17,24 +17,28 @@
 
 //! Eliminate common sub-expression.
 
-use std::collections::{BTreeSet, HashMap};
-use std::sync::Arc;
-
 use crate::utils::is_volatile_expression;
 use crate::{utils, OptimizerConfig, OptimizerRule};
+use datafusion_expr::Expr;
+use std::collections::{BTreeSet, HashMap};
+use std::slice::Windows;
+use std::sync::Arc;
 
 use arrow::array;
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, SchemaRef};
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
     TreeNodeVisitor,
 };
 use datafusion_common::{
-    internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result,
+    internal_datafusion_err, internal_err, plan_err, Column, DFField, DFSchema,
+    DFSchemaRef, DataFusionError, Result,
 };
-use datafusion_expr::expr::Alias;
+use datafusion_expr::expr::{Alias, WindowFunction};
 use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window};
-use datafusion_expr::{col, Expr, ExprSchemable};
+use datafusion_expr::tree_node::plan;
+use datafusion_expr::{col, BinaryExpr, Cast, ExprSchemable};
+use hashbrown::HashSet;
 use regex_syntax::ast::print;
 
 /// A map from expression's identifier to tuple including
@@ -171,7 +175,7 @@ impl CommonSubexprEliminate {
             self.rewrite_exprs_list(exprs_list, arrays_list, expr_set, &mut affected_id)?;
 
         let mut new_input = self
-            .try_optimize(input, config)?
+            .common_optimize(input, config)?
             .unwrap_or_else(|| input.clone());
         if !affected_id.is_empty() {
             new_input = build_common_expr_project_plan(new_input, affected_id, expr_set)?;
@@ -262,9 +266,13 @@ impl CommonSubexprEliminate {
             // Since `new_expr` and `window_exprs` length are same. We can safely `.unwrap` here.
             let orig_window_expr = window_exprs.pop().unwrap();
             assert_eq!(new_window_expr.len(), orig_window_expr.len());
-
+            println!(
+                "************* \n origin window expressions {:?} \n ************** \n ",
+                orig_window_expr
+            );
             // Rename new re-written window expressions with original name (by giving alias)
             // Otherwise we may receive schema error, in subsequent operators.
+            println!("************* \n before changed window expressions {:?} \n ************** \n ", new_window_expr);
             let new_window_expr = new_window_expr
                 .into_iter()
                 .zip(orig_window_expr.iter())
@@ -274,6 +282,7 @@ impl CommonSubexprEliminate {
                     new_window_expr.alias_if_changed(original_name)
                 })
                 .collect::<Result<Vec<_>>>()?;
+            println!("************* \n after changed window expressions {:?} \n ************** \n ", new_window_expr);
             plan = LogicalPlan::Window(Window::try_new(new_window_expr, Arc::new(plan))?);
         }
         println!(
@@ -428,9 +437,8 @@ impl CommonSubexprEliminate {
         res_plan
     }
 }
-
-impl OptimizerRule for CommonSubexprEliminate {
-    fn try_optimize(
+impl CommonSubexprEliminate {
+    fn common_optimize(
         &self,
         plan: &LogicalPlan,
         config: &dyn OptimizerConfig,
@@ -484,6 +492,10 @@ impl OptimizerRule for CommonSubexprEliminate {
                     "************** \n optimized schema {:?} \n ************",
                     optimized_plan.schema(),
                 );
+                println!(
+                    "************** \n current plan is {:?} \n ************",
+                    optimized_plan,
+                );
                 Ok(Some(build_recover_project_plan(
                     &original_schema,
                     optimized_plan,
@@ -492,6 +504,33 @@ impl OptimizerRule for CommonSubexprEliminate {
             plan => Ok(plan),
         }
     }
+    /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the
+    ///
+    fn add_extra_projection(&self, plan: &LogicalPlan) -> Result<Option<LogicalPlan>> {
+        let res = plan
+            .clone()
+            .rewrite(&mut ProjectionAdder {
+                depth_map: HashMap::new(),
+                depth: 0,
+            })
+            .map(|transformed| Some(transformed.data));
+        println!("************** \n result plan is {:?} *********** \n", res);
+        res
+    }
+}
+impl OptimizerRule for CommonSubexprEliminate {
+    fn try_optimize(
+        &self,
+        plan: &LogicalPlan,
+        config: &dyn OptimizerConfig,
+    ) -> Result<Option<LogicalPlan>> {
+        let optimized_plan_option = self.common_optimize(plan, config)?;
+        let plan = match optimized_plan_option {
+            Some(plan) => plan,
+            _ => plan.clone(),
+        };
+        self.add_extra_projection(&plan)
+    }
 
     fn name(&self) -> &str {
         "common_sub_expression_eliminate"
@@ -557,6 +596,10 @@ fn build_common_expr_project_plan(
         match expr_set.get(&id) {
             Some((expr, _, data_type)) => {
                 // todo: check `nullable`
+                println!(
+                    "\n ********** expr is {:?} \n \n and data type is {:?} \n **************** \n",
+                    expr, data_type
+                );
                 let field = DFField::new_unqualified(&id, data_type.clone(), true);
                 fields_set.insert(field.name().to_owned());
                 project_exprs.push(expr.clone().alias(&id));
@@ -590,8 +633,19 @@ fn build_recover_project_plan(
     let col_exprs = schema
         .fields()
         .iter()
-        .map(|field| Expr::Column(field.qualified_column()))
+        .map(|field| {
+            println!("field is {:?}", field);
+            Expr::Column(field.qualified_column())
+        })
         .collect();
+    println!(
+        "************** \n col_exprs are {:?} \n **************",
+        col_exprs
+    );
+    println!(
+        "************** \n current plan is  {:?} \n **************",
+        input
+    );
     Ok(LogicalPlan::Projection(Projection::try_new(
         col_exprs,
         Arc::new(input),
@@ -919,6 +973,126 @@ fn replace_common_expr(
     rewrited
 }
 
+struct ProjectionAdder {
+    depth_map: HashMap<u128, HashSet<DFField>>,
+    depth: u128,
+}
+
+impl ProjectionAdder {
+    // TODO: adding more expressions for sub query, currently only support for Simple Binary Expressions
+    fn get_complex_expressions(
+        exprs: Vec<Expr>,
+        schema: DFSchemaRef,
+    ) -> HashSet<DFField> {
+        let mut res = HashSet::new();
+        for expr in exprs {
+            match expr {
+                Expr::BinaryExpr(BinaryExpr {
+                    left: ref l_box,
+                    op: _,
+                    right: ref r_box,
+                }) => {
+                    match (&**l_box, &**r_box) {
+                        (Expr::Column(l), Expr::Column(r)) => {
+                            // Both `left` and `right` are `Expr::Column`, so we push to `res`
+                            if schema.has_column(l) && schema.has_column(r) {
+                                res.insert(DFField::new_unqualified(
+                                    &expr.to_string(),
+                                    schema
+                                        .field_from_column(l)
+                                        .unwrap()
+                                        .data_type()
+                                        .clone(),
+                                    true,
+                                ));
+                            }
+                        }
+                        // If they are not both `Expr::Column`, you can handle other cases or do nothing
+                        _ => {}
+                    }
+                }
+                Expr::Cast(_) => {
+                    res.extend(Self::get_complex_expressions(vec![expr], schema.clone()))
+                }
+
+                Expr::WindowFunction(WindowFunction { fun, args, .. }) => {
+                    res.extend(Self::get_complex_expressions(args, schema.clone()))
+                }
+                _ => {}
+            }
+        }
+        res
+    }
+}
+impl TreeNodeRewriter for ProjectionAdder {
+    type Node = LogicalPlan;
+    /// currently we just collect the complex bianryOP
+
+    fn f_down(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
+        // use depth to trace where we are in the LogicalPlan tree
+        self.depth += 1;
+        // extract all expressions + check whether it contains in depth_sets
+        let exprs = node.expressions();
+        println!("********* \n cur plan is  {:?} \n *********** \n", node);
+        println!("********* \n exprs are {:?} \n *********** \n", exprs);
+        let depth_set = self.depth_map.entry(self.depth).or_default();
+        println!("self.input schema is {:?}", node.schema());
+        depth_set.extend(Self::get_complex_expressions(exprs, node.schema().clone()));
+        Ok(Transformed::no(node))
+    }
+    fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
+        self.depth -= 1;
+        println!("*********** \ncur plan is {:?} \n*************\n", node);
+
+        let current_depth_schema =
+            self.depth_map.get(&self.depth).cloned().unwrap_or_default();
+
+        // get the intersected part
+        let added_schema = self
+            .depth_map
+            .iter()
+            .filter(|(&depth, _)| depth < self.depth)
+            .fold(current_depth_schema, |acc, (_, fields)| {
+                acc.intersection(fields).cloned().collect()
+            });
+
+        // do not do extra things
+        if added_schema.is_empty() {
+            return Ok(Transformed::no(node));
+        }
+
+        println!("\n*************\n{:?}\n*************\n", added_schema);
+
+        match node {
+            // do not add for Projections
+            LogicalPlan::Projection(_) => Ok(Transformed::no(node)),
+            _ => {
+                let mut col_exprs: Vec<Expr> = node
+                    .schema()
+                    .fields()
+                    .iter()
+                    .map(|field| Expr::Column(field.qualified_column()))
+                    .collect();
+
+                col_exprs.extend(
+                    added_schema
+                        .iter()
+                        .map(|field| Expr::Column(field.qualified_column())),
+                );
+
+                // adding new plan here
+                let new_plan = LogicalPlan::Projection(Projection::try_new(
+                    col_exprs.clone(),
+                    Arc::new(node.clone()),
+                )?);
+
+                Ok(Transformed::yes(
+                    node.with_new_exprs(col_exprs, [new_plan].to_vec())?,
+                ))
+            }
+        }
+    }
+}
 #[cfg(test)]
 mod test {
     use std::iter;
@@ -943,7 +1117,7 @@ mod test {
     fn assert_optimized_plan_eq(expected: &str, plan: &LogicalPlan) {
         let optimizer = CommonSubexprEliminate {};
         let optimized_plan = optimizer
-            .try_optimize(plan, &OptimizerContext::new())
+            .common_optimize(plan, &OptimizerContext::new())
             .unwrap()
             .expect("failed to optimize plan");
         let formatted_plan = format!("{optimized_plan:?}");
@@ -1362,7 +1536,7 @@ mod test {
             .unwrap();
         let rule = CommonSubexprEliminate {};
         let optimized_plan = rule
-            .try_optimize(&plan, &OptimizerContext::new())
+            .common_optimize(&plan, &OptimizerContext::new())
             .unwrap()
             .unwrap();
 

From 1c64d23b02436dd7d20fe192b32914a6c78fde88 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Thu, 21 Mar 2024 20:12:04 -0500
Subject: [PATCH 03/27] refactor code and add tests

---
 datafusion/expr/src/tree_node/expr.rs         |   2 -
 .../optimizer/src/common_subexpr_eliminate.rs | 168 ++++++++++++------
 .../optimizer/src/optimize_projections.rs     | 104 ++++++++++-
 datafusion/sql/src/select.rs                  |   2 +
 .../test_files/project_complex_sub_query.slt  |  52 ++++++
 5 files changed, 265 insertions(+), 63 deletions(-)
 create mode 100644 datafusion/sqllogictest/test_files/project_complex_sub_query.slt

diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs
index 64967b509cca2..1c672851e9b5b 100644
--- a/datafusion/expr/src/tree_node/expr.rs
+++ b/datafusion/expr/src/tree_node/expr.rs
@@ -79,7 +79,6 @@ impl TreeNode for Expr {
             | Expr::Wildcard {..}
             | Expr::Placeholder (_) => vec![],
             Expr::BinaryExpr(BinaryExpr { left, right, .. }) => {
-                println!("left is {:?}, right is {:?}", left, right);
                 vec![left.as_ref(), right.as_ref()]
             }
             Expr::Like(Like { expr, pattern, .. })
@@ -123,7 +122,6 @@ impl TreeNode for Expr {
                 let mut expr_vec = args.iter().collect::<Vec<_>>();
                 expr_vec.extend(partition_by);
                 expr_vec.extend(order_by);
-                println!("expressions vector is {:?}", expr_vec);
                 expr_vec
             }
             Expr::InList(InList { expr, list, .. }) => {
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index d2a6c3bc270d2..11121f4bd4db7 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -21,6 +21,7 @@ use crate::utils::is_volatile_expression;
 use crate::{utils, OptimizerConfig, OptimizerRule};
 use datafusion_expr::Expr;
 use std::collections::{BTreeSet, HashMap};
+use std::ops::Deref;
 use std::slice::Windows;
 use std::sync::Arc;
 
@@ -486,8 +487,8 @@ impl CommonSubexprEliminate {
             original_schema
         );
         match optimized_plan {
+            Some(LogicalPlan::Projection(_)) => Ok(optimized_plan),
             Some(optimized_plan) if optimized_plan.schema() != &original_schema => {
-                // add an additional projection if the output schema changed.
                 println!(
                     "************** \n optimized schema {:?} \n ************",
                     optimized_plan.schema(),
@@ -510,6 +511,7 @@ impl CommonSubexprEliminate {
         let res = plan
             .clone()
             .rewrite(&mut ProjectionAdder {
+                data_type_map: HashMap::new(),
                 depth_map: HashMap::new(),
                 depth: 0,
             })
@@ -525,11 +527,15 @@ impl OptimizerRule for CommonSubexprEliminate {
         config: &dyn OptimizerConfig,
     ) -> Result<Option<LogicalPlan>> {
         let optimized_plan_option = self.common_optimize(plan, config)?;
+
+        // println!("optimized plan option is {:?}", optimized_plan_option);
         let plan = match optimized_plan_option {
             Some(plan) => plan,
             _ => plan.clone(),
         };
-        self.add_extra_projection(&plan)
+        let res = self.add_extra_projection(&plan);
+        println!("res is {:?}", res);
+        res
     }
 
     fn name(&self) -> &str {
@@ -600,6 +606,7 @@ fn build_common_expr_project_plan(
                     "\n ********** expr is {:?} \n \n and data type is {:?} \n **************** \n",
                     expr, data_type
                 );
+                println!("********* \n {:?} ***********\n", expr);
                 let field = DFField::new_unqualified(&id, data_type.clone(), true);
                 fields_set.insert(field.name().to_owned());
                 project_exprs.push(expr.clone().alias(&id));
@@ -615,7 +622,11 @@ fn build_common_expr_project_plan(
             project_exprs.push(Expr::Column(field.qualified_column()));
         }
     }
-
+    println!(
+        "************** \n exprs are {:?} \n ************** \n",
+        project_exprs
+    );
+    println!("********** \n input are {:?} \n ************** \n", input);
     Ok(LogicalPlan::Projection(Projection::try_new(
         project_exprs,
         Arc::new(input),
@@ -974,8 +985,9 @@ fn replace_common_expr(
 }
 
 struct ProjectionAdder {
-    depth_map: HashMap<u128, HashSet<DFField>>,
+    depth_map: HashMap<u128, HashSet<Expr>>,
     depth: u128,
+    data_type_map: HashMap<Expr, DataType>,
 }
 
 impl ProjectionAdder {
@@ -983,45 +995,55 @@ impl ProjectionAdder {
     fn get_complex_expressions(
         exprs: Vec<Expr>,
         schema: DFSchemaRef,
-    ) -> HashSet<DFField> {
+    ) -> (HashSet<Expr>, HashMap<Expr, DataType>) {
         let mut res = HashSet::new();
+        let mut expr_data_type: HashMap<Expr, DataType> = HashMap::new();
+        println!(
+            "********** \n current schema is {:?} \n ******** \n",
+            schema
+        );
         for expr in exprs {
+            println!("current expr is {:?}", expr);
             match expr {
                 Expr::BinaryExpr(BinaryExpr {
                     left: ref l_box,
                     op: _,
                     right: ref r_box,
-                }) => {
-                    match (&**l_box, &**r_box) {
-                        (Expr::Column(l), Expr::Column(r)) => {
-                            // Both `left` and `right` are `Expr::Column`, so we push to `res`
-                            if schema.has_column(l) && schema.has_column(r) {
-                                res.insert(DFField::new_unqualified(
-                                    &expr.to_string(),
-                                    schema
-                                        .field_from_column(l)
-                                        .unwrap()
-                                        .data_type()
-                                        .clone(),
-                                    true,
-                                ));
-                            }
-                        }
-                        // If they are not both `Expr::Column`, you can handle other cases or do nothing
-                        _ => {}
+                }) => match (&**l_box, &**r_box) {
+                    (Expr::Column(l), Expr::Column(r)) => {
+                        let l_field = schema
+                            .field_from_column(l)
+                            .expect("Field not found for left column");
+
+                        // res.insert(DFField::new_unqualified(
+                        //     &expr.to_string(),
+                        //     l_field.data_type().clone(),
+                        //     true,
+                        // ));
+                        expr_data_type
+                            .entry(expr.clone())
+                            .or_insert(l_field.data_type().clone());
+                        res.insert(expr.clone());
                     }
-                }
-                Expr::Cast(_) => {
-                    res.extend(Self::get_complex_expressions(vec![expr], schema.clone()))
+                    _ => {}
+                },
+                Expr::Cast(Cast { expr, data_type }) => {
+                    let (expr_set, type_map) =
+                        Self::get_complex_expressions(vec![*expr], schema.clone());
+                    res.extend(expr_set);
+                    expr_data_type.extend(type_map);
                 }
 
                 Expr::WindowFunction(WindowFunction { fun, args, .. }) => {
-                    res.extend(Self::get_complex_expressions(args, schema.clone()))
+                    let (expr_set, type_map) =
+                        Self::get_complex_expressions(args, schema.clone());
+                    res.extend(expr_set);
+                    expr_data_type.extend(type_map);
                 }
                 _ => {}
             }
         }
-        res
+        (res, expr_data_type)
     }
 }
 impl TreeNodeRewriter for ProjectionAdder {
@@ -1037,57 +1059,97 @@ impl TreeNodeRewriter for ProjectionAdder {
         println!("********* \n exprs are {:?} \n *********** \n", exprs);
         let depth_set = self.depth_map.entry(self.depth).or_default();
         println!("self.input schema is {:?}", node.schema());
-        depth_set.extend(Self::get_complex_expressions(exprs, node.schema().clone()));
+        let mut schema = node.schema().deref().clone();
+        for ip in node.inputs() {
+            schema.merge(ip.schema());
+        }
+        let (extended_set, data_map) =
+            Self::get_complex_expressions(exprs, Arc::new(schema));
+        println!(
+            "*********** \n extened set is {:?} \n ********** \n",
+            extended_set
+        );
+        depth_set.extend(extended_set);
+        self.data_type_map.extend(data_map);
         Ok(Transformed::no(node))
     }
     fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
-        self.depth -= 1;
-        println!("*********** \ncur plan is {:?} \n*************\n", node);
+        println!("*********** \n cur plan is {:?} \n*************\n", node);
 
         let current_depth_schema =
             self.depth_map.get(&self.depth).cloned().unwrap_or_default();
 
         // get the intersected part
-        let added_schema = self
+        let added_expr = self
             .depth_map
             .iter()
             .filter(|(&depth, _)| depth < self.depth)
-            .fold(current_depth_schema, |acc, (_, fields)| {
-                acc.intersection(fields).cloned().collect()
+            .fold(current_depth_schema, |acc, (_, expr)| {
+                acc.intersection(expr).cloned().collect()
             });
-
+        println!(
+            "******** \n intersected parts are {:?} \n ***********",
+            added_expr
+        );
+        println!(
+            "******** \n depth map is {:?} \n ***********",
+            self.depth_map
+        );
+        println!(
+            "************ \n data type map is {:?} \n *********** \n",
+            self.data_type_map
+        );
+        self.depth -= 1;
         // do not do extra things
-        if added_schema.is_empty() {
+        if added_expr.is_empty() {
             return Ok(Transformed::no(node));
         }
 
-        println!("\n*************\n{:?}\n*************\n", added_schema);
+        println!("\n*************\n{:?}\n*************\n", added_expr);
 
         match node {
             // do not add for Projections
-            LogicalPlan::Projection(_) => Ok(Transformed::no(node)),
+            LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) => {
+                Ok(Transformed::no(node))
+            }
             _ => {
-                let mut col_exprs: Vec<Expr> = node
-                    .schema()
-                    .fields()
-                    .iter()
-                    .map(|field| Expr::Column(field.qualified_column()))
-                    .collect();
+                // avoid recursive add projections
+                if added_expr.iter().any(|expr| {
+                    node.inputs()[0]
+                        .schema()
+                        .has_column_with_unqualified_name(&expr.to_string())
+                }) {
+                    return Ok(Transformed::no(node));
+                }
 
-                col_exprs.extend(
-                    added_schema
-                        .iter()
-                        .map(|field| Expr::Column(field.qualified_column())),
+                let mut field_set = HashSet::new();
+                let mut project_exprs = vec![];
+                for expr in added_expr {
+                    let f = DFField::new_unqualified(
+                        &expr.to_string(),
+                        self.data_type_map[&expr].clone(),
+                        true,
+                    );
+                    field_set.insert(f.name().to_owned());
+                    project_exprs.push(expr.clone().alias(expr.to_string()));
+                }
+                for field in node.inputs()[0].schema().fields() {
+                    if field_set.insert(field.qualified_name()) {
+                        project_exprs.push(Expr::Column(field.qualified_column()));
+                    }
+                }
+                println!(
+                    "************* \n project expressions are {:?} \n ********** \n",
+                    project_exprs
                 );
-
                 // adding new plan here
                 let new_plan = LogicalPlan::Projection(Projection::try_new(
-                    col_exprs.clone(),
-                    Arc::new(node.clone()),
+                    project_exprs,
+                    Arc::new(node.inputs()[0].clone()),
                 )?);
-
+                println!("new plan is {:?}", new_plan);
                 Ok(Transformed::yes(
-                    node.with_new_exprs(col_exprs, [new_plan].to_vec())?,
+                    node.with_new_exprs(node.expressions(), [new_plan].to_vec())?,
                 ))
             }
         }
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 08ee38f64abd8..ce8bcbc132d63 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -31,7 +31,8 @@ use crate::{OptimizerConfig, OptimizerRule};
 
 use arrow::datatypes::SchemaRef;
 use datafusion_common::{
-    get_required_group_by_exprs_indices, Column, DFSchema, DFSchemaRef, JoinType, Result,
+    get_required_group_by_exprs_indices, Column, DFField, DFSchema, DFSchemaRef,
+    JoinType, Result,
 };
 use datafusion_expr::expr::{Alias, ScalarFunction};
 use datafusion_expr::{
@@ -70,6 +71,10 @@ impl OptimizerRule for OptimizeProjections {
     ) -> Result<Option<LogicalPlan>> {
         // All output fields are necessary:
         let indices = (0..plan.schema().fields().len()).collect::<Vec<_>>();
+        println!(
+            "\n ************ \n plan.schema is {:?} \n ************ \n",
+            plan.schema()
+        );
         optimize_projections(plan, config, &indices)
     }
 
@@ -106,6 +111,8 @@ fn optimize_projections(
     config: &dyn OptimizerConfig,
     indices: &[usize],
 ) -> Result<Option<LogicalPlan>> {
+    println!("**********\n cur plan is {:?} \n ********* \n", plan);
+    println!("**********\n indices is {:?} \n ********* \n", indices);
     // `child_required_indices` stores
     // - indices of the columns required for each child
     // - a flag indicating whether putting a projection above children is beneficial for the parent.
@@ -277,7 +284,10 @@ fn optimize_projections(
             // Only use window expressions that are absolutely necessary according
             // to parent requirements:
             let new_window_expr = get_at_indices(&window.window_expr, &window_reqs);
-
+            println!(
+                "************ \n new_window_expr is {:?} \n *********** \n",
+                new_window_expr
+            );
             // Get all the required column indices at the input, either by the
             // parent or window expression requirements.
             let required_indices = get_all_required_indices(
@@ -292,7 +302,10 @@ fn optimize_projections(
             } else {
                 window.input.as_ref().clone()
             };
-
+            println!(
+                "************ \n window child is {:?} *********** \n",
+                window_child
+            );
             return if new_window_expr.is_empty() {
                 // When no window expression is necessary, use the input directly:
                 Ok(Some(window_child))
@@ -302,8 +315,16 @@ fn optimize_projections(
                 // refers to `old_child`.
                 let required_exprs =
                     get_required_exprs(window.input.schema(), &required_indices);
+                println!(
+                    "\n ************** \n required exprs {:?} \n ***********\n",
+                    required_exprs
+                );
                 let (window_child, _) =
                     add_projection_on_top_if_helpful(window_child, required_exprs)?;
+                println!(
+                    "\n ************** \nwindow child is {:?} \n ************ \n",
+                    window_child
+                );
                 Window::try_new(new_window_expr, Arc::new(window_child))
                     .map(|window| Some(LogicalPlan::Window(window)))
             };
@@ -422,6 +443,7 @@ where
 /// - `Ok(None)`: Signals that merge is not beneficial (and has not taken place).
 /// - `Err(error)`: An error occured during the function call.
 fn merge_consecutive_projections(proj: &Projection) -> Result<Option<Projection>> {
+    println!("******** \n proj is {:?} \n ********** \n", proj);
     let LogicalPlan::Projection(prev_projection) = proj.input.as_ref() else {
         return Ok(None);
     };
@@ -662,6 +684,10 @@ fn indices_referred_by_exprs<'a>(
     input_schema: &DFSchemaRef,
     exprs: impl Iterator<Item = &'a Expr>,
 ) -> Result<Vec<usize>> {
+    println!(
+        "************ \n current schema is {:?} \n *********** \n",
+        input_schema
+    );
     let indices = exprs
         .map(|expr| indices_referred_by_expr(input_schema, expr))
         .collect::<Result<Vec<_>>>()?;
@@ -690,13 +716,50 @@ fn indices_referred_by_expr(
     input_schema: &DFSchemaRef,
     expr: &Expr,
 ) -> Result<Vec<usize>> {
-    let mut cols = expr.to_columns()?;
     // Get outer-referenced (subquery) columns:
+    // TODO: Support more Expressions
+    let mut cols = expr.to_columns()?;
     outer_columns(expr, &mut cols);
-    Ok(cols
+    let mut res_vec: Vec<usize> = cols
         .iter()
         .flat_map(|col| input_schema.index_of_column(col))
-        .collect())
+        .collect();
+    match expr {
+        Expr::BinaryExpr(_) => {
+            if let Some(index) =
+                input_schema.index_of_column_by_name(None, &expr.to_string())?
+            {
+                match res_vec.binary_search(&index) {
+                    Ok(_) => {}
+                    Err(pos) => {
+                        res_vec.insert(pos, index);
+                    }
+                }
+            }
+        }
+        _ => {}
+    }
+    Ok(res_vec)
+    // match expr {
+    //     Expr::BinaryExpr(_) => {
+    //         if let Some(index) =
+    //             input_schema.index_of_column_by_name(None, &expr.to_string())?
+    //         {
+    //             return Ok(vec![index]);
+    //         } else {
+    //             return Ok(vec![]);
+    //         }
+    //     }
+    //     _ => {
+    //         let mut cols = expr.to_columns()?;
+    //         outer_columns(expr, &mut cols);
+    //         let res_vec: Vec<usize> = cols
+    //             .iter()
+    //             .flat_map(|col| input_schema.index_of_column(col))
+    //             .collect();
+    //         Ok(res_vec)
+    //     }
+    // }
 }
 
 /// Gets all required indices for the input; i.e. those required by the parent
@@ -733,6 +796,7 @@ fn get_all_required_indices<'a>(
 ///
 /// A vector of expressions corresponding to specified indices.
 fn get_at_indices(exprs: &[Expr], indices: &[usize]) -> Vec<Expr> {
+    println!("************* \n expr is {:?} \n ************ \n", exprs);
     indices
         .iter()
         // Indices may point to further places than `exprs` len.
@@ -861,17 +925,36 @@ fn rewrite_projection_given_requirements(
     config: &dyn OptimizerConfig,
     indices: &[usize],
 ) -> Result<Option<LogicalPlan>> {
+    println!("************ \n proj is {:?} \n *********** \n", proj);
     let exprs_used = get_at_indices(&proj.expr, indices);
+    println!(
+        "************ \n exprs_used is {:?} \n *********** \n",
+        exprs_used
+    );
     let required_indices =
         indices_referred_by_exprs(proj.input.schema(), exprs_used.iter())?;
+    println!(
+        "************ \n required_indices is {:?} \n *********** \n",
+        required_indices
+    );
     return if let Some(input) =
         optimize_projections(&proj.input, config, &required_indices)?
     {
         if is_projection_unnecessary(&input, &exprs_used)? {
             Ok(Some(input))
         } else {
-            Projection::try_new(exprs_used, Arc::new(input))
-                .map(|proj| Some(LogicalPlan::Projection(proj)))
+            println!(
+                "************ \n exprs_used is {:?} \n *********** \n",
+                exprs_used
+            );
+            println!("************ \n input is {:?} \n *********** \n", input);
+            let res = Projection::try_new(exprs_used, Arc::new(input))
+                .map(|proj| Some(LogicalPlan::Projection(proj)));
+            println!(
+                "************ \n new expressions is {:?} \n *********** \n",
+                res
+            );
+            res
         }
     } else if exprs_used.len() < proj.expr.len() {
         // Projection expression used is different than the existing projection.
@@ -893,6 +976,11 @@ fn rewrite_projection_given_requirements(
 /// - input schema of the projection, output schema of the projection are same, and
 /// - all projection expressions are either Column or Literal
 fn is_projection_unnecessary(input: &LogicalPlan, proj_exprs: &[Expr]) -> Result<bool> {
+    println!(
+        "*************** \n projections schema is {:?} \n input schema is {:?} \n ********** \n",
+        &projection_schema(input, proj_exprs)?,
+        input.schema()
+    );
     Ok(&projection_schema(input, proj_exprs)? == input.schema()
         && proj_exprs.iter().all(is_expr_trivial))
 }
diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
index 1bfd60a8ce1aa..7cabfa8661dc8 100644
--- a/datafusion/sql/src/select.rs
+++ b/datafusion/sql/src/select.rs
@@ -567,6 +567,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
     /// Wrap a plan in a projection
     fn project(&self, input: LogicalPlan, expr: Vec<Expr>) -> Result<LogicalPlan> {
         self.validate_schema_satisfies_exprs(input.schema(), &expr)?;
+        println!("*********** \n input is {:?} \n *********** \n", input);
+        println!("*********** \n expr is {:?} \n *********** \n", expr);
         LogicalPlanBuilder::from(input).project(expr)?.build()
     }
 
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
new file mode 100644
index 0000000000000..e778d3d0cebbe
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## Prepare Statement Tests
+##########
+
+statement ok
+CREATE EXTERNAL TABLE t (
+c1 INT NOT NULL,
+c2 INT NOT NULL,
+c3 INT NOT NULL,
+c4 INT NOT NULL,
+c5 INT NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '/Users/xiangyanxin/personal/DATAFUSION/arrow-datafusion/a.csv';
+
+query TT
+explain SELECT c3+c4, SUM(c3+c4) OVER()
+FROM t;
+----
+logical_plan
+Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+--WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+----Projection: t.c3 + t.c4 AS t.c3 + t.c4, t.c3, t.c4
+------TableScan: t projection=[c3, c4]
+
+query TT
+explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
+FROM t;
+----
+logical_plan
+Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+--WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4
+------TableScan: t projection=[c3, c4]

From 3e9683996993b3f1fe9be13e7d5e0cc460e68283 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Thu, 21 Mar 2024 21:23:47 -0500
Subject: [PATCH 04/27] optimize code

---
 datafusion/core/src/dataframe/mod.rs          |   1 +
 .../optimizer/src/common_subexpr_eliminate.rs | 215 +++---------------
 .../optimizer/src/optimize_projections.rs     |  58 +----
 3 files changed, 36 insertions(+), 238 deletions(-)

diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index eea5fc1127ceb..2c58b54fa6bd9 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -2013,6 +2013,7 @@ mod tests {
             .await?
             .select_columns(&["c1", "c3"])?;
         let left_rows = left.clone().collect().await?;
+        println!("hehehehehehehhehehehhehe");
         let right_rows = right.clone().collect().await?;
         let join = left.join(right, JoinType::Inner, &["c1"], &["c1"], None)?;
         let join_rows = join.collect().await?;
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 11121f4bd4db7..dd78ab5d0b593 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -19,25 +19,22 @@
 
 use crate::utils::is_volatile_expression;
 use crate::{utils, OptimizerConfig, OptimizerRule};
-use datafusion_expr::Expr;
+use datafusion_expr::{Expr, Operator};
 use std::collections::{BTreeSet, HashMap};
 use std::ops::Deref;
-use std::slice::Windows;
+
 use std::sync::Arc;
 
-use arrow::array;
-use arrow::datatypes::{DataType, SchemaRef};
+use arrow::datatypes::DataType;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
     TreeNodeVisitor,
 };
 use datafusion_common::{
-    internal_datafusion_err, internal_err, plan_err, Column, DFField, DFSchema,
-    DFSchemaRef, DataFusionError, Result,
+    internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result,
 };
 use datafusion_expr::expr::{Alias, WindowFunction};
 use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window};
-use datafusion_expr::tree_node::plan;
 use datafusion_expr::{col, BinaryExpr, Cast, ExprSchemable};
 use hashbrown::HashSet;
 use regex_syntax::ast::print;
@@ -127,24 +124,14 @@ impl CommonSubexprEliminate {
             .iter()
             .zip(arrays_list.iter())
             .map(|(exprs, arrays)| {
-                println!("************ \n expr is {:?} \n ************ \n", exprs);
                 let res = exprs
                     .iter()
                     .cloned()
                     .zip(arrays.iter())
                     .map(|(expr, id_array)| {
-                        println!(
-                            "************ \n current expr is {:?} \n ************ \n",
-                            expr
-                        );
-                        println!(
-                            "************ \n current id_array is {:?} \n ************ \n",
-                            id_array
-                        );
                         replace_common_expr(expr, id_array, expr_set, affected_id)
                     })
                     .collect::<Result<Vec<_>>>();
-                println!("************ \n expr is {:?} \n ************ \n", res);
                 res
             })
             .collect::<Result<Vec<_>>>()
@@ -158,18 +145,6 @@ impl CommonSubexprEliminate {
         expr_set: &ExprSet,
         config: &dyn OptimizerConfig,
     ) -> Result<(Vec<Vec<Expr>>, LogicalPlan)> {
-        println!(
-            "************ \n current plan is {:?} \n *********** \n",
-            input
-        );
-        println!(
-            "************ \n exprs list are {:?} \n *********** \n",
-            exprs_list
-        );
-        println!(
-            "************ \n array list are {:?} \n *********** \n",
-            arrays_list
-        );
         let mut affected_id = BTreeSet::<Identifier>::new();
 
         let rewrite_exprs =
@@ -181,14 +156,6 @@ impl CommonSubexprEliminate {
         if !affected_id.is_empty() {
             new_input = build_common_expr_project_plan(new_input, affected_id, expr_set)?;
         }
-        println!(
-            "\n *********** \n rewrite expressions are {:?} \n *********** \n",
-            rewrite_exprs
-        );
-        println!(
-            "\n *********** \n new input is {:?} \n *********** \n",
-            new_input
-        );
         Ok((rewrite_exprs, new_input))
     }
 
@@ -224,10 +191,6 @@ impl CommonSubexprEliminate {
             window_exprs.push(window_expr);
             arrays_per_window.push(arrays);
         }
-        println!(
-            "\n *********** \n window exprs are {:?} \n *********** \n",
-            window_exprs
-        );
         let mut window_exprs = window_exprs
             .iter()
             .map(|expr| expr.as_slice())
@@ -236,10 +199,6 @@ impl CommonSubexprEliminate {
             .iter()
             .map(|arrays| arrays.as_slice())
             .collect::<Vec<_>>();
-        println!(
-            "\n *********** \n arrays per windows are  {:?} \n *********** \n",
-            arrays_per_window
-        );
         assert_eq!(window_exprs.len(), arrays_per_window.len());
         let (mut new_expr, new_input) = self.rewrite_expr(
             &window_exprs,
@@ -248,48 +207,25 @@ impl CommonSubexprEliminate {
             &expr_set,
             config,
         )?;
-        println!(
-            "\n *********** \n new exprs are  {:?} \n *********** \n",
-            new_expr
-        );
-        println!(
-            "\n *********** \n new input are  {:?} \n *********** \n",
-            new_input
-        );
         assert_eq!(window_exprs.len(), new_expr.len());
-        println!(
-            "\n *********** \n before plan are  {:?} \n *********** \n",
-            plan
-        );
         // Construct consecutive window operator, with their corresponding new window expressions.
         plan = new_input;
         while let Some(new_window_expr) = new_expr.pop() {
             // Since `new_expr` and `window_exprs` length are same. We can safely `.unwrap` here.
             let orig_window_expr = window_exprs.pop().unwrap();
             assert_eq!(new_window_expr.len(), orig_window_expr.len());
-            println!(
-                "************* \n origin window expressions {:?} \n ************** \n ",
-                orig_window_expr
-            );
             // Rename new re-written window expressions with original name (by giving alias)
             // Otherwise we may receive schema error, in subsequent operators.
-            println!("************* \n before changed window expressions {:?} \n ************** \n ", new_window_expr);
             let new_window_expr = new_window_expr
                 .into_iter()
                 .zip(orig_window_expr.iter())
                 .map(|(new_window_expr, window_expr)| {
-                    println!("new_window_expr is {:?}", new_window_expr);
                     let original_name = window_expr.name_for_alias()?;
                     new_window_expr.alias_if_changed(original_name)
                 })
                 .collect::<Result<Vec<_>>>()?;
-            println!("************* \n after changed window expressions {:?} \n ************** \n ", new_window_expr);
             plan = LogicalPlan::Window(Window::try_new(new_window_expr, Arc::new(plan))?);
         }
-        println!(
-            "\n *********** \n final plan are  {:?} \n *********** \n",
-            plan
-        );
         Ok(plan)
     }
 
@@ -416,8 +352,6 @@ impl CommonSubexprEliminate {
         config: &dyn OptimizerConfig,
     ) -> Result<LogicalPlan> {
         let expr = plan.expressions();
-        println!("********** \n expressions are {:?} \n ********* \n", expr);
-        println!("********** \n ur plan is {:?} \n ********* \n", plan);
         let inputs = plan.inputs();
         let input = inputs[0];
         let input_schema = Arc::clone(input.schema());
@@ -425,16 +359,9 @@ impl CommonSubexprEliminate {
 
         // Visit expr list and build expr identifier to occuring count map (`expr_set`).
         let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?;
-        println!("********* \n arrays {:?} \n ********* \n", arrays);
-        println!("********* \n expr_set {:?} \n ********* \n", expr_set);
         let (mut new_expr, new_input) =
             self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?;
-        println!(
-            "********* \n new expr is {:?} \n and new input is {:?} ********* \n",
-            new_expr, new_input
-        );
         let res_plan = plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]);
-        println!("********* \n res plan is {:?} \n ********* \n", res_plan);
         res_plan
     }
 }
@@ -482,21 +409,9 @@ impl CommonSubexprEliminate {
         };
 
         let original_schema = plan.schema().clone();
-        println!(
-            "***************** \n original schema is {:?} \n ***************",
-            original_schema
-        );
         match optimized_plan {
             Some(LogicalPlan::Projection(_)) => Ok(optimized_plan),
             Some(optimized_plan) if optimized_plan.schema() != &original_schema => {
-                println!(
-                    "************** \n optimized schema {:?} \n ************",
-                    optimized_plan.schema(),
-                );
-                println!(
-                    "************** \n current plan is {:?} \n ************",
-                    optimized_plan,
-                );
                 Ok(Some(build_recover_project_plan(
                     &original_schema,
                     optimized_plan,
@@ -516,7 +431,6 @@ impl CommonSubexprEliminate {
                 depth: 0,
             })
             .map(|transformed| Some(transformed.data));
-        println!("************** \n result plan is {:?} *********** \n", res);
         res
     }
 }
@@ -534,7 +448,6 @@ impl OptimizerRule for CommonSubexprEliminate {
             _ => plan.clone(),
         };
         let res = self.add_extra_projection(&plan);
-        println!("res is {:?}", res);
         res
     }
 
@@ -568,12 +481,10 @@ fn to_arrays(
     expr_set: &mut ExprSet,
     expr_mask: ExprMask,
 ) -> Result<Vec<Vec<(usize, String)>>> {
-    println!("schema is {:?}", input_schema);
     let res = expr
         .iter()
         .map(|e| {
             let mut id_array = vec![];
-            println!("current expression is {:?}", e);
             expr_to_identifier(
                 e,
                 expr_set,
@@ -585,7 +496,6 @@ fn to_arrays(
             Ok(id_array)
         })
         .collect::<Result<Vec<_>>>();
-    println!("res is {:?}", res);
     res
 }
 
@@ -602,11 +512,6 @@ fn build_common_expr_project_plan(
         match expr_set.get(&id) {
             Some((expr, _, data_type)) => {
                 // todo: check `nullable`
-                println!(
-                    "\n ********** expr is {:?} \n \n and data type is {:?} \n **************** \n",
-                    expr, data_type
-                );
-                println!("********* \n {:?} ***********\n", expr);
                 let field = DFField::new_unqualified(&id, data_type.clone(), true);
                 fields_set.insert(field.name().to_owned());
                 project_exprs.push(expr.clone().alias(&id));
@@ -622,11 +527,6 @@ fn build_common_expr_project_plan(
             project_exprs.push(Expr::Column(field.qualified_column()));
         }
     }
-    println!(
-        "************** \n exprs are {:?} \n ************** \n",
-        project_exprs
-    );
-    println!("********** \n input are {:?} \n ************** \n", input);
     Ok(LogicalPlan::Projection(Projection::try_new(
         project_exprs,
         Arc::new(input),
@@ -644,19 +544,8 @@ fn build_recover_project_plan(
     let col_exprs = schema
         .fields()
         .iter()
-        .map(|field| {
-            println!("field is {:?}", field);
-            Expr::Column(field.qualified_column())
-        })
+        .map(|field| Expr::Column(field.qualified_column()))
         .collect();
-    println!(
-        "************** \n col_exprs are {:?} \n **************",
-        col_exprs
-    );
-    println!(
-        "************** \n current plan is  {:?} \n **************",
-        input
-    );
     Ok(LogicalPlan::Projection(Projection::try_new(
         col_exprs,
         Arc::new(input),
@@ -813,7 +702,6 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> {
     }
 
     fn f_up(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> {
-        println!("********* \n expr is {:?} \n *********", expr);
         self.series_number += 1;
 
         let (idx, sub_expr_identifier) = self.pop_enter_mark();
@@ -833,7 +721,6 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> {
         self.visit_stack.push(VisitRecord::ExprItem(desc.clone()));
 
         let data_type = expr.get_type(&self.input_schema)?;
-        print!("data type is {:?}", data_type);
         self.expr_set
             .entry(desc)
             .or_insert_with(|| (expr.clone(), 0, data_type))
@@ -887,10 +774,6 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_> {
         // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate
         // the `id_array`, which records the expr's identifier used to rewrite expr. So if we
         // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too.
-        println!(
-            "****************** \n expr is {:?} \n ****************** \n ",
-            expr
-        );
         if expr.short_circuits() || is_volatile_expression(&expr)? {
             return Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump));
         }
@@ -963,11 +846,6 @@ fn replace_common_expr(
     expr_set: &ExprSet,
     affected_id: &mut BTreeSet<Identifier>,
 ) -> Result<Expr> {
-    println!("************ \n expr are {:?} \n ************", expr);
-    println!(
-        "************ \n id_array are {:?} \n ************",
-        id_array
-    );
     let rewrited = expr
         .rewrite(&mut CommonSubexprRewriter {
             expr_set,
@@ -977,10 +855,6 @@ fn replace_common_expr(
             curr_index: 0,
         })
         .data();
-    println!(
-        "************ \n rewrited values are {:?} \n ************",
-        rewrited
-    );
     rewrited
 }
 
@@ -989,7 +863,12 @@ struct ProjectionAdder {
     depth: u128,
     data_type_map: HashMap<Expr, DataType>,
 }
-
+pub fn is_not_complex(op: &Operator) -> bool {
+    matches!(
+        op,
+        &Operator::Eq | &Operator::NotEq | &Operator::And | &Operator::Lt | &Operator::Gt
+    )
+}
 impl ProjectionAdder {
     // TODO: adding more expressions for sub query, currently only support for Simple Binary Expressions
     fn get_complex_expressions(
@@ -998,35 +877,32 @@ impl ProjectionAdder {
     ) -> (HashSet<Expr>, HashMap<Expr, DataType>) {
         let mut res = HashSet::new();
         let mut expr_data_type: HashMap<Expr, DataType> = HashMap::new();
-        println!(
-            "********** \n current schema is {:?} \n ******** \n",
-            schema
-        );
         for expr in exprs {
-            println!("current expr is {:?}", expr);
             match expr {
                 Expr::BinaryExpr(BinaryExpr {
                     left: ref l_box,
-                    op: _,
+                    op,
                     right: ref r_box,
-                }) => match (&**l_box, &**r_box) {
-                    (Expr::Column(l), Expr::Column(r)) => {
-                        let l_field = schema
-                            .field_from_column(l)
-                            .expect("Field not found for left column");
-
-                        // res.insert(DFField::new_unqualified(
-                        //     &expr.to_string(),
-                        //     l_field.data_type().clone(),
-                        //     true,
-                        // ));
-                        expr_data_type
-                            .entry(expr.clone())
-                            .or_insert(l_field.data_type().clone());
-                        res.insert(expr.clone());
+                }) if !is_not_complex(&op) => {
+                    match (&**l_box, &**r_box) {
+                        (Expr::Column(l), Expr::Column(_r)) => {
+                            let l_field = schema
+                                .field_from_column(l)
+                                .expect("Field not found for left column");
+
+                            // res.insert(DFField::new_unqualified(
+                            //     &expr.to_string(),
+                            //     l_field.data_type().clone(),
+                            //     true,
+                            // ));
+                            expr_data_type
+                                .entry(expr.clone())
+                                .or_insert(l_field.data_type().clone());
+                            res.insert(expr.clone());
+                        }
+                        _ => {}
                     }
-                    _ => {}
-                },
+                }
                 Expr::Cast(Cast { expr, data_type }) => {
                     let (expr_set, type_map) =
                         Self::get_complex_expressions(vec![*expr], schema.clone());
@@ -1055,27 +931,18 @@ impl TreeNodeRewriter for ProjectionAdder {
         self.depth += 1;
         // extract all expressions + check whether it contains in depth_sets
         let exprs = node.expressions();
-        println!("********* \n cur plan is  {:?} \n *********** \n", node);
-        println!("********* \n exprs are {:?} \n *********** \n", exprs);
         let depth_set = self.depth_map.entry(self.depth).or_default();
-        println!("self.input schema is {:?}", node.schema());
         let mut schema = node.schema().deref().clone();
         for ip in node.inputs() {
             schema.merge(ip.schema());
         }
         let (extended_set, data_map) =
             Self::get_complex_expressions(exprs, Arc::new(schema));
-        println!(
-            "*********** \n extened set is {:?} \n ********** \n",
-            extended_set
-        );
         depth_set.extend(extended_set);
         self.data_type_map.extend(data_map);
         Ok(Transformed::no(node))
     }
     fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
-        println!("*********** \n cur plan is {:?} \n*************\n", node);
-
         let current_depth_schema =
             self.depth_map.get(&self.depth).cloned().unwrap_or_default();
 
@@ -1087,26 +954,11 @@ impl TreeNodeRewriter for ProjectionAdder {
             .fold(current_depth_schema, |acc, (_, expr)| {
                 acc.intersection(expr).cloned().collect()
             });
-        println!(
-            "******** \n intersected parts are {:?} \n ***********",
-            added_expr
-        );
-        println!(
-            "******** \n depth map is {:?} \n ***********",
-            self.depth_map
-        );
-        println!(
-            "************ \n data type map is {:?} \n *********** \n",
-            self.data_type_map
-        );
         self.depth -= 1;
         // do not do extra things
         if added_expr.is_empty() {
             return Ok(Transformed::no(node));
         }
-
-        println!("\n*************\n{:?}\n*************\n", added_expr);
-
         match node {
             // do not add for Projections
             LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) => {
@@ -1138,16 +990,11 @@ impl TreeNodeRewriter for ProjectionAdder {
                         project_exprs.push(Expr::Column(field.qualified_column()));
                     }
                 }
-                println!(
-                    "************* \n project expressions are {:?} \n ********** \n",
-                    project_exprs
-                );
                 // adding new plan here
                 let new_plan = LogicalPlan::Projection(Projection::try_new(
                     project_exprs,
                     Arc::new(node.inputs()[0].clone()),
                 )?);
-                println!("new plan is {:?}", new_plan);
                 Ok(Transformed::yes(
                     node.with_new_exprs(node.expressions(), [new_plan].to_vec())?,
                 ))
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index ce8bcbc132d63..64e5075b4ca03 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -26,15 +26,16 @@
 use std::collections::HashSet;
 use std::sync::Arc;
 
+use crate::common_subexpr_eliminate::is_not_complex;
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 
 use arrow::datatypes::SchemaRef;
 use datafusion_common::{
-    get_required_group_by_exprs_indices, Column, DFField, DFSchema, DFSchemaRef,
-    JoinType, Result,
+    get_required_group_by_exprs_indices, Column, DFSchemaRef, JoinType, Result,
 };
 use datafusion_expr::expr::{Alias, ScalarFunction};
+
 use datafusion_expr::{
     logical_plan::LogicalPlan, projection_schema, Aggregate, BinaryExpr, Cast, Distinct,
     Expr, Projection, TableScan, Window,
@@ -71,10 +72,6 @@ impl OptimizerRule for OptimizeProjections {
     ) -> Result<Option<LogicalPlan>> {
         // All output fields are necessary:
         let indices = (0..plan.schema().fields().len()).collect::<Vec<_>>();
-        println!(
-            "\n ************ \n plan.schema is {:?} \n ************ \n",
-            plan.schema()
-        );
         optimize_projections(plan, config, &indices)
     }
 
@@ -111,8 +108,6 @@ fn optimize_projections(
     config: &dyn OptimizerConfig,
     indices: &[usize],
 ) -> Result<Option<LogicalPlan>> {
-    println!("**********\n cur plan is {:?} \n ********* \n", plan);
-    println!("**********\n indices is {:?} \n ********* \n", indices);
     // `child_required_indices` stores
     // - indices of the columns required for each child
     // - a flag indicating whether putting a projection above children is beneficial for the parent.
@@ -284,10 +279,6 @@ fn optimize_projections(
             // Only use window expressions that are absolutely necessary according
             // to parent requirements:
             let new_window_expr = get_at_indices(&window.window_expr, &window_reqs);
-            println!(
-                "************ \n new_window_expr is {:?} \n *********** \n",
-                new_window_expr
-            );
             // Get all the required column indices at the input, either by the
             // parent or window expression requirements.
             let required_indices = get_all_required_indices(
@@ -302,10 +293,6 @@ fn optimize_projections(
             } else {
                 window.input.as_ref().clone()
             };
-            println!(
-                "************ \n window child is {:?} *********** \n",
-                window_child
-            );
             return if new_window_expr.is_empty() {
                 // When no window expression is necessary, use the input directly:
                 Ok(Some(window_child))
@@ -315,16 +302,8 @@ fn optimize_projections(
                 // refers to `old_child`.
                 let required_exprs =
                     get_required_exprs(window.input.schema(), &required_indices);
-                println!(
-                    "\n ************** \n required exprs {:?} \n ***********\n",
-                    required_exprs
-                );
                 let (window_child, _) =
                     add_projection_on_top_if_helpful(window_child, required_exprs)?;
-                println!(
-                    "\n ************** \nwindow child is {:?} \n ************ \n",
-                    window_child
-                );
                 Window::try_new(new_window_expr, Arc::new(window_child))
                     .map(|window| Some(LogicalPlan::Window(window)))
             };
@@ -443,7 +422,6 @@ where
 /// - `Ok(None)`: Signals that merge is not beneficial (and has not taken place).
 /// - `Err(error)`: An error occured during the function call.
 fn merge_consecutive_projections(proj: &Projection) -> Result<Option<Projection>> {
-    println!("******** \n proj is {:?} \n ********** \n", proj);
     let LogicalPlan::Projection(prev_projection) = proj.input.as_ref() else {
         return Ok(None);
     };
@@ -684,10 +662,6 @@ fn indices_referred_by_exprs<'a>(
     input_schema: &DFSchemaRef,
     exprs: impl Iterator<Item = &'a Expr>,
 ) -> Result<Vec<usize>> {
-    println!(
-        "************ \n current schema is {:?} \n *********** \n",
-        input_schema
-    );
     let indices = exprs
         .map(|expr| indices_referred_by_expr(input_schema, expr))
         .collect::<Result<Vec<_>>>()?;
@@ -725,7 +699,7 @@ fn indices_referred_by_expr(
         .flat_map(|col| input_schema.index_of_column(col))
         .collect();
     match expr {
-        Expr::BinaryExpr(_) => {
+        Expr::BinaryExpr(BinaryExpr { op, .. }) if is_not_complex(op) => {
             if let Some(index) =
                 input_schema.index_of_column_by_name(None, &expr.to_string())?
             {
@@ -796,7 +770,6 @@ fn get_all_required_indices<'a>(
 ///
 /// A vector of expressions corresponding to specified indices.
 fn get_at_indices(exprs: &[Expr], indices: &[usize]) -> Vec<Expr> {
-    println!("************* \n expr is {:?} \n ************ \n", exprs);
     indices
         .iter()
         // Indices may point to further places than `exprs` len.
@@ -925,35 +898,17 @@ fn rewrite_projection_given_requirements(
     config: &dyn OptimizerConfig,
     indices: &[usize],
 ) -> Result<Option<LogicalPlan>> {
-    println!("************ \n proj is {:?} \n *********** \n", proj);
     let exprs_used = get_at_indices(&proj.expr, indices);
-    println!(
-        "************ \n exprs_used is {:?} \n *********** \n",
-        exprs_used
-    );
     let required_indices =
         indices_referred_by_exprs(proj.input.schema(), exprs_used.iter())?;
-    println!(
-        "************ \n required_indices is {:?} \n *********** \n",
-        required_indices
-    );
     return if let Some(input) =
         optimize_projections(&proj.input, config, &required_indices)?
     {
         if is_projection_unnecessary(&input, &exprs_used)? {
             Ok(Some(input))
         } else {
-            println!(
-                "************ \n exprs_used is {:?} \n *********** \n",
-                exprs_used
-            );
-            println!("************ \n input is {:?} \n *********** \n", input);
             let res = Projection::try_new(exprs_used, Arc::new(input))
                 .map(|proj| Some(LogicalPlan::Projection(proj)));
-            println!(
-                "************ \n new expressions is {:?} \n *********** \n",
-                res
-            );
             res
         }
     } else if exprs_used.len() < proj.expr.len() {
@@ -976,11 +931,6 @@ fn rewrite_projection_given_requirements(
 /// - input schema of the projection, output schema of the projection are same, and
 /// - all projection expressions are either Column or Literal
 fn is_projection_unnecessary(input: &LogicalPlan, proj_exprs: &[Expr]) -> Result<bool> {
-    println!(
-        "*************** \n projections schema is {:?} \n input schema is {:?} \n ********** \n",
-        &projection_schema(input, proj_exprs)?,
-        input.schema()
-    );
     Ok(&projection_schema(input, proj_exprs)? == input.schema()
         && proj_exprs.iter().all(is_expr_trivial))
 }

From 71b5bd7fb384ade491b45dffb10ec7095d22b44c Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Thu, 21 Mar 2024 21:25:00 -0500
Subject: [PATCH 05/27] remove useless print

---
 datafusion/core/src/dataframe/mod.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index 2c58b54fa6bd9..eea5fc1127ceb 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -2013,7 +2013,6 @@ mod tests {
             .await?
             .select_columns(&["c1", "c3"])?;
         let left_rows = left.clone().collect().await?;
-        println!("hehehehehehehhehehehhehe");
         let right_rows = right.clone().collect().await?;
         let join = left.join(right, JoinType::Inner, &["c1"], &["c1"], None)?;
         let join_rows = join.collect().await?;

From 30d29a0ec92f982ace1a47bfa5038e0f2c2cc00b Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Thu, 21 Mar 2024 21:25:38 -0500
Subject: [PATCH 06/27] remove print

---
 datafusion/sql/src/select.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
index 7cabfa8661dc8..1bfd60a8ce1aa 100644
--- a/datafusion/sql/src/select.rs
+++ b/datafusion/sql/src/select.rs
@@ -567,8 +567,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
     /// Wrap a plan in a projection
     fn project(&self, input: LogicalPlan, expr: Vec<Expr>) -> Result<LogicalPlan> {
         self.validate_schema_satisfies_exprs(input.schema(), &expr)?;
-        println!("*********** \n input is {:?} \n *********** \n", input);
-        println!("*********** \n expr is {:?} \n *********** \n", expr);
         LogicalPlanBuilder::from(input).project(expr)?.build()
     }
 

From 81523d184b13038b649a2478ed838cc6d85867ae Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Thu, 21 Mar 2024 21:32:11 -0500
Subject: [PATCH 07/27] optimize code

---
 datafusion/optimizer/src/common_subexpr_eliminate.rs | 9 ++++-----
 datafusion/optimizer/src/optimize_projections.rs     | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index dd78ab5d0b593..b2a96e4515296 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -37,7 +37,6 @@ use datafusion_expr::expr::{Alias, WindowFunction};
 use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window};
 use datafusion_expr::{col, BinaryExpr, Cast, ExprSchemable};
 use hashbrown::HashSet;
-use regex_syntax::ast::print;
 
 /// A map from expression's identifier to tuple including
 /// - the expression itself (cloned)
@@ -903,14 +902,14 @@ impl ProjectionAdder {
                         _ => {}
                     }
                 }
-                Expr::Cast(Cast { expr, data_type }) => {
-                    let (expr_set, type_map) =
+                Expr::Cast(Cast { expr, data_type: _ }) => {
+                    let (expr_set, type_data_map) =
                         Self::get_complex_expressions(vec![*expr], schema.clone());
                     res.extend(expr_set);
-                    expr_data_type.extend(type_map);
+                    expr_data_type.extend(type_data_map);
                 }
 
-                Expr::WindowFunction(WindowFunction { fun, args, .. }) => {
+                Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
                     let (expr_set, type_map) =
                         Self::get_complex_expressions(args, schema.clone());
                     res.extend(expr_set);
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 64e5075b4ca03..3e77ff09f4d4a 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -32,7 +32,7 @@ use crate::{OptimizerConfig, OptimizerRule};
 
 use arrow::datatypes::SchemaRef;
 use datafusion_common::{
-    get_required_group_by_exprs_indices, Column, DFSchemaRef, JoinType, Result,
+    get_required_group_by_exprs_indices, Column, DFSchema, DFSchemaRef, JoinType, Result,
 };
 use datafusion_expr::expr::{Alias, ScalarFunction};
 

From 1c4010be3aec3145b2456308e880a077c296e125 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Fri, 22 Mar 2024 09:48:39 -0500
Subject: [PATCH 08/27] optimize code

---
 .../optimizer/src/common_subexpr_eliminate.rs   | 17 +++++++----------
 .../optimizer/src/optimize_projections.rs       |  7 +++----
 .../test_files/project_complex_sub_query.slt    | 10 ++++++++++
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index b2a96e4515296..f86a5f14730e3 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -360,8 +360,7 @@ impl CommonSubexprEliminate {
         let arrays = to_arrays(&expr, input_schema, &mut expr_set, ExprMask::Normal)?;
         let (mut new_expr, new_input) =
             self.rewrite_expr(&[&expr], &[&arrays], input, &expr_set, config)?;
-        let res_plan = plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input]);
-        res_plan
+        plan.with_new_exprs(pop_expr(&mut new_expr)?, vec![new_input])
     }
 }
 impl CommonSubexprEliminate {
@@ -422,15 +421,13 @@ impl CommonSubexprEliminate {
     /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the
     ///
     fn add_extra_projection(&self, plan: &LogicalPlan) -> Result<Option<LogicalPlan>> {
-        let res = plan
-            .clone()
+        plan.clone()
             .rewrite(&mut ProjectionAdder {
                 data_type_map: HashMap::new(),
                 depth_map: HashMap::new(),
                 depth: 0,
             })
-            .map(|transformed| Some(transformed.data));
-        res
+            .map(|transformed| Some(transformed.data))
     }
 }
 impl OptimizerRule for CommonSubexprEliminate {
@@ -865,7 +862,7 @@ struct ProjectionAdder {
 pub fn is_not_complex(op: &Operator) -> bool {
     matches!(
         op,
-        &Operator::Eq | &Operator::NotEq | &Operator::And | &Operator::Lt | &Operator::Gt
+        &Operator::Eq | &Operator::NotEq | &Operator::Lt | &Operator::Gt | &Operator::And
     )
 }
 impl ProjectionAdder {
@@ -960,9 +957,9 @@ impl TreeNodeRewriter for ProjectionAdder {
         }
         match node {
             // do not add for Projections
-            LogicalPlan::Projection(_) | LogicalPlan::TableScan(_) => {
-                Ok(Transformed::no(node))
-            }
+            LogicalPlan::Projection(_)
+            | LogicalPlan::TableScan(_)
+            | LogicalPlan::Join(_) => Ok(Transformed::no(node)),
             _ => {
                 // avoid recursive add projections
                 if added_expr.iter().any(|expr| {
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 3e77ff09f4d4a..e46c937b3f2ef 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -699,7 +699,7 @@ fn indices_referred_by_expr(
         .flat_map(|col| input_schema.index_of_column(col))
         .collect();
     match expr {
-        Expr::BinaryExpr(BinaryExpr { op, .. }) if is_not_complex(op) => {
+        Expr::BinaryExpr(BinaryExpr { op, .. }) if !is_not_complex(op) => {
             if let Some(index) =
                 input_schema.index_of_column_by_name(None, &expr.to_string())?
             {
@@ -907,9 +907,8 @@ fn rewrite_projection_given_requirements(
         if is_projection_unnecessary(&input, &exprs_used)? {
             Ok(Some(input))
         } else {
-            let res = Projection::try_new(exprs_used, Arc::new(input))
-                .map(|proj| Some(LogicalPlan::Projection(proj)));
-            res
+            Projection::try_new(exprs_used, Arc::new(input))
+                .map(|proj| Some(LogicalPlan::Projection(proj)))
         }
     } else if exprs_used.len() < proj.expr.len() {
         // Projection expression used is different than the existing projection.
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index e778d3d0cebbe..accc4b9abdbae 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -50,3 +50,13 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST]
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
 ----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+
+query TT
+explain SELECT c3-c4, SUM(c3-c4) OVER(order by c3-c4)
+FROM t;
+----
+logical_plan
+Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+--WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4
+------TableScan: t projection=[c3, c4]

From 4add9cb5540bbf80dd24ae2c06d8e4dfdcafa2ef Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Fri, 22 Mar 2024 09:50:32 -0500
Subject: [PATCH 09/27] adding test

---
 .../test_files/project_complex_sub_query.slt           | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index accc4b9abdbae..917d165abbb8c 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -60,3 +60,13 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST]
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
 ----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+
+query TT
+explain SELECT c3-c4, SUM(c3-c4) OVER()
+FROM t;
+----
+logical_plan
+Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+--WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4
+------TableScan: t projection=[c3, c4]

From 251c48d55cc81297e1bc4b97caf623d91cabb9f6 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Fri, 22 Mar 2024 10:17:47 -0500
Subject: [PATCH 10/27] fix clippy

---
 .../optimizer/src/common_subexpr_eliminate.rs | 52 ++++++++-----------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index f86a5f14730e3..57a3519c6b557 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -443,8 +443,7 @@ impl OptimizerRule for CommonSubexprEliminate {
             Some(plan) => plan,
             _ => plan.clone(),
         };
-        let res = self.add_extra_projection(&plan);
-        res
+        self.add_extra_projection(&plan)
     }
 
     fn name(&self) -> &str {
@@ -842,16 +841,14 @@ fn replace_common_expr(
     expr_set: &ExprSet,
     affected_id: &mut BTreeSet<Identifier>,
 ) -> Result<Expr> {
-    let rewrited = expr
-        .rewrite(&mut CommonSubexprRewriter {
-            expr_set,
-            id_array,
-            affected_id,
-            max_series_number: 0,
-            curr_index: 0,
-        })
-        .data();
-    rewrited
+    expr.rewrite(&mut CommonSubexprRewriter {
+        expr_set,
+        id_array,
+        affected_id,
+        max_series_number: 0,
+        curr_index: 0,
+    })
+    .data()
 }
 
 struct ProjectionAdder {
@@ -880,23 +877,20 @@ impl ProjectionAdder {
                     op,
                     right: ref r_box,
                 }) if !is_not_complex(&op) => {
-                    match (&**l_box, &**r_box) {
-                        (Expr::Column(l), Expr::Column(_r)) => {
-                            let l_field = schema
-                                .field_from_column(l)
-                                .expect("Field not found for left column");
-
-                            // res.insert(DFField::new_unqualified(
-                            //     &expr.to_string(),
-                            //     l_field.data_type().clone(),
-                            //     true,
-                            // ));
-                            expr_data_type
-                                .entry(expr.clone())
-                                .or_insert(l_field.data_type().clone());
-                            res.insert(expr.clone());
-                        }
-                        _ => {}
+                    if let (Expr::Column(l), Expr::Column(_r)) = (&**l_box, &**r_box) {
+                        let l_field = schema
+                            .field_from_column(l)
+                            .expect("Field not found for left column");
+
+                        // res.insert(DFField::new_unqualified(
+                        //     &expr.to_string(),
+                        //     l_field.data_type().clone(),
+                        //     true,
+                        // ));
+                        expr_data_type
+                            .entry(expr.clone())
+                            .or_insert(l_field.data_type().clone());
+                        res.insert(expr.clone());
                     }
                 }
                 Expr::Cast(Cast { expr, data_type: _ }) => {

From 117e49067a8a734d7a671de4b3197b9e9d1896c0 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Sun, 24 Mar 2024 08:03:02 -0500
Subject: [PATCH 11/27] optimize code

---
 .../optimizer/src/optimize_projections.rs     | 20 -------------------
 .../test_files/project_complex_sub_query.slt  |  2 +-
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index e46c937b3f2ef..8ad773d89e13e 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -714,26 +714,6 @@ fn indices_referred_by_expr(
         _ => {}
     }
     Ok(res_vec)
-    // match expr {
-    //     Expr::BinaryExpr(_) => {
-    //         if let Some(index) =
-    //             input_schema.index_of_column_by_name(None, &expr.to_string())?
-    //         {
-    //             return Ok(vec![index]);
-    //         } else {
-    //             return Ok(vec![]);
-    //         }
-    //     }
-    //     _ => {
-    //         let mut cols = expr.to_columns()?;
-    //         outer_columns(expr, &mut cols);
-    //         let res_vec: Vec<usize> = cols
-    //             .iter()
-    //             .flat_map(|col| input_schema.index_of_column(col))
-    //             .collect();
-    //         Ok(res_vec)
-    //     }
-    // }
 }
 
 /// Gets all required indices for the input; i.e. those required by the parent
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index 917d165abbb8c..ffcb9cc3d70c8 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -29,7 +29,7 @@ c5 INT NOT NULL
 )
 STORED AS CSV
 WITH HEADER ROW
-LOCATION '/Users/xiangyanxin/personal/DATAFUSION/arrow-datafusion/a.csv';
+LOCATION '../core/tests/data/demo.csv';
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER()

From 113e232ccf5f68a2f6535e84a384c900b2fadbb4 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Tue, 26 Mar 2024 14:48:15 -0500
Subject: [PATCH 12/27] optimize

---
 .../tests/data/project_complex_expression.csv | 18 ++++++
 .../optimizer/src/common_subexpr_eliminate.rs | 62 ++++++++++++++++---
 .../test_files/project_complex_sub_query.slt  |  2 +-
 3 files changed, 71 insertions(+), 11 deletions(-)
 create mode 100644 datafusion/core/tests/data/project_complex_expression.csv

diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv
new file mode 100644
index 0000000000000..c002d86970769
--- /dev/null
+++ b/datafusion/core/tests/data/project_complex_expression.csv
@@ -0,0 +1,18 @@
+c1,c2,c3,c4,c5
+1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+16,17,18,19,20
+21,22,23,24,25
+26,27,28,29,30
+31,32,33,34,35
+36,37,38,39,40
+41,42,43,44,45
+46,47,48,49,50
+51,52,53,54,55
+56,57,58,59,60
+61,62,63,64,65
+66,67,68,69,70
+71,72,73,74,75
+76,77,78,79,80
+81,82,83,84,85
\ No newline at end of file
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 57a3519c6b557..393bd799bdf9a 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -438,7 +438,6 @@ impl OptimizerRule for CommonSubexprEliminate {
     ) -> Result<Option<LogicalPlan>> {
         let optimized_plan_option = self.common_optimize(plan, config)?;
 
-        // println!("optimized plan option is {:?}", optimized_plan_option);
         let plan = match optimized_plan_option {
             Some(plan) => plan,
             _ => plan.clone(),
@@ -882,11 +881,6 @@ impl ProjectionAdder {
                             .field_from_column(l)
                             .expect("Field not found for left column");
 
-                        // res.insert(DFField::new_unqualified(
-                        //     &expr.to_string(),
-                        //     l_field.data_type().clone(),
-                        //     true,
-                        // ));
                         expr_data_type
                             .entry(expr.clone())
                             .or_insert(l_field.data_type().clone());
@@ -919,7 +913,7 @@ impl TreeNodeRewriter for ProjectionAdder {
     fn f_down(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
         // use depth to trace where we are in the LogicalPlan tree
         self.depth += 1;
-        // extract all expressions + check whether it contains in depth_sets
+        // extract all expressions and check whether it contains in depth_sets
         let exprs = node.expressions();
         let depth_set = self.depth_map.entry(self.depth).or_default();
         let mut schema = node.schema().deref().clone();
@@ -936,14 +930,58 @@ impl TreeNodeRewriter for ProjectionAdder {
         let current_depth_schema =
             self.depth_map.get(&self.depth).cloned().unwrap_or_default();
 
-        // get the intersected part
+        // get the intersected part looking up
         let added_expr = self
             .depth_map
             .iter()
             .filter(|(&depth, _)| depth < self.depth)
-            .fold(current_depth_schema, |acc, (_, expr)| {
+            .fold(current_depth_schema.clone(), |acc, (_, expr)| {
                 acc.intersection(expr).cloned().collect()
             });
+        // in projection, we are trying to get intersect with lower one and if some column is not used in upper one, we just abandon them
+        match node.clone() {
+            LogicalPlan::Projection(_) => {
+                // we get the expressions that has intersect with deeper layer since those expressions could be rewrite
+                let mut cross_expr_deeper = HashSet::new();
+
+                self.depth_map
+                    .iter()
+                    .filter(|(&depth, _)| depth > self.depth)
+                    .for_each(|(_, exprs)| {
+                        // get intersection
+                        let intersection = current_depth_schema
+                            .intersection(exprs)
+                            .cloned()
+                            .collect::<HashSet<_>>();
+                        cross_expr_deeper =
+                            cross_expr_deeper.union(&intersection).cloned().collect();
+                    });
+                let mut project_exprs = vec![];
+                if cross_expr_deeper.len() > 0 {
+                    let current_expressions = node.expressions();
+                    for expr in current_expressions {
+                        if cross_expr_deeper.contains(&expr) {
+                            let f = DFField::new_unqualified(
+                                &expr.to_string(),
+                                self.data_type_map[&expr].clone(),
+                                true,
+                            );
+                            project_exprs.push(Expr::Column(f.qualified_column()));
+                            cross_expr_deeper.remove(&expr);
+                        } else {
+                            project_exprs.push(expr);
+                        }
+                    }
+                    let new_projection = LogicalPlan::Projection(Projection::try_new(
+                        project_exprs,
+                        Arc::new(node.inputs()[0].clone()),
+                    )?);
+                    self.depth -= 1;
+                    return Ok(Transformed::yes(new_projection));
+                }
+            }
+            _ => {}
+        }
         self.depth -= 1;
         // do not do extra things
         if added_expr.is_empty() {
@@ -954,6 +992,7 @@ impl TreeNodeRewriter for ProjectionAdder {
             LogicalPlan::Projection(_)
             | LogicalPlan::TableScan(_)
             | LogicalPlan::Join(_) => Ok(Transformed::no(node)),
+
             _ => {
                 // avoid recursive add projections
                 if added_expr.iter().any(|expr| {
@@ -966,6 +1005,7 @@ impl TreeNodeRewriter for ProjectionAdder {
 
                 let mut field_set = HashSet::new();
                 let mut project_exprs = vec![];
+                // here we can deduce that
                 for expr in added_expr {
                     let f = DFField::new_unqualified(
                         &expr.to_string(),
@@ -1005,7 +1045,7 @@ mod test {
     };
     use datafusion_expr::{
         grouping_set, AccumulatorFactoryFunction, AggregateUDF, Signature,
-        SimpleAggregateUDF, Volatility,
+        SimpleAggregateUDF, TableScan, Volatility,
     };
 
     use crate::optimizer::OptimizerContext;
@@ -1528,4 +1568,6 @@ mod test {
         assert!(result.len() == 1);
         Ok(())
     }
+    #[test]
+    fn test_extra_projection_for_upper() -> Result<()> {}
 }
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index ffcb9cc3d70c8..c57bf88d2e4a4 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -29,7 +29,7 @@ c5 INT NOT NULL
 )
 STORED AS CSV
 WITH HEADER ROW
-LOCATION '../core/tests/data/demo.csv';
+LOCATION '../core/tests/data/project_complex_expressions.csv';
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER()

From d4ac68fb14c10df4cfb92340c0964479ef662e7a Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Tue, 26 Mar 2024 14:58:06 -0500
Subject: [PATCH 13/27] optimize

---
 .../tests/data/project_complex_expression.csv | 14 +----
 .../optimizer/src/common_subexpr_eliminate.rs |  2 -
 .../test_files/project_complex_sub_query.slt  | 54 ++++++++++++++++++-
 3 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv
index c002d86970769..c78f35a485cd0 100644
--- a/datafusion/core/tests/data/project_complex_expression.csv
+++ b/datafusion/core/tests/data/project_complex_expression.csv
@@ -3,16 +3,4 @@ c1,c2,c3,c4,c5
 6,7,8,9,10
 11,12,13,14,15
 16,17,18,19,20
-21,22,23,24,25
-26,27,28,29,30
-31,32,33,34,35
-36,37,38,39,40
-41,42,43,44,45
-46,47,48,49,50
-51,52,53,54,55
-56,57,58,59,60
-61,62,63,64,65
-66,67,68,69,70
-71,72,73,74,75
-76,77,78,79,80
-81,82,83,84,85
\ No newline at end of file
+21,22,23,24,25
\ No newline at end of file
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 393bd799bdf9a..4db43c3f3f61f 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -1568,6 +1568,4 @@ mod test {
         assert!(result.len() == 1);
         Ok(())
     }
-    #[test]
-    fn test_extra_projection_for_upper() -> Result<()> {}
 }
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index c57bf88d2e4a4..5ba1f493fa379 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -29,7 +29,7 @@ c5 INT NOT NULL
 )
 STORED AS CSV
 WITH HEADER ROW
-LOCATION '../core/tests/data/project_complex_expressions.csv';
+LOCATION '../core/tests/data/project_complex_expression.csv';
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER()
@@ -40,6 +40,13 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND U
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 ----Projection: t.c3 + t.c4 AS t.c3 + t.c4, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[t.c3 + t.c4@0 as t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
+--WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+----CoalescePartitionsExec
+------ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4, c3@0 as c3, c4@1 as c4]
+--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
@@ -50,6 +57,15 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST]
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
 ----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[c3@1 + c4@2 as t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----BoundedWindowAggExec: wdw=[SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+------SortPreservingMergeExec: [t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST]
+--------SortExec: expr=[t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST]
+----------ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4]
+------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query TT
 explain SELECT c3-c4, SUM(c3-c4) OVER(order by c3-c4)
@@ -60,6 +76,15 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST]
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
 ----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[c3@1 - c4@2 as t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----BoundedWindowAggExec: wdw=[SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+------SortPreservingMergeExec: [t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST]
+--------SortExec: expr=[t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST]
+----------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4]
+------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query TT
 explain SELECT c3-c4, SUM(c3-c4) OVER()
@@ -70,3 +95,30 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND U
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 ----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[t.c3 - t.c4@0 as t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
+--WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+----CoalescePartitionsExec
+------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4, c3@0 as c3, c4@1 as c4]
+--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
+
+query II
+SELECT c3+c4, SUM(c3+c4) OVER()
+FROM t;
+----
+7 135
+17 135
+27 135
+37 135
+47 135
+
+query II
+SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
+FROM t
+----
+7 7
+17 24
+27 51
+37 88
+47 135

From 01b23a3282718562a3ae40aad1c789a2f733c10d Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Tue, 26 Mar 2024 15:25:01 -0500
Subject: [PATCH 14/27] fix clippy

---
 .../optimizer/src/common_subexpr_eliminate.rs | 73 +++++++++----------
 1 file changed, 35 insertions(+), 38 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 4db43c3f3f61f..c3790da6162a9 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -939,48 +939,45 @@ impl TreeNodeRewriter for ProjectionAdder {
                 acc.intersection(expr).cloned().collect()
             });
         // in projection, we are trying to get intersect with lower one and if some column is not used in upper one, we just abandon them
-        match node.clone() {
-            LogicalPlan::Projection(_) => {
-                // we get the expressions that has intersect with deeper layer since those expressions could be rewrite
-                let mut cross_expr_deeper = HashSet::new();
+        if let LogicalPlan::Projection(_) = node.clone() {
+            // we get the expressions that has intersect with deeper layer since those expressions could be rewrite
+            let mut cross_expr_deeper = HashSet::new();
 
-                self.depth_map
-                    .iter()
-                    .filter(|(&depth, _)| depth > self.depth)
-                    .for_each(|(_, exprs)| {
-                        // get intersection
-                        let intersection = current_depth_schema
-                            .intersection(exprs)
-                            .cloned()
-                            .collect::<HashSet<_>>();
-                        cross_expr_deeper =
-                            cross_expr_deeper.union(&intersection).cloned().collect();
-                    });
-                let mut project_exprs = vec![];
-                if cross_expr_deeper.len() > 0 {
-                    let current_expressions = node.expressions();
-                    for expr in current_expressions {
-                        if cross_expr_deeper.contains(&expr) {
-                            let f = DFField::new_unqualified(
-                                &expr.to_string(),
-                                self.data_type_map[&expr].clone(),
-                                true,
-                            );
-                            project_exprs.push(Expr::Column(f.qualified_column()));
-                            cross_expr_deeper.remove(&expr);
-                        } else {
-                            project_exprs.push(expr);
-                        }
+            self.depth_map
+                .iter()
+                .filter(|(&depth, _)| depth > self.depth)
+                .for_each(|(_, exprs)| {
+                    // get intersection
+                    let intersection = current_depth_schema
+                        .intersection(exprs)
+                        .cloned()
+                        .collect::<HashSet<_>>();
+                    cross_expr_deeper =
+                        cross_expr_deeper.union(&intersection).cloned().collect();
+                });
+            let mut project_exprs = vec![];
+            if !cross_expr_deeper.is_empty() {
+                let current_expressions = node.expressions();
+                for expr in current_expressions {
+                    if cross_expr_deeper.contains(&expr) {
+                        let f = DFField::new_unqualified(
+                            &expr.to_string(),
+                            self.data_type_map[&expr].clone(),
+                            true,
+                        );
+                        project_exprs.push(Expr::Column(f.qualified_column()));
+                        cross_expr_deeper.remove(&expr);
+                    } else {
+                        project_exprs.push(expr);
                     }
-                    let new_projection = LogicalPlan::Projection(Projection::try_new(
-                        project_exprs,
-                        Arc::new(node.inputs()[0].clone()),
-                    )?);
-                    self.depth -= 1;
-                    return Ok(Transformed::yes(new_projection));
                 }
+                let new_projection = LogicalPlan::Projection(Projection::try_new(
+                    project_exprs,
+                    Arc::new(node.inputs()[0].clone()),
+                )?);
+                self.depth -= 1;
+                return Ok(Transformed::yes(new_projection));
             }
-            _ => {}
         }
         self.depth -= 1;
         // do not do extra things

From ee962f14dcbfa11680b941fe6cb9fe13b1ea3b67 Mon Sep 17 00:00:00 2001
From: Yanxin Xiang <yanxinxiang0917@outlook.com>
Date: Tue, 26 Mar 2024 18:32:43 -0500
Subject: [PATCH 15/27] fix clippy

---
 datafusion/optimizer/src/common_subexpr_eliminate.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index c3790da6162a9..75510bf2a2b92 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -1042,7 +1042,7 @@ mod test {
     };
     use datafusion_expr::{
         grouping_set, AccumulatorFactoryFunction, AggregateUDF, Signature,
-        SimpleAggregateUDF, TableScan, Volatility,
+        SimpleAggregateUDF, Volatility,
     };
 
     use crate::optimizer::OptimizerContext;

From 40869a4d394af27b0a99c4061398d86e8005adec Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Tue, 26 Mar 2024 11:22:07 +0300
Subject: [PATCH 16/27] Replace with cached exprs

---
 .../src/physical_optimizer/enforce_sorting.rs |  77 ++++++
 datafusion/core/src/physical_planner.rs       |   5 +
 datafusion/core/tests/data/demo.csv           |   2 +
 .../optimizer/src/common_subexpr_eliminate.rs | 232 ++++++++++++------
 .../optimizer/src/optimize_projections.rs     |  23 +-
 .../test_files/project_complex_sub_query.slt  |  17 +-
 6 files changed, 253 insertions(+), 103 deletions(-)
 create mode 100644 datafusion/core/tests/data/demo.csv

diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
index 5bf21c3dfab5a..95e4a06b516b3 100644
--- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
@@ -2347,3 +2347,80 @@ mod tests {
         Ok(())
     }
 }
+
+
+#[cfg(test)]
+mod tmp_tests {
+    use crate::assert_batches_eq;
+    use crate::physical_plan::{collect, displayable, ExecutionPlan};
+    use crate::prelude::SessionContext;
+    use arrow::util::pretty::print_batches;
+    use datafusion_common::Result;
+    use datafusion_execution::config::SessionConfig;
+    use datafusion_physical_plan::get_plan_string;
+    use std::sync::Arc;
+
+    fn print_plan(plan: &Arc<dyn ExecutionPlan>) -> Result<()> {
+        let formatted = displayable(plan.as_ref()).indent(true).to_string();
+        let actual: Vec<&str> = formatted.trim().lines().collect();
+        println!("{:#?}", actual);
+        Ok(())
+    }
+
+    const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE multiple_ordered_table (
+          a0 INTEGER,
+          a INTEGER,
+          b INTEGER,
+          c INTEGER,
+          d INTEGER
+        )
+        STORED AS CSV
+        WITH HEADER ROW
+        WITH ORDER (a ASC, b ASC)
+        WITH ORDER (c ASC)
+        LOCATION '../core/tests/data/window_2.csv'";
+
+    #[tokio::test]
+    async fn test_query() -> Result<()> {
+        let config = SessionConfig::new().with_target_partitions(1);
+        let ctx = SessionContext::new_with_config(config);
+
+        ctx.sql(MULTIPLE_ORDERED_TABLE).await?;
+
+        let sql = "SELECT a+b, SUM(a+b) OVER() FROM multiple_ordered_table";
+
+        let msg = format!("Creating logical plan for '{sql}'");
+        let dataframe = ctx.sql(sql).await.expect(&msg);
+        let physical_plan = dataframe.create_physical_plan().await?;
+        print_plan(&physical_plan)?;
+        let batches = collect(physical_plan.clone(), ctx.task_ctx()).await?;
+        print_batches(&batches)?;
+
+        let expected = vec![
+            "ProjectionExec: expr=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as nth_value1]",
+            "  GlobalLimitExec: skip=0, fetch=5",
+            "    BoundedWindowAggExec: wdw=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: \"NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Sorted]",
+            "      CsvExec: file_groups={1 group: [[Users/akurmustafa/projects/synnada/arrow-datafusion-synnada/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true",
+        ];
+        // Get string representation of the plan
+        let actual = get_plan_string(&physical_plan);
+        assert_eq!(
+            expected, actual,
+            "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+        );
+
+        let expected = [
+            "+------------+",
+            "| nth_value1 |",
+            "+------------+",
+            "| 2          |",
+            "| 2          |",
+            "| 2          |",
+            "| 2          |",
+            "| 2          |",
+            "+------------+",
+        ];
+        assert_batches_eq!(expected, &batches);
+        Ok(())
+    }
+}
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index ca708b05823e7..4334bb617f78c 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -855,6 +855,11 @@ impl DefaultPhysicalPlanner {
                     )?))
                 }
                 LogicalPlan::Projection(Projection { input, expr, .. }) => {
+                    println!("---------------");
+                    for expr in expr{
+                        println!("expr:{:?}", expr);
+                    }
+                    println!("---------------");
                     let input_exec = self.create_initial_plan(input, session_state).await?;
                     let input_schema = input.as_ref().schema();
 
diff --git a/datafusion/core/tests/data/demo.csv b/datafusion/core/tests/data/demo.csv
new file mode 100644
index 0000000000000..d1492228246f0
--- /dev/null
+++ b/datafusion/core/tests/data/demo.csv
@@ -0,0 +1,2 @@
+c1,c2,c3,c4,c5
+1,2,3,4,5
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 57a3519c6b557..ef6b15fca3131 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -123,15 +123,14 @@ impl CommonSubexprEliminate {
             .iter()
             .zip(arrays_list.iter())
             .map(|(exprs, arrays)| {
-                let res = exprs
+                exprs
                     .iter()
                     .cloned()
                     .zip(arrays.iter())
                     .map(|(expr, id_array)| {
                         replace_common_expr(expr, id_array, expr_set, affected_id)
                     })
-                    .collect::<Result<Vec<_>>>();
-                res
+                    .collect::<Result<Vec<_>>>()
             })
             .collect::<Result<Vec<_>>>()
     }
@@ -424,8 +423,10 @@ impl CommonSubexprEliminate {
         plan.clone()
             .rewrite(&mut ProjectionAdder {
                 data_type_map: HashMap::new(),
-                depth_map: HashMap::new(),
+                cumulative_expr_map: HashMap::new(),
+                cumulative_map: HashSet::new(),
                 depth: 0,
+                should_update_parents: false,
             })
             .map(|transformed| Some(transformed.data))
     }
@@ -438,7 +439,6 @@ impl OptimizerRule for CommonSubexprEliminate {
     ) -> Result<Option<LogicalPlan>> {
         let optimized_plan_option = self.common_optimize(plan, config)?;
 
-        // println!("optimized plan option is {:?}", optimized_plan_option);
         let plan = match optimized_plan_option {
             Some(plan) => plan,
             _ => plan.clone(),
@@ -476,8 +476,7 @@ fn to_arrays(
     expr_set: &mut ExprSet,
     expr_mask: ExprMask,
 ) -> Result<Vec<Vec<(usize, String)>>> {
-    let res = expr
-        .iter()
+    expr.iter()
         .map(|e| {
             let mut id_array = vec![];
             expr_to_identifier(
@@ -490,8 +489,7 @@ fn to_arrays(
 
             Ok(id_array)
         })
-        .collect::<Result<Vec<_>>>();
-    res
+        .collect::<Result<Vec<_>>>()
 }
 
 /// Build the "intermediate" projection plan that evaluates the extracted common expressions.
@@ -852,9 +850,11 @@ fn replace_common_expr(
 }
 
 struct ProjectionAdder {
-    depth_map: HashMap<u128, HashSet<Expr>>,
+    cumulative_expr_map: HashMap<u128, HashSet<Expr>>,
+    cumulative_map: HashSet<Expr>,
     depth: u128,
     data_type_map: HashMap<Expr, DataType>,
+    should_update_parents: bool,
 }
 pub fn is_not_complex(op: &Operator) -> bool {
     matches!(
@@ -882,11 +882,6 @@ impl ProjectionAdder {
                             .field_from_column(l)
                             .expect("Field not found for left column");
 
-                        // res.insert(DFField::new_unqualified(
-                        //     &expr.to_string(),
-                        //     l_field.data_type().clone(),
-                        //     true,
-                        // ));
                         expr_data_type
                             .entry(expr.clone())
                             .or_insert(l_field.data_type().clone());
@@ -917,81 +912,166 @@ impl TreeNodeRewriter for ProjectionAdder {
     /// currently we just collect the complex bianryOP
 
     fn f_down(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
-        // use depth to trace where we are in the LogicalPlan tree
-        self.depth += 1;
-        // extract all expressions + check whether it contains in depth_sets
-        let exprs = node.expressions();
-        let depth_set = self.depth_map.entry(self.depth).or_default();
-        let mut schema = node.schema().deref().clone();
-        for ip in node.inputs() {
-            schema.merge(ip.schema());
+        let depth_set = &mut self.cumulative_map;
+        // Insert for other end points
+        if let LogicalPlan::TableScan(_) = node{
+            self.cumulative_expr_map.insert(self.depth, depth_set.clone());
+            self.depth += 1;
+            return Ok(Transformed::no(node));
+        } else {
+            // TODO: Assumes operators doesn't modify name of the fields.
+            // use depth to trace where we are in the LogicalPlan tree
+            self.depth += 1;
+            // extract all expressions + check whether it contains in depth_sets
+            let exprs = node.expressions();
+            // let depth_set = self.cumulative_expr_map.entry(self.depth).or_default();
+            let mut schema = node.schema().deref().clone();
+            for ip in node.inputs() {
+                schema.merge(ip.schema());
+            }
+            let (extended_set, data_map) =
+                Self::get_complex_expressions(exprs, Arc::new(schema));
+            depth_set.extend(extended_set);
+            self.data_type_map.extend(data_map);
+            Ok(Transformed::no(node))
         }
-        let (extended_set, data_map) =
-            Self::get_complex_expressions(exprs, Arc::new(schema));
-        depth_set.extend(extended_set);
-        self.data_type_map.extend(data_map);
-        Ok(Transformed::no(node))
     }
-    fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
-        let current_depth_schema =
-            self.depth_map.get(&self.depth).cloned().unwrap_or_default();
 
-        // get the intersected part
-        let added_expr = self
-            .depth_map
-            .iter()
-            .filter(|(&depth, _)| depth < self.depth)
-            .fold(current_depth_schema, |acc, (_, expr)| {
-                acc.intersection(expr).cloned().collect()
-            });
+    fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
+        let added_expr =
+            self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default();
+        println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map);
+        println!("self.cumulative_map: {:?}", self.cumulative_map);
+        println!("node:{:?}", node);
         self.depth -= 1;
         // do not do extra things
-        if added_expr.is_empty() {
+        let should_add_projection = !added_expr.is_empty();
+        if !should_add_projection && !self.should_update_parents {
+            println!("not updating node");
             return Ok(Transformed::no(node));
         }
-        match node {
-            // do not add for Projections
-            LogicalPlan::Projection(_)
-            | LogicalPlan::TableScan(_)
-            | LogicalPlan::Join(_) => Ok(Transformed::no(node)),
-            _ => {
-                // avoid recursive add projections
-                if added_expr.iter().any(|expr| {
-                    node.inputs()[0]
-                        .schema()
-                        .has_column_with_unqualified_name(&expr.to_string())
-                }) {
-                    return Ok(Transformed::no(node));
+        // if added_expr.is_empty() {
+        //     println!("not updating node");
+        //     return Ok(Transformed::no(node));
+        // }
+
+        let child = node.inputs()[0];
+        let child = if should_add_projection {
+            self.should_update_parents = true;
+            let mut field_set = HashSet::new();
+            let mut project_exprs = vec![];
+            for expr in &added_expr {
+                let f = DFField::new_unqualified(
+                    &expr.to_string(),
+                    self.data_type_map[expr].clone(),
+                    true,
+                );
+                field_set.insert(f.name().to_owned());
+                project_exprs.push(expr.clone().alias(expr.to_string()));
+            }
+            // Do not lose fields in the child.
+            for field in child.schema().fields() {
+                if field_set.insert(field.qualified_name()) {
+                    project_exprs.push(Expr::Column(field.qualified_column()));
                 }
+            }
 
-                let mut field_set = HashSet::new();
-                let mut project_exprs = vec![];
-                for expr in added_expr {
-                    let f = DFField::new_unqualified(
-                        &expr.to_string(),
-                        self.data_type_map[&expr].clone(),
-                        true,
-                    );
-                    field_set.insert(f.name().to_owned());
-                    project_exprs.push(expr.clone().alias(expr.to_string()));
-                }
-                for field in node.inputs()[0].schema().fields() {
-                    if field_set.insert(field.qualified_name()) {
-                        project_exprs.push(Expr::Column(field.qualified_column()));
-                    }
+            // adding new plan here
+            let new_child = LogicalPlan::Projection(Projection::try_new(
+                project_exprs,
+                Arc::new(node.inputs()[0].clone()),
+            )?);
+            new_child
+        } else {
+            child.clone()
+        };
+        let mut expressions = node.expressions();
+        // for expr in &added_expr{
+        //     println!("expr.display_name() {:?}", expr.display_name()?);
+        // }
+        println!("    original expressions: {:?}", expressions);
+        let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::<Vec<_>>();
+        // Replace expressions with its pre-computed variant if available.
+        expressions.iter_mut().try_for_each(|expr|{
+            update_expr_with_available_columns(expr, &available_columns)
+        })?;
+        println!("after update expressions: {:?}", expressions);
+
+        let new_node = node.with_new_exprs(expressions, [child].to_vec())?;
+        println!("new node: {:?}", new_node);
+        Ok(Transformed::yes(
+            new_node,
+        ))
+
+        // match node {
+        //     // do not add for Projections
+        //     LogicalPlan::Projection(_)
+        //     | LogicalPlan::TableScan(_)
+        //     | LogicalPlan::Join(_) => Ok(Transformed::no(node)),
+        //     _ => {
+        //         // avoid recursive add projections
+        //         if added_expr.iter().any(|expr| {
+        //             node.inputs()[0]
+        //                 .schema()
+        //                 .has_column_with_unqualified_name(&expr.to_string())
+        //         }) {
+        //             return Ok(Transformed::no(node));
+        //         }
+        //
+        //         let mut field_set = HashSet::new();
+        //         let mut project_exprs = vec![];
+        //         for expr in added_expr {
+        //             let f = DFField::new_unqualified(
+        //                 &expr.to_string(),
+        //                 self.data_type_map[&expr].clone(),
+        //                 true,
+        //             );
+        //             field_set.insert(f.name().to_owned());
+        //             project_exprs.push(expr.clone().alias(expr.to_string()));
+        //         }
+        //         for field in node.inputs()[0].schema().fields() {
+        //             if field_set.insert(field.qualified_name()) {
+        //                 project_exprs.push(Expr::Column(field.qualified_column()));
+        //             }
+        //         }
+        //         // adding new plan here
+        //         let new_plan = LogicalPlan::Projection(Projection::try_new(
+        //             project_exprs,
+        //             Arc::new(node.inputs()[0].clone()),
+        //         )?);
+        //         Ok(Transformed::yes(
+        //             node.with_new_exprs(node.expressions(), [new_plan].to_vec())?,
+        //         ))
+        //     }
+        // }
+    }
+}
+
+fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Column]) -> Result<()> {
+    match expr {
+        Expr::BinaryExpr(_) => {
+            for available_col in available_columns{
+                // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?);
+                if available_col.flat_name() == expr.display_name()?{
+                    println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
+                    *expr = Expr::Column(available_col.clone());
                 }
-                // adding new plan here
-                let new_plan = LogicalPlan::Projection(Projection::try_new(
-                    project_exprs,
-                    Arc::new(node.inputs()[0].clone()),
-                )?);
-                Ok(Transformed::yes(
-                    node.with_new_exprs(node.expressions(), [new_plan].to_vec())?,
-                ))
             }
+        },
+        Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
+            args.iter_mut().try_for_each(|arg| update_expr_with_available_columns(arg, available_columns))?
+        },
+        Expr::Cast(Cast{ expr, .. }) => {
+            update_expr_with_available_columns(expr, available_columns)?
+        }
+        _ => {
+            println!("unsupported expression : {:?}", expr);
+            // cannot rewrite
         }
     }
+    Ok(())
 }
+
 #[cfg(test)]
 mod test {
     use std::iter;
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 8ad773d89e13e..284ea6673f8b2 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -26,7 +26,6 @@
 use std::collections::HashSet;
 use std::sync::Arc;
 
-use crate::common_subexpr_eliminate::is_not_complex;
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 
@@ -70,6 +69,8 @@ impl OptimizerRule for OptimizeProjections {
         plan: &LogicalPlan,
         config: &dyn OptimizerConfig,
     ) -> Result<Option<LogicalPlan>> {
+        println!("plan at the start:");
+        println!("{:?}", plan);
         // All output fields are necessary:
         let indices = (0..plan.schema().fields().len()).collect::<Vec<_>>();
         optimize_projections(plan, config, &indices)
@@ -694,26 +695,10 @@ fn indices_referred_by_expr(
     // TODO: Support more Expressions
     let mut cols = expr.to_columns()?;
     outer_columns(expr, &mut cols);
-    let mut res_vec: Vec<usize> = cols
+    Ok(cols
         .iter()
         .flat_map(|col| input_schema.index_of_column(col))
-        .collect();
-    match expr {
-        Expr::BinaryExpr(BinaryExpr { op, .. }) if !is_not_complex(op) => {
-            if let Some(index) =
-                input_schema.index_of_column_by_name(None, &expr.to_string())?
-            {
-                match res_vec.binary_search(&index) {
-                    Ok(_) => {}
-                    Err(pos) => {
-                        res_vec.insert(pos, index);
-                    }
-                }
-            }
-        }
-        _ => {}
-    }
-    Ok(res_vec)
+        .collect())
 }
 
 /// Gets all required indices for the input; i.e. those required by the parent
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index ffcb9cc3d70c8..30a84f1fc397f 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -31,15 +31,17 @@ STORED AS CSV
 WITH HEADER ROW
 LOCATION '../core/tests/data/demo.csv';
 
+statement ok
+set datafusion.explain.logical_plan_only = true;
+
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER()
 FROM t;
 ----
 logical_plan
-Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
---WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-----Projection: t.c3 + t.c4 AS t.c3 + t.c4, t.c3, t.c4
-------TableScan: t projection=[c3, c4]
+WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+--Projection: t.c3 + t.c4 AS t.c3 + t.c4
+----TableScan: t projection=[c3, c4]
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
@@ -66,7 +68,6 @@ explain SELECT c3-c4, SUM(c3-c4) OVER()
 FROM t;
 ----
 logical_plan
-Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
---WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4
-------TableScan: t projection=[c3, c4]
+WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+--Projection: t.c3 - t.c4 AS t.c3 - t.c4
+----TableScan: t projection=[c3, c4]

From 4396682fa15ca4ef111f1d300562af9504d5cdba Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Wed, 27 Mar 2024 15:21:41 +0300
Subject: [PATCH 17/27] Minor changes

---
 datafusion/core/src/physical_planner.rs        |  5 -----
 .../optimizer/src/common_subexpr_eliminate.rs  | 18 +++++++++---------
 .../optimizer/src/optimize_projections.rs      |  2 --
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index 4334bb617f78c..ca708b05823e7 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -855,11 +855,6 @@ impl DefaultPhysicalPlanner {
                     )?))
                 }
                 LogicalPlan::Projection(Projection { input, expr, .. }) => {
-                    println!("---------------");
-                    for expr in expr{
-                        println!("expr:{:?}", expr);
-                    }
-                    println!("---------------");
                     let input_exec = self.create_initial_plan(input, session_state).await?;
                     let input_schema = input.as_ref().schema();
 
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index ef6b15fca3131..3aef3cf703c17 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -940,14 +940,14 @@ impl TreeNodeRewriter for ProjectionAdder {
     fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
         let added_expr =
             self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default();
-        println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map);
-        println!("self.cumulative_map: {:?}", self.cumulative_map);
-        println!("node:{:?}", node);
+        // println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map);
+        // println!("self.cumulative_map: {:?}", self.cumulative_map);
+        // println!("node:{:?}", node);
         self.depth -= 1;
         // do not do extra things
         let should_add_projection = !added_expr.is_empty();
         if !should_add_projection && !self.should_update_parents {
-            println!("not updating node");
+            // println!("not updating node");
             return Ok(Transformed::no(node));
         }
         // if added_expr.is_empty() {
@@ -989,16 +989,16 @@ impl TreeNodeRewriter for ProjectionAdder {
         // for expr in &added_expr{
         //     println!("expr.display_name() {:?}", expr.display_name()?);
         // }
-        println!("    original expressions: {:?}", expressions);
+        // println!("    original expressions: {:?}", expressions);
         let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::<Vec<_>>();
         // Replace expressions with its pre-computed variant if available.
         expressions.iter_mut().try_for_each(|expr|{
             update_expr_with_available_columns(expr, &available_columns)
         })?;
-        println!("after update expressions: {:?}", expressions);
+        // println!("after update expressions: {:?}", expressions);
 
         let new_node = node.with_new_exprs(expressions, [child].to_vec())?;
-        println!("new node: {:?}", new_node);
+        // println!("new node: {:?}", new_node);
         Ok(Transformed::yes(
             new_node,
         ))
@@ -1053,7 +1053,7 @@ fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Colu
             for available_col in available_columns{
                 // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?);
                 if available_col.flat_name() == expr.display_name()?{
-                    println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
+                    // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
                     *expr = Expr::Column(available_col.clone());
                 }
             }
@@ -1065,7 +1065,7 @@ fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Colu
             update_expr_with_available_columns(expr, available_columns)?
         }
         _ => {
-            println!("unsupported expression : {:?}", expr);
+            // println!("unsupported expression : {:?}", expr);
             // cannot rewrite
         }
     }
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 284ea6673f8b2..39dc56d2af93f 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -69,8 +69,6 @@ impl OptimizerRule for OptimizeProjections {
         plan: &LogicalPlan,
         config: &dyn OptimizerConfig,
     ) -> Result<Option<LogicalPlan>> {
-        println!("plan at the start:");
-        println!("{:?}", plan);
         // All output fields are necessary:
         let indices = (0..plan.schema().fields().len()).collect::<Vec<_>>();
         optimize_projections(plan, config, &indices)

From 83002cefa8f8f19407dcf49f236e74b58d2eb010 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Wed, 27 Mar 2024 16:24:32 +0300
Subject: [PATCH 18/27] Minor changes

---
 .../optimizer/src/common_subexpr_eliminate.rs | 165 +++++++-----------
 1 file changed, 64 insertions(+), 101 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 3aef3cf703c17..bab58b5880e70 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -906,6 +906,47 @@ impl ProjectionAdder {
         }
         (res, expr_data_type)
     }
+
+    fn update_expr_with_available_columns(&self, expr: &mut Expr, available_columns: &[Column]) -> Result<()> {
+        match expr {
+            Expr::BinaryExpr(_) => {
+                for available_col in available_columns{
+                    // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?);
+                    if available_col.flat_name() == expr.display_name()?{
+                        // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
+                        *expr = Expr::Column(available_col.clone());
+                    }
+                }
+            },
+            Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
+                args.iter_mut().try_for_each(|arg| self.update_expr_with_available_columns(arg, available_columns))?
+            },
+            Expr::Cast(Cast{ expr, .. }) => {
+                self.update_expr_with_available_columns(expr, available_columns)?
+            }
+            _ => {
+                // cannot rewrite
+            }
+        }
+        Ok(())
+    }
+
+
+    // Assumes operators doesn't modify name of the fields.
+    // Otherwise this operation is not safe.
+    fn extend_with_exprs(&mut self, node: &LogicalPlan) {
+        // use depth to trace where we are in the LogicalPlan tree
+        // extract all expressions + check whether it contains in depth_sets
+        let exprs = node.expressions();
+        let mut schema = node.schema().deref().clone();
+        for ip in node.inputs() {
+            schema.merge(ip.schema());
+        }
+        let (extended_set, data_map) =
+            Self::get_complex_expressions(exprs, Arc::new(schema));
+        self.cumulative_map.extend(extended_set);
+        self.data_type_map.extend(data_map);
+    }
 }
 impl TreeNodeRewriter for ProjectionAdder {
     type Node = LogicalPlan;
@@ -914,35 +955,35 @@ impl TreeNodeRewriter for ProjectionAdder {
     fn f_down(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
         let depth_set = &mut self.cumulative_map;
         // Insert for other end points
-        if let LogicalPlan::TableScan(_) = node{
-            self.cumulative_expr_map.insert(self.depth, depth_set.clone());
-            self.depth += 1;
-            return Ok(Transformed::no(node));
-        } else {
-            // TODO: Assumes operators doesn't modify name of the fields.
-            // use depth to trace where we are in the LogicalPlan tree
-            self.depth += 1;
-            // extract all expressions + check whether it contains in depth_sets
-            let exprs = node.expressions();
-            // let depth_set = self.cumulative_expr_map.entry(self.depth).or_default();
-            let mut schema = node.schema().deref().clone();
-            for ip in node.inputs() {
-                schema.merge(ip.schema());
+        self.depth += 1;
+        match node {
+            LogicalPlan::TableScan(_) => {
+                self.cumulative_expr_map.insert(self.depth-1, depth_set.clone());
+                return Ok(Transformed::no(node));
+            },
+            LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => {
+                self.extend_with_exprs(&node);
+                Ok(Transformed::no(node))
+            }
+            LogicalPlan::Projection(_) => {
+                if depth_set.is_empty(){
+                    self.extend_with_exprs(&node);
+                } else {
+                    self.cumulative_expr_map.insert(self.depth-1, depth_set.clone());
+                }
+                Ok(Transformed::no(node))
+            },
+            _ => {
+                // Unsupported operators
+                self.cumulative_map.clear();
+                Ok(Transformed::no(node))
             }
-            let (extended_set, data_map) =
-                Self::get_complex_expressions(exprs, Arc::new(schema));
-            depth_set.extend(extended_set);
-            self.data_type_map.extend(data_map);
-            Ok(Transformed::no(node))
         }
     }
 
     fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
         let added_expr =
             self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default();
-        // println!("self.cumulative_expr_map: {:?}", self.cumulative_expr_map);
-        // println!("self.cumulative_map: {:?}", self.cumulative_map);
-        // println!("node:{:?}", node);
         self.depth -= 1;
         // do not do extra things
         let should_add_projection = !added_expr.is_empty();
@@ -950,10 +991,6 @@ impl TreeNodeRewriter for ProjectionAdder {
             // println!("not updating node");
             return Ok(Transformed::no(node));
         }
-        // if added_expr.is_empty() {
-        //     println!("not updating node");
-        //     return Ok(Transformed::no(node));
-        // }
 
         let child = node.inputs()[0];
         let child = if should_add_projection {
@@ -986,92 +1023,18 @@ impl TreeNodeRewriter for ProjectionAdder {
             child.clone()
         };
         let mut expressions = node.expressions();
-        // for expr in &added_expr{
-        //     println!("expr.display_name() {:?}", expr.display_name()?);
-        // }
-        // println!("    original expressions: {:?}", expressions);
         let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::<Vec<_>>();
         // Replace expressions with its pre-computed variant if available.
         expressions.iter_mut().try_for_each(|expr|{
-            update_expr_with_available_columns(expr, &available_columns)
+            self.update_expr_with_available_columns(expr, &available_columns)
         })?;
-        // println!("after update expressions: {:?}", expressions);
 
         let new_node = node.with_new_exprs(expressions, [child].to_vec())?;
-        // println!("new node: {:?}", new_node);
         Ok(Transformed::yes(
             new_node,
         ))
-
-        // match node {
-        //     // do not add for Projections
-        //     LogicalPlan::Projection(_)
-        //     | LogicalPlan::TableScan(_)
-        //     | LogicalPlan::Join(_) => Ok(Transformed::no(node)),
-        //     _ => {
-        //         // avoid recursive add projections
-        //         if added_expr.iter().any(|expr| {
-        //             node.inputs()[0]
-        //                 .schema()
-        //                 .has_column_with_unqualified_name(&expr.to_string())
-        //         }) {
-        //             return Ok(Transformed::no(node));
-        //         }
-        //
-        //         let mut field_set = HashSet::new();
-        //         let mut project_exprs = vec![];
-        //         for expr in added_expr {
-        //             let f = DFField::new_unqualified(
-        //                 &expr.to_string(),
-        //                 self.data_type_map[&expr].clone(),
-        //                 true,
-        //             );
-        //             field_set.insert(f.name().to_owned());
-        //             project_exprs.push(expr.clone().alias(expr.to_string()));
-        //         }
-        //         for field in node.inputs()[0].schema().fields() {
-        //             if field_set.insert(field.qualified_name()) {
-        //                 project_exprs.push(Expr::Column(field.qualified_column()));
-        //             }
-        //         }
-        //         // adding new plan here
-        //         let new_plan = LogicalPlan::Projection(Projection::try_new(
-        //             project_exprs,
-        //             Arc::new(node.inputs()[0].clone()),
-        //         )?);
-        //         Ok(Transformed::yes(
-        //             node.with_new_exprs(node.expressions(), [new_plan].to_vec())?,
-        //         ))
-        //     }
-        // }
     }
 }
-
-fn update_expr_with_available_columns(expr: &mut Expr, available_columns: &[Column]) -> Result<()> {
-    match expr {
-        Expr::BinaryExpr(_) => {
-            for available_col in available_columns{
-                // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?);
-                if available_col.flat_name() == expr.display_name()?{
-                    // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
-                    *expr = Expr::Column(available_col.clone());
-                }
-            }
-        },
-        Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
-            args.iter_mut().try_for_each(|arg| update_expr_with_available_columns(arg, available_columns))?
-        },
-        Expr::Cast(Cast{ expr, .. }) => {
-            update_expr_with_available_columns(expr, available_columns)?
-        }
-        _ => {
-            // println!("unsupported expression : {:?}", expr);
-            // cannot rewrite
-        }
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod test {
     use std::iter;

From b77efabd7ddba2540e2b22e6549686175c933a73 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 11:24:07 +0300
Subject: [PATCH 19/27] Tmp

---
 .../src/physical_optimizer/enforce_sorting.rs | 25 ++++----
 .../tests/data/project_complex_expression.csv |  6 ++
 .../optimizer/src/common_subexpr_eliminate.rs | 27 +++++++-
 .../optimizer/src/optimize_projections.rs     |  1 +
 .../test_files/project_complex_sub_query.slt  | 63 ++++++++++++++++---
 5 files changed, 100 insertions(+), 22 deletions(-)
 create mode 100644 datafusion/core/tests/data/project_complex_expression.csv

diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
index 95e4a06b516b3..1c4a453d2c252 100644
--- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
@@ -2367,18 +2367,16 @@ mod tmp_tests {
         Ok(())
     }
 
-    const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE multiple_ordered_table (
-          a0 INTEGER,
-          a INTEGER,
-          b INTEGER,
-          c INTEGER,
-          d INTEGER
-        )
-        STORED AS CSV
-        WITH HEADER ROW
-        WITH ORDER (a ASC, b ASC)
-        WITH ORDER (c ASC)
-        LOCATION '../core/tests/data/window_2.csv'";
+    const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE t (
+c1 INT NOT NULL,
+c2 INT NOT NULL,
+c3 INT NOT NULL,
+c4 INT NOT NULL,
+c5 INT NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '../core/tests/data/project_complex_expression.csv'";
 
     #[tokio::test]
     async fn test_query() -> Result<()> {
@@ -2387,7 +2385,8 @@ mod tmp_tests {
 
         ctx.sql(MULTIPLE_ORDERED_TABLE).await?;
 
-        let sql = "SELECT a+b, SUM(a+b) OVER() FROM multiple_ordered_table";
+        let sql = "SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
+FROM t";
 
         let msg = format!("Creating logical plan for '{sql}'");
         let dataframe = ctx.sql(sql).await.expect(&msg);
diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv
new file mode 100644
index 0000000000000..f37f11cc7f1ca
--- /dev/null
+++ b/datafusion/core/tests/data/project_complex_expression.csv
@@ -0,0 +1,6 @@
+c1,c2,c3,c4,c5
+1,2,3,4,5
+6,7,8,9,10
+11,12,13,14,15
+16,17,18,19,20
+21,22,23,24,25
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index bab58b5880e70..987618513642c 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -893,8 +893,13 @@ impl ProjectionAdder {
                         Self::get_complex_expressions(vec![*expr], schema.clone());
                     res.extend(expr_set);
                     expr_data_type.extend(type_data_map);
+                },
+                Expr::Alias(Alias{expr, relation: _, name: _}) => {
+                    let (expr_set, type_data_map) =
+                        Self::get_complex_expressions(vec![*expr], schema.clone());
+                    res.extend(expr_set);
+                    expr_data_type.extend(type_data_map);
                 }
-
                 Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
                     let (expr_set, type_map) =
                         Self::get_complex_expressions(args, schema.clone());
@@ -923,14 +928,31 @@ impl ProjectionAdder {
             },
             Expr::Cast(Cast{ expr, .. }) => {
                 self.update_expr_with_available_columns(expr, available_columns)?
+            },
+            Expr::Alias(alias) => {
+                self.update_expr_with_available_columns(&mut alias.expr, available_columns)?;
             }
             _ => {
                 // cannot rewrite
             }
         }
+        Self::trim_expr(expr)?;
         Ok(())
     }
 
+    fn trim_expr(expr: &mut Expr) -> Result<()> {
+        let orig_name = expr.display_name()?;
+        match expr {
+            Expr::Alias(alias) => {
+                Self::trim_expr(&mut alias.expr)?;
+                if orig_name == alias.expr.display_name()?{
+                    *expr = *alias.expr.clone();
+                }
+                Ok(())
+            }
+            _ => Ok(()),
+        }
+    }
 
     // Assumes operators doesn't modify name of the fields.
     // Otherwise this operation is not safe.
@@ -966,6 +988,7 @@ impl TreeNodeRewriter for ProjectionAdder {
                 Ok(Transformed::no(node))
             }
             LogicalPlan::Projection(_) => {
+                // println!("proj node exprs: {:?}", node.expressions());
                 if depth_set.is_empty(){
                     self.extend_with_exprs(&node);
                 } else {
@@ -988,7 +1011,7 @@ impl TreeNodeRewriter for ProjectionAdder {
         // do not do extra things
         let should_add_projection = !added_expr.is_empty();
         if !should_add_projection && !self.should_update_parents {
-            // println!("not updating node");
+            // clearln!("not updating node");
             return Ok(Transformed::no(node));
         }
 
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 39dc56d2af93f..9201d0a5e9b39 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -174,6 +174,7 @@ fn optimize_projections(
             return Ok(None);
         }
         LogicalPlan::Projection(proj) => {
+            // println!("expr:{:?}", proj.expr);
             return if let Some(proj) = merge_consecutive_projections(proj)? {
                 Ok(Some(
                     rewrite_projection_given_requirements(&proj, config, indices)?
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index 30a84f1fc397f..06bfaf5289219 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -29,10 +29,7 @@ c5 INT NOT NULL
 )
 STORED AS CSV
 WITH HEADER ROW
-LOCATION '../core/tests/data/demo.csv';
-
-statement ok
-set datafusion.explain.logical_plan_only = true;
+LOCATION '../core/tests/data/project_complex_expression.csv';
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER()
@@ -42,6 +39,12 @@ logical_plan
 WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
 --Projection: t.c3 + t.c4 AS t.c3 + t.c4
 ----TableScan: t projection=[c3, c4]
+physical_plan
+WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+--CoalescePartitionsExec
+----ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4]
+------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query TT
 explain SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
@@ -52,6 +55,15 @@ Projection: t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST]
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 AS Int64)) ORDER BY [t.c3 + t.c4t.c4t.c3 AS t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
 ----Projection: t.c3 + t.c4 AS t.c3 + t.c4t.c4t.c3, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[c3@1 + c4@2 as t.c3 + t.c4, SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----BoundedWindowAggExec: wdw=[SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 + t.c4) ORDER BY [t.c3 + t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+------SortPreservingMergeExec: [t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST]
+--------SortExec: expr=[t.c3 + t.c4t.c4t.c3@0 ASC NULLS LAST]
+----------ProjectionExec: expr=[c3@0 + c4@1 as t.c3 + t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4]
+------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query TT
 explain SELECT c3-c4, SUM(c3-c4) OVER(order by c3-c4)
@@ -62,12 +74,49 @@ Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST]
 --WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 AS Int64)) ORDER BY [t.c3 - t.c4t.c4t.c3 AS t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
 ----Projection: t.c3 - t.c4 AS t.c3 - t.c4t.c4t.c3, t.c3, t.c4
 ------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[c3@1 - c4@2 as t.c3 - t.c4, SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]
+--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----BoundedWindowAggExec: wdw=[SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(t.c3 - t.c4) ORDER BY [t.c3 - t.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
+------SortPreservingMergeExec: [t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST]
+--------SortExec: expr=[t.c3 - t.c4t.c4t.c3@0 ASC NULLS LAST]
+----------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4t.c4t.c3, c3@0 as c3, c4@1 as c4]
+------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query TT
 explain SELECT c3-c4, SUM(c3-c4) OVER()
 FROM t;
 ----
 logical_plan
-WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
---Projection: t.c3 - t.c4 AS t.c3 - t.c4
-----TableScan: t projection=[c3, c4]
+Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+--WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4
+------TableScan: t projection=[c3, c4]
+physical_plan
+ProjectionExec: expr=[t.c3 - t.c4@0 as t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
+--WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+----CoalescePartitionsExec
+------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4, c3@0 as c3, c4@1 as c4]
+--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
+
+query II
+SELECT c3+c4, SUM(c3+c4) OVER()
+FROM t;
+----
+7 135
+17 135
+27 135
+37 135
+47 135
+
+query II
+SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
+FROM t
+----
+7 7
+17 24
+27 51
+37 88
+47 135

From 59e0385dc0f8ac47d6a2a0e6d9205eb85e5b75ca Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 13:25:53 +0300
Subject: [PATCH 20/27] slt tests pass

---
 .../src/physical_optimizer/enforce_sorting.rs |   3 +-
 datafusion/expr/src/expr.rs                   |  15 +-
 .../optimizer/src/common_subexpr_eliminate.rs | 186 +++++++++++-------
 datafusion/sqllogictest/test_files/insert.slt |   2 +-
 .../test_files/project_complex_sub_query.slt  |  20 +-
 .../sqllogictest/test_files/subquery.slt      |   2 +-
 6 files changed, 137 insertions(+), 91 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
index 1c4a453d2c252..63f35b70b4bfc 100644
--- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
@@ -2348,7 +2348,6 @@ mod tests {
     }
 }
 
-
 #[cfg(test)]
 mod tmp_tests {
     use crate::assert_batches_eq;
@@ -2385,7 +2384,7 @@ LOCATION '../core/tests/data/project_complex_expression.csv'";
 
         ctx.sql(MULTIPLE_ORDERED_TABLE).await?;
 
-        let sql = "SELECT c3+c4, SUM(c3+c4) OVER(order by c3+c4)
+        let sql = "SELECT c3+c4, SUM(c3+c4) OVER()
 FROM t";
 
         let msg = format!("Creating logical plan for '{sql}'");
diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index 7ede4cd8ffc9e..2e4930a1a0cb3 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -1866,8 +1866,19 @@ fn create_name(e: &Expr) -> Result<String> {
                 Ok(format!("{expr} BETWEEN {low} AND {high}"))
             }
         }
-        Expr::Sort { .. } => {
-            internal_err!("Create name does not support sort expression")
+        Expr::Sort(Sort {
+            expr,
+            asc,
+            nulls_first,
+        }) => {
+            let dir = if *asc { "ASC" } else { "DESC" };
+            let nulls = if *nulls_first {
+                "NULLS_FIRST"
+            } else {
+                "NULLS_LAST"
+            };
+            Ok(format!("{expr} {dir} {nulls}"))
+            // internal_err!("Create name does not support sort expression")
         }
         Expr::Wildcard { qualifier } => match qualifier {
             Some(qualifier) => internal_err!(
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 987618513642c..51e704d30ecc9 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -420,15 +420,18 @@ impl CommonSubexprEliminate {
     /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the
     ///
     fn add_extra_projection(&self, plan: &LogicalPlan) -> Result<Option<LogicalPlan>> {
-        plan.clone()
+        // println!("plan at the start: {:?}", plan);
+        let res = plan
+            .clone()
             .rewrite(&mut ProjectionAdder {
-                data_type_map: HashMap::new(),
-                cumulative_expr_map: HashMap::new(),
-                cumulative_map: HashSet::new(),
+                insertion_point_map: HashMap::new(),
                 depth: 0,
-                should_update_parents: false,
-            })
-            .map(|transformed| Some(transformed.data))
+                complex_exprs: HashMap::new(),
+            })?
+            .data;
+        // .map(|transformed| Some(transformed.data))
+        // println!("plan at the end: {:?}", res);
+        Ok(Some(res))
     }
 }
 impl OptimizerRule for CommonSubexprEliminate {
@@ -850,11 +853,10 @@ fn replace_common_expr(
 }
 
 struct ProjectionAdder {
-    cumulative_expr_map: HashMap<u128, HashSet<Expr>>,
-    cumulative_map: HashSet<Expr>,
-    depth: u128,
-    data_type_map: HashMap<Expr, DataType>,
-    should_update_parents: bool,
+    insertion_point_map: HashMap<usize, HashMap<Expr, (DataType, u32)>>,
+    depth: usize,
+    complex_exprs: HashMap<Expr, (DataType, u32)>,
+    // should_update_parents: bool,
 }
 pub fn is_not_complex(op: &Operator) -> bool {
     matches!(
@@ -862,14 +864,14 @@ pub fn is_not_complex(op: &Operator) -> bool {
         &Operator::Eq | &Operator::NotEq | &Operator::Lt | &Operator::Gt | &Operator::And
     )
 }
+
 impl ProjectionAdder {
     // TODO: adding more expressions for sub query, currently only support for Simple Binary Expressions
     fn get_complex_expressions(
         exprs: Vec<Expr>,
         schema: DFSchemaRef,
-    ) -> (HashSet<Expr>, HashMap<Expr, DataType>) {
+    ) -> HashSet<(Expr, DataType)> {
         let mut res = HashSet::new();
-        let mut expr_data_type: HashMap<Expr, DataType> = HashMap::new();
         for expr in exprs {
             match expr {
                 Expr::BinaryExpr(BinaryExpr {
@@ -881,56 +883,62 @@ impl ProjectionAdder {
                         let l_field = schema
                             .field_from_column(l)
                             .expect("Field not found for left column");
-
-                        expr_data_type
-                            .entry(expr.clone())
-                            .or_insert(l_field.data_type().clone());
-                        res.insert(expr.clone());
+                        res.insert((expr.clone(), l_field.data_type().clone()));
                     }
                 }
                 Expr::Cast(Cast { expr, data_type: _ }) => {
-                    let (expr_set, type_data_map) =
+                    let exprs_with_type =
                         Self::get_complex_expressions(vec![*expr], schema.clone());
-                    res.extend(expr_set);
-                    expr_data_type.extend(type_data_map);
-                },
-                Expr::Alias(Alias{expr, relation: _, name: _}) => {
-                    let (expr_set, type_data_map) =
+                    res.extend(exprs_with_type);
+                }
+                Expr::Alias(Alias {
+                    expr,
+                    relation: _,
+                    name: _,
+                }) => {
+                    let exprs_with_type =
                         Self::get_complex_expressions(vec![*expr], schema.clone());
-                    res.extend(expr_set);
-                    expr_data_type.extend(type_data_map);
+                    res.extend(exprs_with_type);
                 }
                 Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
-                    let (expr_set, type_map) =
+                    let exprs_with_type =
                         Self::get_complex_expressions(args, schema.clone());
-                    res.extend(expr_set);
-                    expr_data_type.extend(type_map);
+                    res.extend(exprs_with_type);
                 }
                 _ => {}
             }
         }
-        (res, expr_data_type)
+        res
     }
 
-    fn update_expr_with_available_columns(&self, expr: &mut Expr, available_columns: &[Column]) -> Result<()> {
+    fn update_expr_with_available_columns(
+        &self,
+        expr: &mut Expr,
+        available_columns: &[Column],
+    ) -> Result<()> {
         match expr {
             Expr::BinaryExpr(_) => {
-                for available_col in available_columns{
+                for available_col in available_columns {
                     // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?);
-                    if available_col.flat_name() == expr.display_name()?{
+                    if available_col.flat_name() == expr.display_name()? {
                         // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
                         *expr = Expr::Column(available_col.clone());
                     }
                 }
-            },
+            }
             Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
-                args.iter_mut().try_for_each(|arg| self.update_expr_with_available_columns(arg, available_columns))?
-            },
-            Expr::Cast(Cast{ expr, .. }) => {
+                args.iter_mut().try_for_each(|arg| {
+                    self.update_expr_with_available_columns(arg, available_columns)
+                })?
+            }
+            Expr::Cast(Cast { expr, .. }) => {
                 self.update_expr_with_available_columns(expr, available_columns)?
-            },
+            }
             Expr::Alias(alias) => {
-                self.update_expr_with_available_columns(&mut alias.expr, available_columns)?;
+                self.update_expr_with_available_columns(
+                    &mut alias.expr,
+                    available_columns,
+                )?;
             }
             _ => {
                 // cannot rewrite
@@ -945,7 +953,7 @@ impl ProjectionAdder {
         match expr {
             Expr::Alias(alias) => {
                 Self::trim_expr(&mut alias.expr)?;
-                if orig_name == alias.expr.display_name()?{
+                if orig_name == alias.expr.display_name()? {
                     *expr = *alias.expr.clone();
                 }
                 Ok(())
@@ -960,14 +968,18 @@ impl ProjectionAdder {
         // use depth to trace where we are in the LogicalPlan tree
         // extract all expressions + check whether it contains in depth_sets
         let exprs = node.expressions();
+        // println!("extended exprs: {:?}", exprs);
         let mut schema = node.schema().deref().clone();
         for ip in node.inputs() {
             schema.merge(ip.schema());
         }
-        let (extended_set, data_map) =
-            Self::get_complex_expressions(exprs, Arc::new(schema));
-        self.cumulative_map.extend(extended_set);
-        self.data_type_map.extend(data_map);
+        let expr_with_type = Self::get_complex_expressions(exprs, Arc::new(schema));
+        for (expr, dtype) in expr_with_type {
+            let (_, count) = self.complex_exprs.entry(expr).or_insert_with(|| (dtype, 0));
+            *count += 1;
+        }
+        // self.cumulative_map.extend(extended_set);
+        // self.data_type_map.extend(data_map);
     }
 }
 impl TreeNodeRewriter for ProjectionAdder {
@@ -975,59 +987,72 @@ impl TreeNodeRewriter for ProjectionAdder {
     /// currently we just collect the complex bianryOP
 
     fn f_down(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
-        let depth_set = &mut self.cumulative_map;
         // Insert for other end points
         self.depth += 1;
         match node {
             LogicalPlan::TableScan(_) => {
-                self.cumulative_expr_map.insert(self.depth-1, depth_set.clone());
+                self.insertion_point_map
+                    .insert(self.depth - 1, self.complex_exprs.clone());
                 return Ok(Transformed::no(node));
-            },
+            }
             LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => {
                 self.extend_with_exprs(&node);
                 Ok(Transformed::no(node))
             }
             LogicalPlan::Projection(_) => {
                 // println!("proj node exprs: {:?}", node.expressions());
-                if depth_set.is_empty(){
+                if self.complex_exprs.is_empty() {
                     self.extend_with_exprs(&node);
                 } else {
-                    self.cumulative_expr_map.insert(self.depth-1, depth_set.clone());
+                    self.insertion_point_map
+                        .insert(self.depth - 1, self.complex_exprs.clone());
                 }
                 Ok(Transformed::no(node))
-            },
+            }
             _ => {
                 // Unsupported operators
-                self.cumulative_map.clear();
+                self.complex_exprs.clear();
                 Ok(Transformed::no(node))
             }
         }
     }
 
     fn f_up(&mut self, node: Self::Node) -> Result<Transformed<Self::Node>> {
-        let added_expr =
-            self.cumulative_expr_map.get(&self.depth).cloned().unwrap_or_default();
+        let cached_exprs = self
+            .insertion_point_map
+            .get(&self.depth)
+            .cloned()
+            .unwrap_or_default();
         self.depth -= 1;
         // do not do extra things
-        let should_add_projection = !added_expr.is_empty();
-        if !should_add_projection && !self.should_update_parents {
-            // clearln!("not updating node");
+        // println!("---------------------");
+        // for (depth, complex_exprs) in &self.insertion_point_map{
+        //     println!("depth: {:?}, complex_exprs: {:?}", depth, complex_exprs);
+        // }
+        // println!("---------------------");
+        // let should_add_projection = !cached_exprs.is_empty();
+        let should_add_projection =
+            cached_exprs.iter().any(|(_expr, (_, count))| *count > 1);
+        // if let LogicalPlan::Projection(projection) = node {
+        //
+        // }
+
+        let children = node.inputs();
+        if children.len() != 1 {
+            // Only can rewrite node with single child
             return Ok(Transformed::no(node));
         }
-
-        let child = node.inputs()[0];
+        let child = children[0].clone();
         let child = if should_add_projection {
-            self.should_update_parents = true;
             let mut field_set = HashSet::new();
             let mut project_exprs = vec![];
-            for expr in &added_expr {
-                let f = DFField::new_unqualified(
-                    &expr.to_string(),
-                    self.data_type_map[expr].clone(),
-                    true,
-                );
-                field_set.insert(f.name().to_owned());
-                project_exprs.push(expr.clone().alias(expr.to_string()));
+            for (expr, (dtype, count)) in &cached_exprs {
+                if *count > 1 {
+                    let f =
+                        DFField::new_unqualified(&expr.to_string(), dtype.clone(), true);
+                    field_set.insert(f.name().to_owned());
+                    project_exprs.push(expr.clone().alias(expr.to_string()));
+                }
             }
             // Do not lose fields in the child.
             for field in child.schema().fields() {
@@ -1043,19 +1068,32 @@ impl TreeNodeRewriter for ProjectionAdder {
             )?);
             new_child
         } else {
-            child.clone()
+            child
         };
         let mut expressions = node.expressions();
-        let available_columns = child.schema().fields().iter().map(|field| field.qualified_column()).collect::<Vec<_>>();
+        let available_columns = child
+            .schema()
+            .fields()
+            .iter()
+            .map(|field| field.qualified_column())
+            .collect::<Vec<_>>();
         // Replace expressions with its pre-computed variant if available.
-        expressions.iter_mut().try_for_each(|expr|{
+        // println!("-----------------------");
+        // for expr in &expressions{
+        //     println!("old expr: {:?}", expr);
+        // }
+        expressions.iter_mut().try_for_each(|expr| {
             self.update_expr_with_available_columns(expr, &available_columns)
         })?;
+        // for expr in &expressions{
+        //     println!("new expr: {:?}", expr);
+        // }
+        // println!("-----------------------");
 
+        // println!("old node:{:?}", node);
         let new_node = node.with_new_exprs(expressions, [child].to_vec())?;
-        Ok(Transformed::yes(
-            new_node,
-        ))
+        // println!("new node:{:?}", new_node);
+        Ok(Transformed::yes(new_node))
     }
 }
 #[cfg(test)]
diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt
index fc272326973b1..113e544ad0636 100644
--- a/datafusion/sqllogictest/test_files/insert.slt
+++ b/datafusion/sqllogictest/test_files/insert.slt
@@ -169,7 +169,7 @@ ORDER BY c1
 ----
 logical_plan
 Dml: op=[Insert Into] table=[table_without_values]
---Projection: a1 AS a1, a2 AS a2
+--Projection: a1, a2
 ----Sort: aggregate_test_100.c1 ASC NULLS LAST
 ------Projection: SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1
 --------WindowAggr: windowExpr=[[SUM(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, COUNT(UInt8(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]]
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index 06bfaf5289219..9f88de66ea3bd 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -37,7 +37,7 @@ FROM t;
 ----
 logical_plan
 WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
---Projection: t.c3 + t.c4 AS t.c3 + t.c4
+--Projection: t.c3 + t.c4
 ----TableScan: t projection=[c3, c4]
 physical_plan
 WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
@@ -89,17 +89,15 @@ explain SELECT c3-c4, SUM(c3-c4) OVER()
 FROM t;
 ----
 logical_plan
-Projection: t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
---WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
-----Projection: t.c3 - t.c4 AS t.c3 - t.c4, t.c3, t.c4
-------TableScan: t projection=[c3, c4]
+WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
+--Projection: t.c3 - t.c4
+----TableScan: t projection=[c3, c4]
 physical_plan
-ProjectionExec: expr=[t.c3 - t.c4@0 as t.c3 - t.c4, SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]
---WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
-----CoalescePartitionsExec
-------ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4, c3@0 as c3, c4@1 as c4]
---------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
+WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
+--CoalescePartitionsExec
+----ProjectionExec: expr=[c3@0 - c4@1 as t.c3 - t.c4]
+------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/project_complex_expression.csv]]}, projection=[c3, c4], has_header=true
 
 query II
 SELECT c3+c4, SUM(c3+c4) OVER()
diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt
index 4fb94cfab5233..aac64b55b535a 100644
--- a/datafusion/sqllogictest/test_files/subquery.slt
+++ b/datafusion/sqllogictest/test_files/subquery.slt
@@ -449,7 +449,7 @@ query TT
 explain SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 1) as t2_int from t1
 ----
 logical_plan
-Projection: t1.t1_id, (<subquery>) AS t2_int
+Projection: t1.t1_id, (<subquery>)
 --Subquery:
 ----Limit: skip=0, fetch=1
 ------Projection: t2.t2_int

From c1a16f9018af639be57db04e05f766718c5ce134 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 13:29:15 +0300
Subject: [PATCH 21/27] Remove unnecessary test

---
 .../src/physical_optimizer/enforce_sorting.rs | 75 -------------------
 1 file changed, 75 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
index 63f35b70b4bfc..5bf21c3dfab5a 100644
--- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs
@@ -2347,78 +2347,3 @@ mod tests {
         Ok(())
     }
 }
-
-#[cfg(test)]
-mod tmp_tests {
-    use crate::assert_batches_eq;
-    use crate::physical_plan::{collect, displayable, ExecutionPlan};
-    use crate::prelude::SessionContext;
-    use arrow::util::pretty::print_batches;
-    use datafusion_common::Result;
-    use datafusion_execution::config::SessionConfig;
-    use datafusion_physical_plan::get_plan_string;
-    use std::sync::Arc;
-
-    fn print_plan(plan: &Arc<dyn ExecutionPlan>) -> Result<()> {
-        let formatted = displayable(plan.as_ref()).indent(true).to_string();
-        let actual: Vec<&str> = formatted.trim().lines().collect();
-        println!("{:#?}", actual);
-        Ok(())
-    }
-
-    const MULTIPLE_ORDERED_TABLE: &str = "CREATE EXTERNAL TABLE t (
-c1 INT NOT NULL,
-c2 INT NOT NULL,
-c3 INT NOT NULL,
-c4 INT NOT NULL,
-c5 INT NOT NULL
-)
-STORED AS CSV
-WITH HEADER ROW
-LOCATION '../core/tests/data/project_complex_expression.csv'";
-
-    #[tokio::test]
-    async fn test_query() -> Result<()> {
-        let config = SessionConfig::new().with_target_partitions(1);
-        let ctx = SessionContext::new_with_config(config);
-
-        ctx.sql(MULTIPLE_ORDERED_TABLE).await?;
-
-        let sql = "SELECT c3+c4, SUM(c3+c4) OVER()
-FROM t";
-
-        let msg = format!("Creating logical plan for '{sql}'");
-        let dataframe = ctx.sql(sql).await.expect(&msg);
-        let physical_plan = dataframe.create_physical_plan().await?;
-        print_plan(&physical_plan)?;
-        let batches = collect(physical_plan.clone(), ctx.task_ctx()).await?;
-        print_batches(&batches)?;
-
-        let expected = vec![
-            "ProjectionExec: expr=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as nth_value1]",
-            "  GlobalLimitExec: skip=0, fetch=5",
-            "    BoundedWindowAggExec: wdw=[NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: \"NTH_VALUE(annotated_data_finite2.d,Int64(2)) ORDER BY [annotated_data_finite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Sorted]",
-            "      CsvExec: file_groups={1 group: [[Users/akurmustafa/projects/synnada/arrow-datafusion-synnada/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true",
-        ];
-        // Get string representation of the plan
-        let actual = get_plan_string(&physical_plan);
-        assert_eq!(
-            expected, actual,
-            "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
-        );
-
-        let expected = [
-            "+------------+",
-            "| nth_value1 |",
-            "+------------+",
-            "| 2          |",
-            "| 2          |",
-            "| 2          |",
-            "| 2          |",
-            "| 2          |",
-            "+------------+",
-        ];
-        assert_batches_eq!(expected, &batches);
-        Ok(())
-    }
-}

From fb9e4d68e0167f0a7845fc3ceb423f83e2c960ec Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 13:58:20 +0300
Subject: [PATCH 22/27] Remove unnecessary changes

---
 datafusion/optimizer/src/common_subexpr_eliminate.rs          | 2 +-
 datafusion/sqllogictest/test_files/insert.slt                 | 2 +-
 .../sqllogictest/test_files/project_complex_sub_query.slt     | 4 ++--
 datafusion/sqllogictest/test_files/subquery.slt               | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 51e704d30ecc9..dcb6f23e237f3 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -944,7 +944,7 @@ impl ProjectionAdder {
                 // cannot rewrite
             }
         }
-        Self::trim_expr(expr)?;
+        // Self::trim_expr(expr)?;
         Ok(())
     }
 
diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt
index 113e544ad0636..fc272326973b1 100644
--- a/datafusion/sqllogictest/test_files/insert.slt
+++ b/datafusion/sqllogictest/test_files/insert.slt
@@ -169,7 +169,7 @@ ORDER BY c1
 ----
 logical_plan
 Dml: op=[Insert Into] table=[table_without_values]
---Projection: a1, a2
+--Projection: a1 AS a1, a2 AS a2
 ----Sort: aggregate_test_100.c1 ASC NULLS LAST
 ------Projection: SUM(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1
 --------WindowAggr: windowExpr=[[SUM(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, COUNT(UInt8(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS COUNT(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]]
diff --git a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
index 9f88de66ea3bd..56e728325b790 100644
--- a/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
+++ b/datafusion/sqllogictest/test_files/project_complex_sub_query.slt
@@ -37,7 +37,7 @@ FROM t;
 ----
 logical_plan
 WindowAggr: windowExpr=[[SUM(CAST(t.c3 + t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
---Projection: t.c3 + t.c4
+--Projection: t.c3 + t.c4 AS t.c3 + t.c4
 ----TableScan: t projection=[c3, c4]
 physical_plan
 WindowAggExec: wdw=[SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 + t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
@@ -90,7 +90,7 @@ FROM t;
 ----
 logical_plan
 WindowAggr: windowExpr=[[SUM(CAST(t.c3 - t.c4 AS Int64)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]]
---Projection: t.c3 - t.c4
+--Projection: t.c3 - t.c4 AS t.c3 - t.c4
 ----TableScan: t projection=[c3, c4]
 physical_plan
 WindowAggExec: wdw=[SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "SUM(t.c3 - t.c4) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt
index aac64b55b535a..4fb94cfab5233 100644
--- a/datafusion/sqllogictest/test_files/subquery.slt
+++ b/datafusion/sqllogictest/test_files/subquery.slt
@@ -449,7 +449,7 @@ query TT
 explain SELECT t1_id, (SELECT t2_int FROM t2 WHERE t2.t2_int = t1.t1_int limit 1) as t2_int from t1
 ----
 logical_plan
-Projection: t1.t1_id, (<subquery>)
+Projection: t1.t1_id, (<subquery>) AS t2_int
 --Subquery:
 ----Limit: skip=0, fetch=1
 ------Projection: t2.t2_int

From bdf626ad2f11973b7ae1a3a06ed121ad630c39a1 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 14:03:46 +0300
Subject: [PATCH 23/27] Resolve linter errors

---
 .../optimizer/src/common_subexpr_eliminate.rs | 64 +++----------------
 1 file changed, 9 insertions(+), 55 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index dcb6f23e237f3..b6c51b8f8a4c4 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -420,8 +420,7 @@ impl CommonSubexprEliminate {
     /// currently the implemention is not optimal, Basically I just do a top-down iteration over all the
     ///
     fn add_extra_projection(&self, plan: &LogicalPlan) -> Result<Option<LogicalPlan>> {
-        // println!("plan at the start: {:?}", plan);
-        let res = plan
+        let result = plan
             .clone()
             .rewrite(&mut ProjectionAdder {
                 insertion_point_map: HashMap::new(),
@@ -429,9 +428,7 @@ impl CommonSubexprEliminate {
                 complex_exprs: HashMap::new(),
             })?
             .data;
-        // .map(|transformed| Some(transformed.data))
-        // println!("plan at the end: {:?}", res);
-        Ok(Some(res))
+        Ok(Some(result))
     }
 }
 impl OptimizerRule for CommonSubexprEliminate {
@@ -912,30 +909,27 @@ impl ProjectionAdder {
     }
 
     fn update_expr_with_available_columns(
-        &self,
         expr: &mut Expr,
         available_columns: &[Column],
     ) -> Result<()> {
         match expr {
             Expr::BinaryExpr(_) => {
                 for available_col in available_columns {
-                    // println!("cached_expr.display_name(), expr.display_name() {:?}, {:?}", cached_expr.display_name()?, expr.display_name()?);
                     if available_col.flat_name() == expr.display_name()? {
-                        // println!("replacing expr: {:?} with available_col: {:?}", expr, available_col);
                         *expr = Expr::Column(available_col.clone());
                     }
                 }
             }
             Expr::WindowFunction(WindowFunction { fun: _, args, .. }) => {
                 args.iter_mut().try_for_each(|arg| {
-                    self.update_expr_with_available_columns(arg, available_columns)
+                    Self::update_expr_with_available_columns(arg, available_columns)
                 })?
             }
             Expr::Cast(Cast { expr, .. }) => {
-                self.update_expr_with_available_columns(expr, available_columns)?
+                Self::update_expr_with_available_columns(expr, available_columns)?
             }
             Expr::Alias(alias) => {
-                self.update_expr_with_available_columns(
+                Self::update_expr_with_available_columns(
                     &mut alias.expr,
                     available_columns,
                 )?;
@@ -944,31 +938,15 @@ impl ProjectionAdder {
                 // cannot rewrite
             }
         }
-        // Self::trim_expr(expr)?;
         Ok(())
     }
 
-    fn trim_expr(expr: &mut Expr) -> Result<()> {
-        let orig_name = expr.display_name()?;
-        match expr {
-            Expr::Alias(alias) => {
-                Self::trim_expr(&mut alias.expr)?;
-                if orig_name == alias.expr.display_name()? {
-                    *expr = *alias.expr.clone();
-                }
-                Ok(())
-            }
-            _ => Ok(()),
-        }
-    }
-
     // Assumes operators doesn't modify name of the fields.
     // Otherwise this operation is not safe.
     fn extend_with_exprs(&mut self, node: &LogicalPlan) {
         // use depth to trace where we are in the LogicalPlan tree
         // extract all expressions + check whether it contains in depth_sets
         let exprs = node.expressions();
-        // println!("extended exprs: {:?}", exprs);
         let mut schema = node.schema().deref().clone();
         for ip in node.inputs() {
             schema.merge(ip.schema());
@@ -978,8 +956,6 @@ impl ProjectionAdder {
             let (_, count) = self.complex_exprs.entry(expr).or_insert_with(|| (dtype, 0));
             *count += 1;
         }
-        // self.cumulative_map.extend(extended_set);
-        // self.data_type_map.extend(data_map);
     }
 }
 impl TreeNodeRewriter for ProjectionAdder {
@@ -993,14 +969,13 @@ impl TreeNodeRewriter for ProjectionAdder {
             LogicalPlan::TableScan(_) => {
                 self.insertion_point_map
                     .insert(self.depth - 1, self.complex_exprs.clone());
-                return Ok(Transformed::no(node));
+                Ok(Transformed::no(node))
             }
             LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => {
                 self.extend_with_exprs(&node);
                 Ok(Transformed::no(node))
             }
             LogicalPlan::Projection(_) => {
-                // println!("proj node exprs: {:?}", node.expressions());
                 if self.complex_exprs.is_empty() {
                     self.extend_with_exprs(&node);
                 } else {
@@ -1025,17 +1000,8 @@ impl TreeNodeRewriter for ProjectionAdder {
             .unwrap_or_default();
         self.depth -= 1;
         // do not do extra things
-        // println!("---------------------");
-        // for (depth, complex_exprs) in &self.insertion_point_map{
-        //     println!("depth: {:?}, complex_exprs: {:?}", depth, complex_exprs);
-        // }
-        // println!("---------------------");
-        // let should_add_projection = !cached_exprs.is_empty();
         let should_add_projection =
             cached_exprs.iter().any(|(_expr, (_, count))| *count > 1);
-        // if let LogicalPlan::Projection(projection) = node {
-        //
-        // }
 
         let children = node.inputs();
         if children.len() != 1 {
@@ -1062,11 +1028,10 @@ impl TreeNodeRewriter for ProjectionAdder {
             }
 
             // adding new plan here
-            let new_child = LogicalPlan::Projection(Projection::try_new(
+            LogicalPlan::Projection(Projection::try_new(
                 project_exprs,
                 Arc::new(node.inputs()[0].clone()),
-            )?);
-            new_child
+            )?)
         } else {
             child
         };
@@ -1078,21 +1043,10 @@ impl TreeNodeRewriter for ProjectionAdder {
             .map(|field| field.qualified_column())
             .collect::<Vec<_>>();
         // Replace expressions with its pre-computed variant if available.
-        // println!("-----------------------");
-        // for expr in &expressions{
-        //     println!("old expr: {:?}", expr);
-        // }
         expressions.iter_mut().try_for_each(|expr| {
-            self.update_expr_with_available_columns(expr, &available_columns)
+            Self::update_expr_with_available_columns(expr, &available_columns)
         })?;
-        // for expr in &expressions{
-        //     println!("new expr: {:?}", expr);
-        // }
-        // println!("-----------------------");
-
-        // println!("old node:{:?}", node);
         let new_node = node.with_new_exprs(expressions, [child].to_vec())?;
-        // println!("new node:{:?}", new_node);
         Ok(Transformed::yes(new_node))
     }
 }

From 31cc2d7d0052418b06389d2cf6f105c284267766 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 14:20:37 +0300
Subject: [PATCH 24/27] Remove left over file

---
 datafusion/core/tests/data/demo.csv | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 datafusion/core/tests/data/demo.csv

diff --git a/datafusion/core/tests/data/demo.csv b/datafusion/core/tests/data/demo.csv
deleted file mode 100644
index d1492228246f0..0000000000000
--- a/datafusion/core/tests/data/demo.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-c1,c2,c3,c4,c5
-1,2,3,4,5

From e7f4b49694803bb41be2500c8003cc65a2b91363 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 14:23:17 +0300
Subject: [PATCH 25/27] Minor changes

---
 datafusion/core/tests/data/project_complex_expression.csv | 2 +-
 datafusion/optimizer/src/optimize_projections.rs          | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/datafusion/core/tests/data/project_complex_expression.csv b/datafusion/core/tests/data/project_complex_expression.csv
index f37f11cc7f1ca..c78f35a485cd0 100644
--- a/datafusion/core/tests/data/project_complex_expression.csv
+++ b/datafusion/core/tests/data/project_complex_expression.csv
@@ -3,4 +3,4 @@ c1,c2,c3,c4,c5
 6,7,8,9,10
 11,12,13,14,15
 16,17,18,19,20
-21,22,23,24,25
+21,22,23,24,25
\ No newline at end of file
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index 9201d0a5e9b39..39dc56d2af93f 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -174,7 +174,6 @@ fn optimize_projections(
             return Ok(None);
         }
         LogicalPlan::Projection(proj) => {
-            // println!("expr:{:?}", proj.expr);
             return if let Some(proj) = merge_consecutive_projections(proj)? {
                 Ok(Some(
                     rewrite_projection_given_requirements(&proj, config, indices)?

From 313cccd78cc782217daed56f6a27baf12bb101ed Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 14:27:05 +0300
Subject: [PATCH 26/27] Remove unnecessary changes

---
 datafusion/expr/src/expr.rs | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index 2e4930a1a0cb3..7ede4cd8ffc9e 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -1866,19 +1866,8 @@ fn create_name(e: &Expr) -> Result<String> {
                 Ok(format!("{expr} BETWEEN {low} AND {high}"))
             }
         }
-        Expr::Sort(Sort {
-            expr,
-            asc,
-            nulls_first,
-        }) => {
-            let dir = if *asc { "ASC" } else { "DESC" };
-            let nulls = if *nulls_first {
-                "NULLS_FIRST"
-            } else {
-                "NULLS_LAST"
-            };
-            Ok(format!("{expr} {dir} {nulls}"))
-            // internal_err!("Create name does not support sort expression")
+        Expr::Sort { .. } => {
+            internal_err!("Create name does not support sort expression")
         }
         Expr::Wildcard { qualifier } => match qualifier {
             Some(qualifier) => internal_err!(

From 759226be6c631134da3bd207400323b3f9f47321 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <mustafa.akur@synnada.ai>
Date: Thu, 28 Mar 2024 14:37:38 +0300
Subject: [PATCH 27/27] Update comment

---
 .../optimizer/src/common_subexpr_eliminate.rs | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index b6c51b8f8a4c4..e0638c2b6c010 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -850,10 +850,13 @@ fn replace_common_expr(
 }
 
 struct ProjectionAdder {
+    // Keeps track of cumulative usage of common expressions with its corresponding data type.
+    // accross plan where key is unsafe nodes that cumulative tracking is invalidated.
     insertion_point_map: HashMap<usize, HashMap<Expr, (DataType, u32)>>,
     depth: usize,
+    // Keeps track of cumulative usage of the common expressions with its corresponding data type.
+    // between safe nodes.
     complex_exprs: HashMap<Expr, (DataType, u32)>,
-    // should_update_parents: bool,
 }
 pub fn is_not_complex(op: &Operator) -> bool {
     matches!(
@@ -967,21 +970,24 @@ impl TreeNodeRewriter for ProjectionAdder {
         self.depth += 1;
         match node {
             LogicalPlan::TableScan(_) => {
+                // Stop tracking cumulative usage at the source.
+                let complex_exprs = std::mem::take(&mut self.complex_exprs);
                 self.insertion_point_map
-                    .insert(self.depth - 1, self.complex_exprs.clone());
+                    .insert(self.depth - 1, complex_exprs);
                 Ok(Transformed::no(node))
             }
             LogicalPlan::Sort(_) | LogicalPlan::Filter(_) | LogicalPlan::Window(_) => {
+                // These are safe operators where, expression identity is preserved during operation.
                 self.extend_with_exprs(&node);
                 Ok(Transformed::no(node))
             }
             LogicalPlan::Projection(_) => {
-                if self.complex_exprs.is_empty() {
-                    self.extend_with_exprs(&node);
-                } else {
-                    self.insertion_point_map
-                        .insert(self.depth - 1, self.complex_exprs.clone());
-                }
+                // Stop tracking cumulative usage at the projection since it may invalidate expression identity.
+                let complex_exprs = std::mem::take(&mut self.complex_exprs);
+                self.insertion_point_map
+                    .insert(self.depth - 1, complex_exprs);
+                // Start tracking common expressions from now on including projection.
+                self.extend_with_exprs(&node);
                 Ok(Transformed::no(node))
             }
             _ => {