164 changes: 124 additions & 40 deletions rust/datafusion/src/execution/context.rs
@@ -36,8 +36,7 @@ use crate::execution::physical_plan::common;
use crate::execution::physical_plan::csv::{CsvExec, CsvReadOptions};
use crate::execution::physical_plan::datasource::DatasourceExec;
use crate::execution::physical_plan::expressions::{
Alias, Avg, BinaryExpr, CastExpr, Column, Count, Literal, Max, Min, PhysicalSortExpr,
Sum,
Avg, BinaryExpr, CastExpr, Column, Count, Literal, Max, Min, PhysicalSortExpr, Sum,
};
use crate::execution::physical_plan::hash_aggregate::HashAggregateExec;
use crate::execution::physical_plan::limit::LimitExec;
@@ -51,10 +50,11 @@ use crate::execution::physical_plan::sort::{SortExec, SortOptions};
use crate::execution::physical_plan::udf::{ScalarFunction, ScalarFunctionExpr};
use crate::execution::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr};
use crate::execution::table_impl::TableImpl;
use crate::logicalplan::*;
use crate::logicalplan::{
Expr, FunctionMeta, FunctionType, LogicalPlan, LogicalPlanBuilder,
};
use crate::optimizer::optimizer::OptimizerRule;
use crate::optimizer::projection_push_down::ProjectionPushDown;
use crate::optimizer::resolve_columns::ResolveColumnsRule;
use crate::optimizer::type_coercion::TypeCoercionRule;
use crate::sql::parser::{DFASTNode, DFParser, FileType};
use crate::sql::planner::{SchemaProvider, SqlToRel};
@@ -67,6 +67,15 @@ pub struct ExecutionContext {
scalar_functions: HashMap<String, Box<ScalarFunction>>,
}

/// Combine a pair of `Result`s into a `Result` of a pair, returning the first
/// error encountered if either side failed.
fn tuple_err<T, R>(value: (Result<T>, Result<R>)) -> Result<(T, R)> {
match value {
(Ok(e), Ok(e1)) => Ok((e, e1)),
(Err(e), Ok(_)) => Err(e),
(Ok(_), Err(e1)) => Err(e1),
(Err(e), Err(_)) => Err(e),
}
}

impl ExecutionContext {
/// Create a new execution context for in-memory queries
pub fn new() -> Self {
@@ -275,7 +284,6 @@ impl ExecutionContext {
/// Optimize the logical plan by applying optimizer rules
pub fn optimize(&self, plan: &LogicalPlan) -> Result<LogicalPlan> {
let rules: Vec<Box<dyn OptimizerRule>> = vec![
Box::new(ResolveColumnsRule::new()),
Box::new(ProjectionPushDown::new()),
Box::new(TypeCoercionRule::new(&self.scalar_functions)),
];
Expand Down Expand Up @@ -356,7 +364,12 @@ impl ExecutionContext {
let input_schema = input.as_ref().schema().clone();
let runtime_expr = expr
.iter()
.map(|e| self.create_physical_expr(e, &input_schema))
.map(|e| {
tuple_err((
self.create_physical_expr(e, &input_schema),
e.name(&input_schema),
))
})
.collect::<Result<Vec<_>>>()?;
Ok(Arc::new(ProjectionExec::try_new(runtime_expr, input)?))
}
@@ -370,17 +383,30 @@ impl ExecutionContext {
let input = self.create_physical_plan(input, batch_size)?;
let input_schema = input.as_ref().schema().clone();

let group_expr = group_expr
let groups = group_expr
.iter()
.map(|e| self.create_physical_expr(e, &input_schema))
.map(|e| {
tuple_err((
self.create_physical_expr(e, &input_schema),
e.name(&input_schema),
))
})
.collect::<Result<Vec<_>>>()?;
let aggr_expr = aggr_expr
let aggregates = aggr_expr
.iter()
.map(|e| self.create_aggregate_expr(e, &input_schema))
.map(|e| {
tuple_err((
self.create_aggregate_expr(e, &input_schema),
e.name(&input_schema),
))
})
.collect::<Result<Vec<_>>>()?;

let initial_aggr =
HashAggregateExec::try_new(group_expr, aggr_expr, input)?;
let initial_aggr = HashAggregateExec::try_new(
groups.clone(),
aggregates.clone(),
input,
)?;

let schema = initial_aggr.schema();
let partitions = initial_aggr.partitions()?;
@@ -389,13 +415,27 @@ impl ExecutionContext {
return Ok(Arc::new(initial_aggr));
}

let (final_group, final_aggr) = initial_aggr.make_final_expr();

let merge = Arc::new(MergeExec::new(schema.clone(), partitions));

// construct the expressions for the final aggregation
let (final_group, final_aggr) = initial_aggr.make_final_expr(
groups.iter().map(|x| x.1.clone()).collect(),
aggregates.iter().map(|x| x.1.clone()).collect(),
);

// construct a second aggregation, keeping the output column names equal to
// those of the first aggregation and pairing each expression with its
// respective aggregate
Ok(Arc::new(HashAggregateExec::try_new(
final_group,
final_aggr,
final_group
.iter()
.enumerate()
.map(|(i, expr)| (expr.clone(), groups[i].1.clone()))
.collect(),
final_aggr
.iter()
.enumerate()
.map(|(i, expr)| (expr.clone(), aggregates[i].1.clone()))
.collect(),
merge,
)?))
}
@@ -455,12 +495,11 @@ impl ExecutionContext {
input_schema: &Schema,
) -> Result<Arc<dyn PhysicalExpr>> {
match e {
Expr::Alias(expr, name) => {
let expr = self.create_physical_expr(expr, input_schema)?;
Ok(Arc::new(Alias::new(expr, &name)))
}
Expr::Column(i) => {
Ok(Arc::new(Column::new(*i, &input_schema.field(*i).name())))
Expr::Alias(expr, ..) => Ok(self.create_physical_expr(expr, input_schema)?),
[Review thread]

Member: It looks like we're losing the alias name here?

Member: So we just need to fix this so that the physical expression is created using the aliased name.

Member Author (@jorgecarleitao): No, for the same reason: the physical plan does not care about names anymore, and aliases are only about naming.

Member Author (@jorgecarleitao), Jul 22, 2020: fyi my comment was written before I read your second comment; these pages are definitely not thread safe!

Member: Ok, so the schema for these physical operators is based on the logical schema? I guess I just want to be sure that the alias name is preserved in the query results. A unit test demonstrating this would be good.

Member Author (@jorgecarleitao), Jul 22, 2020: (I wrote my other comment because I became uncertain about my claim and wanted to double-check ^_^: yes, good to have this triple-checked; this is a big change.) Let's see:

  1. The schema of the logical plan is built on the column names as per this line, which is then passed to the physical plan as per the line under discussion. So, it should be the case...
  2. Line 1089 of context.rs IMO demonstrates this: we alias an expression in the logical plan, and the physical plan's schema field for that column is the alias itself.

Member: Thanks. I just pulled your branch and modified one of the integration tests just to be sure, and it looks good. This really helps simplify some things, thanks! Could you rebase?

Member Author (@jorgecarleitao): :) perfect. I have rebased this against the latest master.
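A minimal sketch of the unit test requested above, assuming the create_ctx helper and the c1/c2 test table used elsewhere in this file (the test name and the alias are hypothetical):

#[test]
fn aliased_projection_preserves_field_name() -> Result<()> {
    let tmp_dir = TempDir::new("alias")?;
    let ctx = create_ctx(&tmp_dir, 1)?;

    // alias a column in the logical plan
    let table = ctx.table("test")?;
    let plan = LogicalPlanBuilder::from(&table.to_logical_plan())
        .project(vec![col("c2").alias("renamed")])?
        .build()?;

    let plan = ctx.optimize(&plan)?;
    let physical_plan = ctx.create_physical_plan(&Arc::new(plan), 1024)?;

    // the physical schema should carry the alias, not the original column name
    assert_eq!(physical_plan.schema().field(0).name(), "renamed");
    Ok(())
}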
Expr::Column(name) => {
// check that the name exists in the input schema
input_schema.field_with_name(&name)?;
Ok(Arc::new(Column::new(name)))
}
Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))),
Expr::BinaryExpr { left, op, right } => Ok(Arc::new(BinaryExpr::new(
Expand All @@ -484,7 +523,6 @@ impl ExecutionContext {
physical_args.push(self.create_physical_expr(e, input_schema)?);
}
Ok(Arc::new(ScalarFunctionExpr::new(
name,
Box::new(f.fun.clone()),
physical_args,
return_type,
Expand Down Expand Up @@ -650,6 +688,7 @@ mod tests {
use super::*;
use crate::datasource::MemTable;
use crate::execution::physical_plan::udf::ScalarUdf;
use crate::logicalplan::{aggregate_expr, col, scalar_function};
use crate::test;
use arrow::array::{ArrayRef, Int32Array};
use arrow::compute::add;
@@ -666,10 +705,12 @@ mod tests {
// there should be one batch per partition
assert_eq!(results.len(), partition_count);

// each batch should contain 2 columns and 10 rows
// each batch should contain 2 columns and 10 rows with correct field names
for batch in &results {
assert_eq!(batch.num_columns(), 2);
assert_eq!(batch.num_rows(), 10);

assert_eq!(field_names(batch), vec!["c1", "c2"]);
}

Ok(())
@@ -706,7 +747,7 @@ mod tests {

let table = ctx.table("test")?;
let logical_plan = LogicalPlanBuilder::from(&table.to_logical_plan())
.project(vec![Expr::UnresolvedColumn("c2".to_string())])?
.project(vec![col("c2")])?
.build()?;

let optimized_plan = ctx.optimize(&logical_plan)?;
@@ -725,7 +766,7 @@ mod tests {
_ => assert!(false, "expect optimized_plan to be projection"),
}

let expected = "Projection: #0\
let expected = "Projection: #c2\
\n TableScan: test projection=Some([1])";
assert_eq!(format!("{:?}", optimized_plan), expected);

@@ -747,19 +788,19 @@ mod tests {
let tmp_dir = TempDir::new("execute")?;
let ctx = create_ctx(&tmp_dir, 1)?;

let schema = Arc::new(Schema::new(vec![Field::new(
"state",
DataType::Utf8,
false,
)]));
let schema = ctx.datasources.get("test").unwrap().schema();
assert_eq!(schema.field_with_name("c1")?.is_nullable(), false);

let plan = LogicalPlanBuilder::scan("default", "test", schema.as_ref(), None)?
.project(vec![col("state")])?
.project(vec![col("c1")])?
.build()?;

let plan = ctx.optimize(&plan)?;
let physical_plan = ctx.create_physical_plan(&Arc::new(plan), 1024)?;
assert_eq!(physical_plan.schema().field(0).is_nullable(), false);
assert_eq!(
physical_plan.schema().field_with_name("c1")?.is_nullable(),
false
);
Ok(())
}

@@ -783,7 +824,7 @@ mod tests {
projection: None,
projected_schema: Box::new(schema.clone()),
})
.project(vec![Expr::UnresolvedColumn("b".to_string())])?
.project(vec![col("b")])?
.build()?;
assert_fields_eq(&plan, vec!["b"]);

@@ -804,7 +845,7 @@ mod tests {
_ => assert!(false, "expect optimized_plan to be projection"),
}

let expected = "Projection: #0\
let expected = "Projection: #b\
\n InMemoryScan: projection=Some([1])";
assert_eq!(format!("{:?}", optimized_plan), expected);

@@ -844,6 +885,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["SUM(c1)", "SUM(c2)"]);

let expected: Vec<&str> = vec!["60,220"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -858,6 +902,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["AVG(c1)", "AVG(c2)"]);

let expected: Vec<&str> = vec!["1.5,5.5"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -872,6 +919,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["MAX(c1)", "MAX(c2)"]);

let expected: Vec<&str> = vec!["3,10"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -886,6 +936,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["MIN(c1)", "MIN(c2)"]);

let expected: Vec<&str> = vec!["0,1"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -900,6 +953,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["c1", "SUM(c2)"]);

let expected: Vec<&str> = vec!["0,55", "1,55", "2,55", "3,55"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -914,6 +970,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["c1", "AVG(c2)"]);

let expected: Vec<&str> = vec!["0,5.5", "1,5.5", "2,5.5", "3,5.5"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -928,6 +987,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["c1", "MAX(c2)"]);

let expected: Vec<&str> = vec!["0,10", "1,10", "2,10", "3,10"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -942,6 +1004,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["c1", "MIN(c2)"]);

let expected: Vec<&str> = vec!["0,1", "1,1", "2,1", "3,1"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -956,6 +1021,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["COUNT(c1)", "COUNT(c2)"]);

let expected: Vec<&str> = vec!["10,10"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -969,6 +1037,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["COUNT(c1)", "COUNT(c2)"]);

let expected: Vec<&str> = vec!["40,40"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -982,6 +1053,9 @@ mod tests {
assert_eq!(results.len(), 1);

let batch = &results[0];

assert_eq!(field_names(batch), vec!["c1", "COUNT(c2)"]);

let expected = vec!["0,10", "1,10", "2,10", "3,10"];
let mut rows = test::format_batch(&batch);
rows.sort();
@@ -995,16 +1069,16 @@ mod tests {
let ctx = create_ctx(&tmp_dir, 1)?;

let schema = Arc::new(Schema::new(vec![
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::UInt32, false),
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::UInt32, false),
]));

let plan = LogicalPlanBuilder::scan("default", "test", schema.as_ref(), None)?
.aggregate(
vec![col("state")],
[Review comment] Member Author (@jorgecarleitao), Jul 18, 2020: This change was necessary because the file's schema is c1, c2, not state, salary. We were able to get away with this before because, since the names did not mean anything, we could read a file with a given schema using another schema's field names, as long as the indexing was correct. I believe that this should not be possible by design, as it introduces situations that are non-trivial to debug.
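A hedged illustration of that point (hypothetical snippet, not part of the diff): with name-based resolution, referencing a column that is not in the file's schema fails up front instead of silently reading another column by position.

use arrow::datatypes::{DataType, Field, Schema};

let schema = Schema::new(vec![
    Field::new("c1", DataType::Utf8, false),
    Field::new("c2", DataType::UInt32, false),
]);

// "state" is not a field of this schema, so name resolution errors immediately
assert!(schema.field_with_name("state").is_err());
// a real column resolves fine
assert!(schema.field_with_name("c1").is_ok());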
vec![aggregate_expr("SUM", col("salary"), DataType::UInt32)],
vec![col("c1")],
vec![aggregate_expr("SUM", col("c2"), DataType::UInt32)],
)?
.project(vec![col("state"), col_index(1).alias("total_salary")])?
.project(vec![col("c1"), col("SUM(c2)").alias("total_salary")])?
.build()?;

let plan = ctx.optimize(&plan)?;
@@ -1131,6 +1205,7 @@ mod tests {
let batch = &result[0];
assert_eq!(3, batch.num_columns());
assert_eq!(4, batch.num_rows());
assert_eq!(field_names(batch), vec!["a", "b", "my_add(a,b)"]);

let a = batch
.column(0)
@@ -1166,6 +1241,15 @@ mod tests {
ctx.collect(physical_plan.as_ref())
}

fn field_names(result: &RecordBatch) -> Vec<String> {
result
.schema()
.fields()
.iter()
.map(|x| x.name().clone())
.collect::<Vec<String>>()
}

/// Execute SQL and return results
fn execute(sql: &str, partition_count: usize) -> Result<Vec<RecordBatch>> {
let tmp_dir = TempDir::new("execute")?;