diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 36a1161541756..a38d6ef1ac200 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -2744,23 +2744,20 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), - @r###" - +---------------+---------------------------------------------------------+ - | plan_type | plan | - +---------------+---------------------------------------------------------+ - | logical_plan | LeftSemi Join: | - | | TableScan: t1 projection=[a, b] | - | | SubqueryAlias: __correlated_sq_1 | - | | Projection: | - | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | - | | TableScan: t2 projection=[] | - | physical_plan | NestedLoopJoinExec: join_type=RightSemi | - | | ProjectionExec: expr=[] | - | | PlaceholderRowExec | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+---------------------------------------------------------+ - "### + @r" + +---------------+-----------------------------------------------------+ + | plan_type | plan | + +---------------+-----------------------------------------------------+ + | logical_plan | LeftSemi Join: | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | EmptyRelation | + | physical_plan | NestedLoopJoinExec: join_type=RightSemi | + | | PlaceholderRowExec | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+-----------------------------------------------------+ + " ); let df_results = ctx @@ -2783,23 +2780,20 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), - @r###" - +---------------+---------------------------------------------------------------------+ - | plan_type | plan | - +---------------+---------------------------------------------------------------------+ 
- | logical_plan  | LeftSemi Join:                                                      | - |               |   TableScan: t1 projection=[a, b]                                   | - |               |   SubqueryAlias: __correlated_sq_1                                  | - |               |     Projection:                                                     | - |               |       Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | - |               |         TableScan: t2 projection=[]                                 | - | physical_plan | NestedLoopJoinExec: join_type=RightSemi                             | - |               |   ProjectionExec: expr=[]                                           | - |               |     PlaceholderRowExec                                              | - |               |   DataSourceExec: partitions=1, partition_sizes=[1]                 | - |               |                                                                     | - +---------------+---------------------------------------------------------------------+ - "### + @r" + +---------------+-----------------------------------------------------+ + | plan_type     | plan                                                | + +---------------+-----------------------------------------------------+ + | logical_plan  | LeftSemi Join:                                      | + |               |   TableScan: t1 projection=[a, b]                   | + |               |   SubqueryAlias: __correlated_sq_1                  | + |               |     EmptyRelation                                   | + | physical_plan | NestedLoopJoinExec: join_type=RightSemi             | + |               |   PlaceholderRowExec                                | + |               |   DataSourceExec: partitions=1, partition_sizes=[1] | + |               |                                                     | + +---------------+-----------------------------------------------------+ + " ); Ok(()) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index d68e6cd812725..ea21da29849ee 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -3522,7 +3522,10 @@ impl Aggregate { ) -> Result { if group_expr.is_empty() && aggr_expr.is_empty() { return plan_err!( - "Aggregate requires at least one grouping or aggregate expression" + "Aggregate requires at least one grouping or aggregate expression. \ + Aggregate without grouping expressions or aggregate expressions is \ + logically equivalent to, but less efficient than, VALUES producing \ + a single row. Please use VALUES instead."
); } let group_expr_count = grouping_set_expr_count(&group_expr)?; diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 7b7be82b70ca0..08909f5f86675 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -26,12 +26,12 @@ use std::sync::Arc; use datafusion_common::{ get_required_group_by_exprs_indices, internal_datafusion_err, internal_err, Column, - HashMap, JoinType, Result, + DFSchema, HashMap, JoinType, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::{ - logical_plan::LogicalPlan, Aggregate, Distinct, Expr, Projection, TableScan, Unnest, - Window, + logical_plan::LogicalPlan, Aggregate, Distinct, EmptyRelation, Expr, Projection, + TableScan, Unnest, Window, }; use crate::optimize_projections::required_indices::RequiredIndices; @@ -153,23 +153,16 @@ fn optimize_projections( // Only use the absolutely necessary aggregate expressions required // by the parent: - let mut new_aggr_expr = aggregate_reqs.get_at_indices(&aggregate.aggr_expr); - - // Aggregations always need at least one aggregate expression. - // With a nested count, we don't require any column as input, but - // still need to create a correct aggregate, which may be optimized - // out later. As an example, consider the following query: - // - // SELECT count(*) FROM (SELECT count(*) FROM [...]) - // - // which always returns 1. - if new_aggr_expr.is_empty() - && new_group_bys.is_empty() - && !aggregate.aggr_expr.is_empty() - { - // take the old, first aggregate expression - new_aggr_expr = aggregate.aggr_expr; - new_aggr_expr.resize_with(1, || unreachable!()); + let new_aggr_expr = aggregate_reqs.get_at_indices(&aggregate.aggr_expr); + + if new_group_bys.is_empty() && new_aggr_expr.is_empty() { + // Global aggregation with no aggregate functions always produces 1 row and no columns. 
+ return Ok(Transformed::yes(LogicalPlan::EmptyRelation( + EmptyRelation { + produce_one_row: true, + schema: Arc::new(DFSchema::empty()), + }, + ))); } let all_exprs_iter = new_group_bys.iter().chain(new_aggr_expr.iter()); @@ -1146,9 +1139,7 @@ mod tests { plan, @r" Aggregate: groupBy=[[]], aggr=[[count(Int32(1))]] - Projection: - Aggregate: groupBy=[[]], aggr=[[count(Int32(1))]] - TableScan: ?table? projection=[] + EmptyRelation " ) } diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 4d61b254f5077..c32a5e6b33d62 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -429,14 +429,11 @@ logical_plan 01)LeftSemi Join: 02)--TableScan: t1 projection=[a] 03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: -05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] -06)--------TableScan: t2 projection=[] +04)----EmptyRelation physical_plan 01)NestedLoopJoinExec: join_type=LeftSemi 02)--DataSourceExec: partitions=1, partition_sizes=[0] -03)--ProjectionExec: expr=[] -04)----PlaceholderRowExec +03)--PlaceholderRowExec statement ok drop table t1; diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index f57c505068939..21de2d49fba2d 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -1263,14 +1263,11 @@ physical_plan 04)│ join_type: LeftSemi │ │ 05)└─────────────┬─────────────┘ │ 06)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -07)│ DataSourceExec ││ ProjectionExec │ +07)│ DataSourceExec ││ PlaceholderRowExec │ 08)│ -------------------- ││ │ 09)│ files: 1 ││ │ 10)│ format: csv ││ │ -11)└───────────────────────────┘└─────────────┬─────────────┘ -12)-----------------------------┌─────────────┴─────────────┐ -13)-----------------------------│ PlaceholderRowExec │ 
-14)-----------------------------└───────────────────────────┘ +11)└───────────────────────────┘└───────────────────────────┘ # Query with cross join. query TT diff --git a/datafusion/sqllogictest/test_files/expr/date_part.slt b/datafusion/sqllogictest/test_files/expr/date_part.slt index df17a8bca968d..64f16f72421a0 100644 --- a/datafusion/sqllogictest/test_files/expr/date_part.slt +++ b/datafusion/sqllogictest/test_files/expr/date_part.slt @@ -1089,4 +1089,4 @@ SELECT EXTRACT("isodow" FROM to_timestamp('2020-09-08T12:00:00+00:00')) query I SELECT EXTRACT('isodow' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- -1 \ No newline at end of file +1 diff --git a/datafusion/sqllogictest/test_files/issue_17138.slt b/datafusion/sqllogictest/test_files/issue_17138.slt new file mode 100644 index 0000000000000..d7dcf8d4dbdc2 --- /dev/null +++ b/datafusion/sqllogictest/test_files/issue_17138.slt @@ -0,0 +1,36 @@ +statement ok +CREATE TABLE tab1(col0 INTEGER, col1 INTEGER, col2 INTEGER) + +statement ok +INSERT INTO tab1 VALUES(51,14,96) + +query R +SELECT NULL * AVG(DISTINCT 4) + SUM(col1) AS col0 FROM tab1 +---- +NULL + +query TT +EXPLAIN SELECT NULL * AVG(DISTINCT 4) + SUM(col1) AS col0 FROM tab1 +---- +logical_plan +01)Projection: Float64(NULL) AS col0 +02)--EmptyRelation +physical_plan +01)ProjectionExec: expr=[NULL as col0] +02)--PlaceholderRowExec + +# Similar, with a few more arithmetic operations +query R +SELECT + CAST ( NULL AS INTEGER ) * + + AVG ( DISTINCT 4 ) + - SUM ( ALL + col1 ) AS col0 FROM tab1 +---- +NULL + +query TT +EXPLAIN SELECT + CAST ( NULL AS INTEGER ) * + + AVG ( DISTINCT 4 ) + - SUM ( ALL + col1 ) AS col0 FROM tab1 +---- +logical_plan +01)Projection: Float64(NULL) AS col0 +02)--EmptyRelation +physical_plan +01)ProjectionExec: expr=[NULL as col0] +02)--PlaceholderRowExec diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt b/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt index ca1b2724a8ce8..7cfdfe8257277 100644 
--- a/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt +++ b/datafusion/sqllogictest/test_files/spark/bitwise/getbit.slt @@ -73,4 +73,3 @@ query I SELECT getbit(11, NULL); ---- NULL - diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index ed73eecda03e2..e33271cf6de99 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -1453,9 +1453,7 @@ logical_plan 01)LeftSemi Join: 02)--TableScan: t1 projection=[a] 03)--SubqueryAlias: __correlated_sq_1 -04)----Projection: -05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] -06)--------TableScan: t2 projection=[] +04)----EmptyRelation statement count 0 drop table t1;