From d05275dd5c9638faa0ddf4f3b7d6f2f330b1a5aa Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Fri, 26 May 2023 18:00:27 +0800 Subject: [PATCH 01/13] init checkin --- benchmarks/expected-plans/q11.txt | 178 ++++---- benchmarks/expected-plans/q15.txt | 130 +++--- benchmarks/expected-plans/q17.txt | 98 ++--- benchmarks/expected-plans/q2.txt | 10 +- benchmarks/expected-plans/q20.txt | 160 +++---- benchmarks/expected-plans/q22.txt | 112 +++-- datafusion/core/tests/sql/joins.rs | 34 +- datafusion/core/tests/sql/subqueries.rs | 401 +++++++++++++++--- datafusion/core/tests/tpcds_planning.rs | 11 +- datafusion/expr/src/logical_plan/plan.rs | 73 ++++ datafusion/optimizer/src/analyzer/subquery.rs | 10 + .../src/decorrelate_predicate_subquery.rs | 335 ++++++--------- .../optimizer/src/scalar_subquery_to_join.rs | 397 +++++++---------- datafusion/optimizer/src/utils.rs | 261 ++++++++++-- .../optimizer/tests/integration-test.rs | 11 +- 15 files changed, 1318 insertions(+), 903 deletions(-) diff --git a/benchmarks/expected-plans/q11.txt b/benchmarks/expected-plans/q11.txt index 0a732897c38f1..fae9e0ea7f133 100644 --- a/benchmarks/expected-plans/q11.txt +++ b/benchmarks/expected-plans/q11.txt @@ -1,89 +1,89 @@ -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: value DESC NULLS FIRST | -| | Projection: partsupp.ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS value | -| | Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.__value | -| | CrossJoin: | -| | Aggregate: groupBy=[[partsupp.ps_partkey]], 
aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | -| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost | -| | Inner Join: supplier.s_nationkey = nation.n_nationkey | -| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | -| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | -| | TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] | -| | TableScan: supplier projection=[s_suppkey, s_nationkey] | -| | Projection: nation.n_nationkey | -| | Filter: nation.n_name = Utf8("GERMANY") | -| | TableScan: nation projection=[n_nationkey, n_name] | -| | SubqueryAlias: __scalar_sq_1 | -| | Projection: CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) AS __value | -| | Aggregate: groupBy=[[]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | -| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost | -| | Inner Join: supplier.s_nationkey = nation.n_nationkey | -| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | -| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | -| | TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] | -| | TableScan: supplier projection=[s_suppkey, s_nationkey] | -| | Projection: nation.n_nationkey | -| | Filter: nation.n_name = Utf8("GERMANY") | -| | TableScan: nation projection=[n_nationkey, n_name] | -| physical_plan | SortExec: expr=[value@1 DESC] | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 AS Decimal128(38, 15)) > __value@2 | -| | CrossJoinExec | -| | CoalescePartitionsExec | -| | AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 
as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=2 | -| | AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@5 as s_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 1 }, Column { name: "s_suppkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | -| | 
CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: n_name@1 = GERMANY | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | ProjectionExec: expr=[CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as __value] | -| | AggregateExec: mode=Final, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | CoalescePartitionsExec | -| | AggregateExec: mode=Partial, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | ProjectionExec: expr=[ps_availqty@0 as ps_availqty, ps_supplycost@1 as ps_supplycost] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 2 }, Column { name: "n_nationkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@4 as s_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 0 }, Column { name: "s_suppkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), 
input_partitions=0 | -| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: n_name@1 = GERMANY | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: value DESC NULLS FIRST | +| | Projection: partsupp.ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS value | +| | Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001) | +| | CrossJoin: | +| | Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | +| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost | +| | Inner Join: supplier.s_nationkey = nation.n_nationkey | +| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | +| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | +| | TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] | +| | TableScan: supplier projection=[s_suppkey, s_nationkey] | +| | Projection: nation.n_nationkey | +| | Filter: nation.n_name = Utf8("GERMANY") | +| | TableScan: nation 
projection=[n_nationkey, n_name] | +| | SubqueryAlias: __scalar_sq_1 | +| | Projection: CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) | +| | Aggregate: groupBy=[[]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | +| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost | +| | Inner Join: supplier.s_nationkey = nation.n_nationkey | +| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | +| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | +| | TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] | +| | TableScan: supplier projection=[s_suppkey, s_nationkey] | +| | Projection: nation.n_nationkey | +| | Filter: nation.n_name = Utf8("GERMANY") | +| | TableScan: nation projection=[n_nationkey, n_name] | +| physical_plan | SortExec: expr=[value@1 DESC] | +| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 AS Decimal128(38, 15)) > SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@2 | +| | CrossJoinExec | +| | CoalescePartitionsExec | +| | AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=2 | +| | AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: 
"s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@5 as s_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 1 }, Column { name: "s_suppkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: n_name@1 = GERMANY | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | ProjectionExec: expr=[CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] | +| | AggregateExec: mode=Final, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=Partial, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | ProjectionExec: expr=[ps_availqty@0 as 
ps_availqty, ps_supplycost@1 as ps_supplycost] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 2 }, Column { name: "n_nationkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@4 as s_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 0 }, Column { name: "s_suppkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: n_name@1 = GERMANY | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/benchmarks/expected-plans/q15.txt b/benchmarks/expected-plans/q15.txt index 
208f4c6690fb8..ea62c08aeb8d5 100644 --- a/benchmarks/expected-plans/q15.txt +++ b/benchmarks/expected-plans/q15.txt @@ -1,66 +1,64 @@ -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: supplier.s_suppkey ASC NULLS LAST | -| | Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, revenue0.total_revenue | -| | Inner Join: revenue0.total_revenue = __scalar_sq_1.__value | -| | Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, revenue0.total_revenue | -| | Inner Join: supplier.s_suppkey = revenue0.supplier_no | -| | TableScan: supplier projection=[s_suppkey, s_name, s_address, s_phone] | -| | SubqueryAlias: revenue0 | -| | Projection: lineitem.l_suppkey AS supplier_no, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue | -| | Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] | -| | Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount | -| | Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") | -| | TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] | -| | SubqueryAlias: __scalar_sq_1 | -| | Projection: MAX(revenue0.total_revenue) AS __value | -| | Aggregate: groupBy=[[]], aggr=[[MAX(revenue0.total_revenue)]] | -| | SubqueryAlias: revenue0 | -| | Projection: 
SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue | -| | Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] | -| | Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount | -| | Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") | -| | TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] | -| physical_plan | SortPreservingMergeExec: [s_suppkey@0 ASC NULLS LAST] | -| | SortExec: expr=[s_suppkey@0 ASC NULLS LAST] | -| | ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address, s_phone@3 as s_phone, total_revenue@4 as total_revenue] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "total_revenue", index: 4 }, Column { name: "__value", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "total_revenue", index: 4 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address, s_phone@3 as s_phone, total_revenue@5 as total_revenue] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_suppkey", index: 0 }, Column { name: "supplier_no", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | ProjectionExec: expr=[l_suppkey@0 as supplier_no, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] | -| | AggregateExec: 
mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "l_suppkey", index: 0 }], 2), input_partitions=2 | -| | AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[l_suppkey@0 as l_suppkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: l_shipdate@3 >= 9496 AND l_shipdate@3 < 9587 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "__value", index: 0 }], 2), input_partitions=1 | -| | ProjectionExec: expr=[MAX(revenue0.total_revenue)@0 as __value] | -| | AggregateExec: mode=Final, gby=[], aggr=[MAX(revenue0.total_revenue)] | -| | CoalescePartitionsExec | -| | AggregateExec: mode=Partial, gby=[], aggr=[MAX(revenue0.total_revenue)] | -| | ProjectionExec: expr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] | -| | AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "l_suppkey", index: 0 }], 2), input_partitions=2 | -| | AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[l_suppkey@0 as l_suppkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] | -| | CoalesceBatchesExec: 
target_batch_size=8192 | -| | FilterExec: l_shipdate@3 >= 9496 AND l_shipdate@3 < 9587 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: supplier.s_suppkey ASC NULLS LAST | +| | Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, revenue0.total_revenue | +| | Inner Join: revenue0.total_revenue = __scalar_sq_1.MAX(revenue0.total_revenue) | +| | Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, revenue0.total_revenue | +| | Inner Join: supplier.s_suppkey = revenue0.supplier_no | +| | TableScan: supplier projection=[s_suppkey, s_name, s_address, s_phone] | +| | SubqueryAlias: revenue0 | +| | Projection: lineitem.l_suppkey AS supplier_no, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue | +| | Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] | +| | Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount | +| | Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") | +| | TableScan: lineitem 
projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] | +| | SubqueryAlias: __scalar_sq_1 | +| | Aggregate: groupBy=[[]], aggr=[[MAX(revenue0.total_revenue)]] | +| | SubqueryAlias: revenue0 | +| | Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue | +| | Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] | +| | Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount | +| | Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") | +| | TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate] | +| physical_plan | SortPreservingMergeExec: [s_suppkey@0 ASC NULLS LAST] | +| | SortExec: expr=[s_suppkey@0 ASC NULLS LAST] | +| | ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address, s_phone@3 as s_phone, total_revenue@4 as total_revenue] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "total_revenue", index: 4 }, Column { name: "MAX(revenue0.total_revenue)", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "total_revenue", index: 4 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address, s_phone@3 as s_phone, total_revenue@5 as total_revenue] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_suppkey", index: 0 }, Column { name: "supplier_no", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 
2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | ProjectionExec: expr=[l_suppkey@0 as supplier_no, SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] | +| | AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "l_suppkey", index: 0 }], 2), input_partitions=2 | +| | AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[l_suppkey@0 as l_suppkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: l_shipdate@3 >= 9496 AND l_shipdate@3 < 9587 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "MAX(revenue0.total_revenue)", index: 0 }], 2), input_partitions=1 | +| | AggregateExec: mode=Final, gby=[], aggr=[MAX(revenue0.total_revenue)] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=Partial, gby=[], aggr=[MAX(revenue0.total_revenue)] | +| | ProjectionExec: expr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as total_revenue] | +| | AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "l_suppkey", index: 0 }], 2), input_partitions=2 | +| | AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] | +| | RepartitionExec: partitioning=RoundRobinBatch(2), 
input_partitions=0 | +| | ProjectionExec: expr=[l_suppkey@0 as l_suppkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: l_shipdate@3 >= 9496 AND l_shipdate@3 < 9587 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/benchmarks/expected-plans/q17.txt b/benchmarks/expected-plans/q17.txt index 9924555f6d7c0..4215c9e036409 100644 --- a/benchmarks/expected-plans/q17.txt +++ b/benchmarks/expected-plans/q17.txt @@ -1,49 +1,49 @@ -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: CAST(SUM(lineitem.l_extendedprice) AS Float64) / Float64(7) AS avg_yearly | -| | Aggregate: groupBy=[[]], aggr=[[SUM(lineitem.l_extendedprice)]] | -| | Projection: lineitem.l_extendedprice | -| | Inner Join: part.p_partkey = __scalar_sq_1.l_partkey Filter: CAST(lineitem.l_quantity AS Decimal128(30, 15)) < __scalar_sq_1.__value | -| | Projection: lineitem.l_quantity, lineitem.l_extendedprice, 
part.p_partkey | -| | Inner Join: lineitem.l_partkey = part.p_partkey | -| | TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice] | -| | Projection: part.p_partkey | -| | Filter: part.p_brand = Utf8("Brand#23") AND part.p_container = Utf8("MED BOX") | -| | TableScan: part projection=[p_partkey, p_brand, p_container] | -| | SubqueryAlias: __scalar_sq_1 | -| | Projection: lineitem.l_partkey, CAST(Float64(0.2) * CAST(AVG(lineitem.l_quantity) AS Float64) AS Decimal128(30, 15)) AS __value | -| | Aggregate: groupBy=[[lineitem.l_partkey]], aggr=[[AVG(lineitem.l_quantity)]] | -| | TableScan: lineitem projection=[l_partkey, l_quantity] | -| physical_plan | ProjectionExec: expr=[CAST(SUM(lineitem.l_extendedprice)@0 AS Float64) / 7 as avg_yearly] | -| | AggregateExec: mode=Final, gby=[], aggr=[SUM(lineitem.l_extendedprice)] | -| | CoalescePartitionsExec | -| | AggregateExec: mode=Partial, gby=[], aggr=[SUM(lineitem.l_extendedprice)] | -| | ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 2 }, Column { name: "l_partkey", index: 0 })], filter=BinaryExpr { left: CastExpr { expr: Column { name: "l_quantity", index: 0 }, cast_type: Decimal128(30, 15), cast_options: CastOptions { safe: false } }, op: Lt, right: Column { name: "__value", index: 1 } } | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 2 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, p_partkey@3 as p_partkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "l_partkey", index: 0 }, Column { name: "p_partkey", index: 0 })] | -| | 
CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[p_partkey@0 as p_partkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: p_brand@1 = Brand#23 AND p_container@2 = MED BOX | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | ProjectionExec: expr=[l_partkey@0 as l_partkey, CAST(0.2 * CAST(AVG(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as __value] | -| | AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[AVG(lineitem.l_quantity)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey], aggr=[AVG(lineitem.l_quantity)] | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file 
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: CAST(SUM(lineitem.l_extendedprice) AS Float64) / Float64(7) AS avg_yearly | +| | Aggregate: groupBy=[[]], aggr=[[SUM(lineitem.l_extendedprice)]] | +| | Projection: lineitem.l_extendedprice | +| | Inner Join: part.p_partkey = __scalar_sq_1.l_partkey Filter: CAST(lineitem.l_quantity AS Decimal128(30, 15)) < __scalar_sq_1.Float64(0.2) * AVG(lineitem.l_quantity) | +| | Projection: lineitem.l_quantity, lineitem.l_extendedprice, part.p_partkey | +| | Inner Join: lineitem.l_partkey = part.p_partkey | +| | TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice] | +| | Projection: part.p_partkey | +| | Filter: part.p_brand = Utf8("Brand#23") AND part.p_container = Utf8("MED BOX") | +| | TableScan: part projection=[p_partkey, p_brand, p_container] | +| | SubqueryAlias: __scalar_sq_1 | +| | Projection: CAST(Float64(0.2) * CAST(AVG(lineitem.l_quantity) AS Float64) AS Decimal128(30, 15)), lineitem.l_partkey | +| | Aggregate: groupBy=[[lineitem.l_partkey]], aggr=[[AVG(lineitem.l_quantity)]] | +| | TableScan: lineitem projection=[l_partkey, l_quantity] | +| physical_plan | ProjectionExec: 
expr=[CAST(SUM(lineitem.l_extendedprice)@0 AS Float64) / 7 as avg_yearly] | +| | AggregateExec: mode=Final, gby=[], aggr=[SUM(lineitem.l_extendedprice)] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=Partial, gby=[], aggr=[SUM(lineitem.l_extendedprice)] | +| | ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 2 }, Column { name: "l_partkey", index: 1 })], filter=BinaryExpr { left: CastExpr { expr: Column { name: "l_quantity", index: 0 }, cast_type: Decimal128(30, 15), cast_options: CastOptions { safe: false } }, op: Lt, right: Column { name: "Float64(0.2) * AVG(lineitem.l_quantity)", index: 1 } } | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 2 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, p_partkey@3 as p_partkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "l_partkey", index: 0 }, Column { name: "p_partkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[p_partkey@0 as p_partkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: p_brand@1 = Brand#23 AND p_container@2 = MED BOX | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | 
ProjectionExec: expr=[CAST(0.2 * CAST(AVG(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as Float64(0.2) * AVG(lineitem.l_quantity), l_partkey@0 as l_partkey] | +| | AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[AVG(lineitem.l_quantity)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey], aggr=[AVG(lineitem.l_quantity)] | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/benchmarks/expected-plans/q2.txt b/benchmarks/expected-plans/q2.txt index c503bd2e0b713..bb8d8930a30bb 100644 --- a/benchmarks/expected-plans/q2.txt +++ b/benchmarks/expected-plans/q2.txt @@ -3,7 +3,7 @@ +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | logical_plan | Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST | | | Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment | -| | Inner Join: part.p_partkey = 
__scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.__value | +| | Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.MIN(partsupp.ps_supplycost) | | | Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name | | | Inner Join: nation.n_regionkey = region.r_regionkey | | | Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name, nation.n_regionkey | @@ -22,7 +22,7 @@ | | Filter: region.r_name = Utf8("EUROPE") | | | TableScan: region projection=[r_regionkey, r_name] | | | SubqueryAlias: __scalar_sq_1 | -| | Projection: partsupp.ps_partkey, MIN(partsupp.ps_supplycost) AS __value | +| | Projection: MIN(partsupp.ps_supplycost), partsupp.ps_partkey | | | Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[MIN(partsupp.ps_supplycost)]] | | | Projection: partsupp.ps_partkey, partsupp.ps_supplycost | | | Inner Join: nation.n_regionkey = region.r_regionkey | @@ -40,7 +40,7 @@ | | SortExec: expr=[s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST] | | | ProjectionExec: expr=[s_acctbal@5 as s_acctbal, s_name@2 as s_name, n_name@8 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment] | | | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 0 }, Column { name: "ps_partkey", index: 0 }), (Column { name: "ps_supplycost", index: 7 }, Column { name: "__value", index: 1 })] | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 0 }, Column { name: "ps_partkey", index: 1 }), (Column { name: "ps_supplycost", index: 7 }, Column { name: 
"MIN(partsupp.ps_supplycost)", index: 0 })] | | | CoalesceBatchesExec: target_batch_size=8192 | | | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }, Column { name: "ps_supplycost", index: 7 }], 2), input_partitions=2 | | | ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_name@2 as s_name, s_address@3 as s_address, s_phone@4 as s_phone, s_acctbal@5 as s_acctbal, s_comment@6 as s_comment, ps_supplycost@7 as ps_supplycost, n_name@8 as n_name] | @@ -85,8 +85,8 @@ | | FilterExec: r_name@1 = EUROPE | | | MemoryExec: partitions=0, partition_sizes=[] | | | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "__value", index: 1 }], 2), input_partitions=2 | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, MIN(partsupp.ps_supplycost)@1 as __value] | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 1 }, Column { name: "MIN(partsupp.ps_supplycost)", index: 0 }], 2), input_partitions=2 | +| | ProjectionExec: expr=[MIN(partsupp.ps_supplycost)@1 as MIN(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] | | | AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[MIN(partsupp.ps_supplycost)] | | | CoalesceBatchesExec: target_batch_size=8192 | | | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=2 | diff --git a/benchmarks/expected-plans/q20.txt b/benchmarks/expected-plans/q20.txt index 41f2dac58300d..683eba24946c2 100644 --- a/benchmarks/expected-plans/q20.txt +++ b/benchmarks/expected-plans/q20.txt @@ -1,80 +1,80 @@ 
-+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: supplier.s_name ASC NULLS LAST | -| | Projection: supplier.s_name, supplier.s_address | -| | LeftSemi Join: supplier.s_suppkey = __correlated_sq_1.ps_suppkey | -| | Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address | -| | Inner Join: supplier.s_nationkey = nation.n_nationkey | -| | TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey] | -| | Projection: nation.n_nationkey | -| | Filter: nation.n_name = Utf8("CANADA") | -| | TableScan: nation projection=[n_nationkey, n_name] | -| | SubqueryAlias: __correlated_sq_1 | -| | Projection: partsupp.ps_suppkey | -| | Inner Join: partsupp.ps_partkey = __scalar_sq_1.l_partkey, partsupp.ps_suppkey = __scalar_sq_1.l_suppkey Filter: CAST(partsupp.ps_availqty AS Float64) > __scalar_sq_1.__value | -| | LeftSemi Join: partsupp.ps_partkey = __correlated_sq_2.p_partkey | -| | TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty] | -| | SubqueryAlias: __correlated_sq_2 | -| | Projection: part.p_partkey | -| | Filter: part.p_name 
LIKE Utf8("forest%") | -| | TableScan: part projection=[p_partkey, p_name] | -| | SubqueryAlias: __scalar_sq_1 | -| | Projection: lineitem.l_partkey, lineitem.l_suppkey, Float64(0.5) * CAST(SUM(lineitem.l_quantity) AS Float64) AS __value | -| | Aggregate: groupBy=[[lineitem.l_partkey, lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_quantity)]] | -| | Projection: lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity | -| | Filter: lineitem.l_shipdate >= Date32("8766") AND lineitem.l_shipdate < Date32("9131") | -| | TableScan: lineitem projection=[l_partkey, l_suppkey, l_quantity, l_shipdate] | -| physical_plan | SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] | -| | SortExec: expr=[s_name@0 ASC NULLS LAST] | -| | ProjectionExec: expr=[s_name@1 as s_name, s_address@2 as s_address] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: "s_suppkey", index: 0 }, Column { name: "ps_suppkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: 
expr=[n_nationkey@0 as n_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: n_name@1 = CANADA | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2), input_partitions=2 | -| | ProjectionExec: expr=[ps_suppkey@1 as ps_suppkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_partkey", index: 0 }, Column { name: "l_partkey", index: 0 }), (Column { name: "ps_suppkey", index: 1 }, Column { name: "l_suppkey", index: 1 })], filter=BinaryExpr { left: CastExpr { expr: Column { name: "ps_availqty", index: 0 }, cast_type: Float64, cast_options: CastOptions { safe: false } }, op: Gt, right: Column { name: "__value", index: 1 } } | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "ps_suppkey", index: 1 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: "ps_partkey", index: 0 }, Column { name: "p_partkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[p_partkey@0 as p_partkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: p_name@1 LIKE forest% | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | 
ProjectionExec: expr=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey, 0.5 * CAST(SUM(lineitem.l_quantity)@2 AS Float64) as __value] | -| | AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[SUM(lineitem.l_quantity)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }, Column { name: "l_suppkey", index: 1 }], 2), input_partitions=2 | -| | AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[SUM(lineitem.l_quantity)] | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey, l_quantity@2 as l_quantity] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: l_shipdate@3 >= 8766 AND l_shipdate@3 < 9131 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: supplier.s_name ASC NULLS LAST | +| | Projection: supplier.s_name, supplier.s_address | +| | LeftSemi Join: supplier.s_suppkey = __correlated_sq_1.ps_suppkey | +| | Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address | +| | Inner Join: supplier.s_nationkey = nation.n_nationkey | +| | TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey] | +| | Projection: nation.n_nationkey | +| | Filter: nation.n_name = Utf8("CANADA") | +| | TableScan: nation projection=[n_nationkey, n_name] | +| | SubqueryAlias: __correlated_sq_1 | +| | Projection: partsupp.ps_suppkey | +| | Inner Join: partsupp.ps_partkey = __scalar_sq_1.l_partkey, partsupp.ps_suppkey = __scalar_sq_1.l_suppkey Filter: CAST(partsupp.ps_availqty AS Float64) > __scalar_sq_1.Float64(0.5) * SUM(lineitem.l_quantity) | +| | LeftSemi Join: partsupp.ps_partkey = __correlated_sq_2.p_partkey | +| | TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty] | +| | SubqueryAlias: __correlated_sq_2 | +| | Projection: part.p_partkey | +| | Filter: part.p_name LIKE Utf8("forest%") | +| | TableScan: part projection=[p_partkey, p_name] | +| | SubqueryAlias: __scalar_sq_1 | +| | Projection: Float64(0.5) * CAST(SUM(lineitem.l_quantity) AS Float64), lineitem.l_partkey, lineitem.l_suppkey | +| | Aggregate: groupBy=[[lineitem.l_partkey, lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_quantity)]] | +| | Projection: lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity | +| | 
Filter: lineitem.l_shipdate >= Date32("8766") AND lineitem.l_shipdate < Date32("9131") | +| | TableScan: lineitem projection=[l_partkey, l_suppkey, l_quantity, l_shipdate] | +| physical_plan | SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] | +| | SortExec: expr=[s_name@0 ASC NULLS LAST] | +| | ProjectionExec: expr=[s_name@1 as s_name, s_address@2 as s_address] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: "s_suppkey", index: 0 }, Column { name: "ps_suppkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: n_name@1 = CANADA | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2), input_partitions=2 | +| | ProjectionExec: expr=[ps_suppkey@1 as ps_suppkey] | +| | CoalesceBatchesExec: 
target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_partkey", index: 0 }, Column { name: "l_partkey", index: 1 }), (Column { name: "ps_suppkey", index: 1 }, Column { name: "l_suppkey", index: 2 })], filter=BinaryExpr { left: CastExpr { expr: Column { name: "ps_availqty", index: 0 }, cast_type: Float64, cast_options: CastOptions { safe: false } }, op: Gt, right: Column { name: "Float64(0.5) * SUM(lineitem.l_quantity)", index: 1 } } | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "ps_suppkey", index: 1 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(Column { name: "ps_partkey", index: 0 }, Column { name: "p_partkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[p_partkey@0 as p_partkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: p_name@1 LIKE forest% | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | ProjectionExec: expr=[0.5 * CAST(SUM(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * SUM(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey] | +| | AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[SUM(lineitem.l_quantity)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: 
partitioning=Hash([Column { name: "l_partkey", index: 0 }, Column { name: "l_suppkey", index: 1 }], 2), input_partitions=2 | +| | AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[SUM(lineitem.l_quantity)] | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey, l_quantity@2 as l_quantity] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: l_shipdate@3 >= 8766 AND l_shipdate@3 < 9131 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/benchmarks/expected-plans/q22.txt b/benchmarks/expected-plans/q22.txt index a84830aceafe7..16aebfe90abf3 100644 --- a/benchmarks/expected-plans/q22.txt +++ b/benchmarks/expected-plans/q22.txt @@ -1,57 +1,55 @@ -+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | 
-+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: custsale.cntrycode ASC NULLS LAST | -| | Projection: custsale.cntrycode, COUNT(UInt8(1)) AS numcust, SUM(custsale.c_acctbal) AS totacctbal | -| | Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[COUNT(UInt8(1)), SUM(custsale.c_acctbal)]] | -| | SubqueryAlias: custsale | -| | Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal | -| | Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __scalar_sq_1.__value | -| | CrossJoin: | -| | Projection: customer.c_phone, customer.c_acctbal | -| | LeftAnti Join: customer.c_custkey = __correlated_sq_1.o_custkey | -| | Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | -| | TableScan: customer projection=[c_custkey, c_phone, c_acctbal] | -| | SubqueryAlias: __correlated_sq_1 | -| | TableScan: orders projection=[o_custkey] | -| | SubqueryAlias: __scalar_sq_1 | -| | Projection: AVG(customer.c_acctbal) AS __value | -| | Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] | -| | Projection: customer.c_acctbal | -| | Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | -| | TableScan: customer projection=[c_phone, c_acctbal] | -| physical_plan | SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] | -| | SortExec: expr=[cntrycode@0 ASC NULLS LAST] | -| | ProjectionExec: expr=[cntrycode@0 as cntrycode, COUNT(UInt8(1))@1 as numcust, SUM(custsale.c_acctbal)@2 as totacctbal] | -| | 
AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2), input_partitions=1 | -| | AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | -| | ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: CAST(c_acctbal@1 AS Decimal128(19, 6)) > __value@2 | -| | CrossJoinExec | -| | CoalescePartitionsExec | -| | ProjectionExec: expr=[c_phone@1 as c_phone, c_acctbal@2 as c_acctbal] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(Column { name: "c_custkey", index: 0 }, Column { name: "o_custkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: Use substr(c_phone@1, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | ProjectionExec: expr=[AVG(customer.c_acctbal)@0 as __value] | -| | AggregateExec: mode=Final, gby=[], aggr=[AVG(customer.c_acctbal)] | -| | CoalescePartitionsExec | -| | AggregateExec: mode=Partial, gby=[], aggr=[AVG(customer.c_acctbal)] | 
-| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[c_acctbal@1 as c_acctbal] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: c_acctbal@1 > Some(0),15,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | -+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: custsale.cntrycode ASC NULLS LAST | +| | Projection: custsale.cntrycode, COUNT(UInt8(1)) AS numcust, SUM(custsale.c_acctbal) AS totacctbal | +| | Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[COUNT(UInt8(1)), SUM(custsale.c_acctbal)]] | +| | SubqueryAlias: custsale | +| | Projection: 
substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal | +| | Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __scalar_sq_1.AVG(customer.c_acctbal) | +| | CrossJoin: | +| | Projection: customer.c_phone, customer.c_acctbal | +| | LeftAnti Join: customer.c_custkey = __correlated_sq_1.o_custkey | +| | Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | +| | TableScan: customer projection=[c_custkey, c_phone, c_acctbal] | +| | SubqueryAlias: __correlated_sq_1 | +| | TableScan: orders projection=[o_custkey] | +| | SubqueryAlias: __scalar_sq_1 | +| | Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] | +| | Projection: customer.c_acctbal | +| | Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | +| | TableScan: customer projection=[c_phone, c_acctbal] | +| physical_plan | SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] | +| | SortExec: expr=[cntrycode@0 ASC NULLS LAST] | +| | ProjectionExec: expr=[cntrycode@0 as cntrycode, COUNT(UInt8(1))@1 as numcust, SUM(custsale.c_acctbal)@2 as totacctbal] | +| | AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2), input_partitions=1 | +| | AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | +| | ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: CAST(c_acctbal@1 AS Decimal128(19, 6)) > AVG(customer.c_acctbal)@2 | +| | CrossJoinExec | +| | CoalescePartitionsExec | +| | ProjectionExec: 
expr=[c_phone@1 as c_phone, c_acctbal@2 as c_acctbal] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(Column { name: "c_custkey", index: 0 }, Column { name: "o_custkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: Use substr(c_phone@1, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | AggregateExec: mode=Final, gby=[], aggr=[AVG(customer.c_acctbal)] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=Partial, gby=[], aggr=[AVG(customer.c_acctbal)] | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[c_acctbal@1 as c_acctbal] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: c_acctbal@1 > Some(0),15,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | 
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 57343ea95c6df..273dc91b76714 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -2028,7 +2028,7 @@ async fn subquery_to_join_with_both_side_expr() -> Result<()> { " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; @@ -2072,7 +2072,7 @@ async fn subquery_to_join_with_muti_filter() -> Result<()> { " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int [CAST(t2_id AS Int64) + Int64(1):Int64;N, 
t2_int:UInt32;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_int:UInt32;N]", " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; @@ -2116,7 +2116,7 @@ async fn three_projection_exprs_subquery_to_join() -> Result<()> { " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; @@ -2160,7 +2160,7 @@ async fn in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { " Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: 
CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; @@ -2189,7 +2189,7 @@ async fn not_in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { " LeftAnti Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; @@ -2220,7 +2220,7 @@ async fn in_subquery_to_join_with_outer_filter() -> Result<()> { " Filter: t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; @@ -2267,10 +2267,10 @@ async fn two_in_subquery_to_join_with_outer_filter() -> Result<()> { " Filter: 
t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", " SubqueryAlias: __correlated_sq_2 [CAST(t2_int AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_int AS Int64) + Int64(1) AS CAST(t2_int AS Int64) + Int64(1) [CAST(t2_int AS Int64) + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_int AS Int64) + Int64(1) AS t2.t2_int + Int64(1) AS CAST(t2_int AS Int64) + Int64(1) [CAST(t2_int AS Int64) + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", ]; @@ -2579,8 +2579,10 @@ async fn exists_distinct_subquery_to_join() -> Result<()> { " LeftAnti Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_1.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [t2_id:UInt32;N]", - " Aggregate: groupBy=[[t2.t2_id]], aggr=[[]] [t2_id:UInt32;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " Projection: t2.t2_id [t2_id:UInt32;N]", + " Aggregate: groupBy=[[t2.t2_int, t2.t2_id]], aggr=[[]] [t2_int:UInt32;N, t2_id:UInt32;N]", + " Projection: t2.t2_int, t2.t2_id [t2_int:UInt32;N, t2_id:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -2620,8 +2622,10 @@ async fn 
exists_distinct_subquery_to_join_with_expr() -> Result<()> { " LeftAnti Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_1.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [t2_id:UInt32;N]", - " Aggregate: groupBy=[[t2.t2_id]], aggr=[[]] [t2_id:UInt32;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " Projection: t2.t2_id [t2_id:UInt32;N]", + " Aggregate: groupBy=[[t2.t2_id + t2.t2_int, t2.t2_int, t2.t2_id]], aggr=[[]] [t2.t2_id + t2.t2_int:UInt32;N, t2_int:UInt32;N, t2_id:UInt32;N]", + " Projection: t2.t2_id + t2.t2_int, t2.t2_int, t2.t2_id [t2.t2_id + t2.t2_int:UInt32;N, t2_int:UInt32;N, t2_id:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -2661,8 +2665,10 @@ async fn exists_distinct_subquery_to_join_with_literal() -> Result<()> { " LeftAnti Join: Filter: CAST(t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_1.t2_id AS Int64) * Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " SubqueryAlias: __correlated_sq_1 [t2_id:UInt32;N]", - " Aggregate: groupBy=[[t2.t2_id]], aggr=[[]] [t2_id:UInt32;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + " Projection: t2.t2_id [t2_id:UInt32;N]", + " Aggregate: groupBy=[[Int64(1), t2.t2_int, t2.t2_id]], aggr=[[]] [Int64(1):Int64, t2_int:UInt32;N, t2_id:UInt32;N]", + " Projection: Int64(1), t2.t2_int, t2.t2_id [Int64(1):Int64, t2_int:UInt32;N, t2_id:UInt32;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = 
formatted.trim().lines().collect(); diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs index 640628e0b5006..5b55b2e031ca4 100644 --- a/datafusion/core/tests/sql/subqueries.rs +++ b/datafusion/core/tests/sql/subqueries.rs @@ -52,16 +52,16 @@ where c_acctbal < ( let actual = format!("{}", plan.display_indent()); let expected = "Sort: customer.c_custkey ASC NULLS LAST\ \n Projection: customer.c_custkey\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey Filter: CAST(customer.c_acctbal AS Decimal128(25, 2)) < __scalar_sq_1.__value\ + \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey Filter: CAST(customer.c_acctbal AS Decimal128(25, 2)) < __scalar_sq_1.SUM(orders.o_totalprice)\ \n TableScan: customer projection=[c_custkey, c_acctbal]\ \n SubqueryAlias: __scalar_sq_1\ - \n Projection: orders.o_custkey, SUM(orders.o_totalprice) AS __value\ + \n Projection: SUM(orders.o_totalprice), orders.o_custkey\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[SUM(orders.o_totalprice)]]\ \n Projection: orders.o_custkey, orders.o_totalprice\ - \n Inner Join: orders.o_orderkey = __scalar_sq_2.l_orderkey Filter: CAST(orders.o_totalprice AS Decimal128(25, 2)) < __scalar_sq_2.__value\ + \n Inner Join: orders.o_orderkey = __scalar_sq_2.l_orderkey Filter: CAST(orders.o_totalprice AS Decimal128(25, 2)) < __scalar_sq_2.price\ \n TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice]\ \n SubqueryAlias: __scalar_sq_2\ - \n Projection: lineitem.l_orderkey, SUM(lineitem.l_extendedprice) AS price AS __value\ + \n Projection: SUM(lineitem.l_extendedprice) AS price, lineitem.l_orderkey\ \n Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[SUM(lineitem.l_extendedprice)]]\ \n TableScan: lineitem projection=[l_orderkey, l_extendedprice]"; assert_eq!(actual, expected); @@ -337,13 +337,12 @@ async fn non_aggregated_correlated_scalar_subquery_with_single_row() -> Result<( let plan = dataframe.into_optimized_plan()?; let 
expected = vec![ - "Projection: t1.t1_id, () AS t2_int [t1_id:UInt32;N, t2_int:Int64]", - " Subquery: [a:Int64]", - " Projection: a [a:Int64]", - " Filter: a = CAST(outer_ref(t1.t1_int) AS Int64) [a:Int64]", - " Projection: Int64(1) AS a [a:Int64]", - " EmptyRelation []", - " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + "Projection: t1.t1_id, __scalar_sq_5.a AS t2_int [t1_id:UInt32;N, t2_int:Int64;N]", + " Left Join: CAST(t1.t1_int AS Int64) = __scalar_sq_5.a [t1_id:UInt32;N, t1_int:UInt32;N, a:Int64;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_5 [a:Int64]", + " Projection: Int64(1) AS a [a:Int64]", + " EmptyRelation []", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -352,6 +351,21 @@ async fn non_aggregated_correlated_scalar_subquery_with_single_row() -> Result<( "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" ); + // TODO infer nullability in the schema has bug + // // assert data + // let results = execute_to_batches(&ctx, sql).await; + // let expected = vec![ + // "+-------+--------+", + // "| t1_id | t2_int |", + // "+-------+--------+", + // "| 22 | |", + // "| 33 | |", + // "| 11 | 1 |", + // "| 44 | |", + // "+-------+--------+", + // ]; + // assert_batches_eq!(expected, &results); + Ok(()) } @@ -382,11 +396,11 @@ async fn aggregated_correlated_scalar_subquery() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ - "Projection: t1.t1_id, __scalar_sq_1.__value AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]", - " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, t2_id:UInt32;N, __value:UInt64;N]", + "Projection: t1.t1_id, __scalar_sq_1.SUM(t2.t2_int) AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]", + " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", - 
" SubqueryAlias: __scalar_sq_1 [t2_id:UInt32;N, __value:UInt64;N]", - " Projection: t2.t2_id, SUM(t2.t2_int) AS __value [t2_id:UInt32;N, __value:UInt64;N]", + " SubqueryAlias: __scalar_sq_1 [SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", + " Projection: SUM(t2.t2_int), t2.t2_id [SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", " Aggregate: groupBy=[[t2.t2_id]], aggr=[[SUM(t2.t2_int)]] [t2_id:UInt32;N, SUM(t2.t2_int):UInt64;N]", " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; @@ -397,6 +411,105 @@ async fn aggregated_correlated_scalar_subquery() -> Result<()> { "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" ); + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+--------+", + "| t1_id | t2_sum |", + "+-------+--------+", + "| 11 | 3 |", + "| 22 | 1 |", + "| 44 | 3 |", + "| 33 | |", + "+-------+--------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn aggregated_correlated_scalar_subquery_with_having() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT sum(t2_int) FROM t2 WHERE t2.t2_id = t1.t1_id having sum(t2_int) < 3) as t2_sum from t1"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_id, __scalar_sq_1.SUM(t2.t2_int) AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]", + " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", + " Projection: SUM(t2.t2_int), t2.t2_id [SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", + " Filter: SUM(t2.t2_int) < UInt64(3) [t2_id:UInt32;N, SUM(t2.t2_int):UInt64;N]", + " Aggregate: groupBy=[[t2.t2_id]], aggr=[[SUM(t2.t2_int)]] 
[t2_id:UInt32;N, SUM(t2.t2_int):UInt64;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+--------+", + "| t1_id | t2_sum |", + "+-------+--------+", + "| 22 | 1 |", + "| 11 | |", + "| 33 | |", + "| 44 | |", + "+-------+--------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn aggregated_correlated_scalar_subquery_with_cast() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT sum(t2_int * 1.0) + 1 FROM t2 WHERE t2.t2_id = t1.t1_id) as t2_sum from t1"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_id, __scalar_sq_1.SUM(t2.t2_int * Float64(1)) + Int64(1) AS t2_sum [t1_id:UInt32;N, t2_sum:Float64;N]", + " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, SUM(t2.t2_int * Float64(1)) + Int64(1):Float64;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [SUM(t2.t2_int * Float64(1)) + Int64(1):Float64;N, t2_id:UInt32;N]", + " Projection: SUM(t2.t2_int * Float64(1)) + Float64(1) AS SUM(t2.t2_int * Float64(1)) + Int64(1), t2.t2_id [SUM(t2.t2_int * Float64(1)) + Int64(1):Float64;N, t2_id:UInt32;N]", + " Aggregate: groupBy=[[t2.t2_id]], aggr=[[SUM(CAST(t2.t2_int AS Float64)) AS SUM(t2.t2_int * Float64(1))]] [t2_id:UInt32;N, SUM(t2.t2_int * Float64(1)):Float64;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + ]; + let formatted = 
plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+--------+", + "| t1_id | t2_sum |", + "+-------+--------+", + "| 11 | 4.0 |", + "| 22 | 2.0 |", + "| 44 | 4.0 |", + "| 33 | |", + "+-------+--------+", + ]; + assert_batches_eq!(expected, &results); + Ok(()) } @@ -429,12 +542,12 @@ async fn aggregated_correlated_scalar_subquery_with_extra_group_by_constant() -> let plan = dataframe.into_optimized_plan()?; let expected = vec![ - "Projection: t1.t1_id, __scalar_sq_1.__value AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]", - " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, t2_id:UInt32;N, __value:UInt64;N]", + "Projection: t1.t1_id, __scalar_sq_1.SUM(t2.t2_int) AS t2_sum [t1_id:UInt32;N, t2_sum:UInt64;N]", + " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", - " SubqueryAlias: __scalar_sq_1 [t2_id:UInt32;N, __value:UInt64;N]", - " Projection: t2.t2_id, SUM(t2.t2_int) AS __value [t2_id:UInt32;N, __value:UInt64;N]", - " Aggregate: groupBy=[[t2.t2_id]], aggr=[[SUM(t2.t2_int)]] [t2_id:UInt32;N, SUM(t2.t2_int):UInt64;N]", + " SubqueryAlias: __scalar_sq_1 [SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", + " Projection: SUM(t2.t2_int), t2.t2_id [SUM(t2.t2_int):UInt64;N, t2_id:UInt32;N]", + " Aggregate: groupBy=[[t2.t2_id, Utf8(\"a\")]], aggr=[[SUM(t2.t2_int)]] [t2_id:UInt32;N, Utf8(\"a\"):Utf8, SUM(t2.t2_int):UInt64;N]", " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -444,6 +557,20 @@ async fn aggregated_correlated_scalar_subquery_with_extra_group_by_constant() -> 
"\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" ); + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+--------+", + "| t1_id | t2_sum |", + "+-------+--------+", + "| 11 | 3 |", + "| 22 | 1 |", + "| 44 | 3 |", + "| 33 | |", + "+-------+--------+", + ]; + assert_batches_eq!(expected, &results); + Ok(()) } @@ -549,7 +676,7 @@ async fn support_join_correlated_columns() -> Result<()> { } #[tokio::test] -async fn support_join_correlated_columns2() -> Result<()> { +async fn subquery_contains_join_contains_correlated_columns() -> Result<()> { let ctx = create_sub_query_join_context("t0_id", "t1_id", "t2_id", true)?; let sql = "SELECT t0_id, t0_name FROM t0 WHERE EXISTS (SELECT 1 FROM t1 INNER JOIN (select * from t2 where t2.t2_name = t0.t0_name) as t2 ON(t1.t1_id = t2.t2_id ))"; let msg = format!("Creating logical plan for '{sql}'"); @@ -557,16 +684,44 @@ async fn support_join_correlated_columns2() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ - "Filter: EXISTS () [t0_id:UInt32;N, t0_name:Utf8;N]", - " Subquery: [Int64(1):Int64]", - " Projection: Int64(1) [Int64(1):Int64]", - " Inner Join: Filter: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t1 [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Projection: t2.t2_id, t2.t2_name, t2.t2_int [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t2.t2_name = outer_ref(t0.t0_name) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + "LeftSemi Join: t0.t0_name = __correlated_sq_1.t2_name [t0_id:UInt32;N, t0_name:Utf8;N]", + " TableScan: t0 projection=[t0_id, t0_name] [t0_id:UInt32;N, t0_name:Utf8;N]", + " SubqueryAlias: __correlated_sq_1 [t2_name:Utf8;N]", + " Projection: 
t2.t2_name [t2_name:Utf8;N]", + " Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + " SubqueryAlias: t2 [t2_id:UInt32;N, t2_name:Utf8;N]", + " TableScan: t2 projection=[t2_id, t2_name] [t2_id:UInt32;N, t2_name:Utf8;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + Ok(()) +} + +#[tokio::test] +async fn subquery_contains_join_contains_sub_query_alias_correlated_columns() -> Result<()> +{ + let ctx = create_sub_query_join_context("t0_id", "t1_id", "t2_id", true)?; + let sql = "SELECT t0_id, t0_name FROM t0 WHERE EXISTS (select 1 from (SELECT * FROM t1 where t1.t1_id = t0.t0_id) as x INNER JOIN (select * from t2 where t2.t2_name = t0.t0_name) as y ON(x.t1_id = y.t2_id))"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "LeftSemi Join: t0.t0_id = __correlated_sq_1.t1_id, t0.t0_name = __correlated_sq_1.t2_name [t0_id:UInt32;N, t0_name:Utf8;N]", " TableScan: t0 projection=[t0_id, t0_name] [t0_id:UInt32;N, t0_name:Utf8;N]", + " SubqueryAlias: __correlated_sq_1 [t1_id:UInt32;N, t2_name:Utf8;N]", + " Projection: x.t1_id, y.t2_name [t1_id:UInt32;N, t2_name:Utf8;N]", + " Inner Join: x.t1_id = y.t2_id [t1_id:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N]", + " SubqueryAlias: x [t1_id:UInt32;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + " SubqueryAlias: y [t2_id:UInt32;N, t2_name:Utf8;N]", + " TableScan: t2 projection=[t2_id, t2_name] [t2_id:UInt32;N, t2_name:Utf8;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -606,24 +761,21 @@ async fn support_order_by_correlated_columns() -> 
Result<()> { Ok(()) } -// TODO: issue https://github.com/apache/arrow-datafusion/issues/6263 -#[ignore] #[tokio::test] -async fn support_limit_subquery() -> Result<()> { +async fn exists_subquery_with_select_null() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", true)?; - let sql = "SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 1)"; + let sql = "SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT NULL)"; let msg = format!("Creating logical plan for '{sql}'"); let dataframe = ctx.sql(sql).await.expect(&msg); let plan = dataframe.into_optimized_plan()?; + // decorrelated, limit is removed let expected = vec![ "Filter: EXISTS () [t1_id:UInt32;N, t1_name:Utf8;N]", - " Subquery: [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Limit: skip=0, fetch=1 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Projection: t2.t2_id, t2.t2_name, t2.t2_int [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " Filter: t2.t2_id = outer_ref(t1.t1_id) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", - " TableScan: t2 [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", + " Subquery: [NULL:Null;N]", + " Projection: NULL [NULL:Null;N]", + " EmptyRelation []", " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -633,11 +785,127 @@ async fn support_limit_subquery() -> Result<()> { "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" ); + Ok(()) +} + +#[tokio::test] +async fn exists_subquery_with_limit() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 1)"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // de-correlated, limit is removed + let expected = vec![ + "LeftSemi Join: t1.t1_id = 
__correlated_sq_1.t2_id [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " SubqueryAlias: __correlated_sq_1 [t2_id:UInt32;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+---------+", + "| t1_id | t1_name |", + "+-------+---------+", + "| 11 | a |", + "| 22 | b |", + "| 44 | d |", + "+-------+---------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn exists_subquery_with_limit0() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 0)"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // de-correlated, limit is removed and replaced with EmptyRelation + let expected = vec![ + "LeftSemi Join: t1.t1_id = __correlated_sq_1.t2_id [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " EmptyRelation [t2_id:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec!["++", "++"]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn not_exists_subquery_with_limit0() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", 
true)?; + + let sql = "SELECT t1_id, t1_name FROM t1 WHERE NOT EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 0)"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // de-correlated, limit is removed and replaced with EmptyRelation + let expected = vec![ + "LeftAnti Join: t1.t1_id = __correlated_sq_1.t2_id [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " EmptyRelation [t2_id:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+---------+", + "| t1_id | t1_name |", + "+-------+---------+", + "| 11 | a |", + "| 22 | b |", + "| 33 | c |", + "| 44 | d |", + "+-------+---------+", + ]; + assert_batches_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn in_correlated_subquery_with_limit() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + let sql = "SELECT t1_id, t1_name FROM t1 WHERE t1_id in (SELECT t2_id FROM t2 where t1_name = t2_name limit 10)"; let msg = format!("Creating logical plan for '{sql}'"); let dataframe = ctx.sql(sql).await.expect(&msg); let plan = dataframe.into_optimized_plan()?; + // not de-correlated let expected = vec![ "Filter: t1.t1_id IN () [t1_id:UInt32;N, t1_name:Utf8;N]", " Subquery: [t2_id:UInt32;N]", @@ -657,6 +925,34 @@ async fn support_limit_subquery() -> Result<()> { Ok(()) } +#[tokio::test] +async fn in_non_correlated_subquery_with_limit() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = + "SELECT t1_id, t1_name FROM t1 WHERE t1_id in (SELECT t2_id FROM t2 limit 10)"; + let msg = 
format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // de-correlated, limit is kept + let expected = vec![ + "LeftSemi Join: t1.t1_id = __correlated_sq_1.t2_id [t1_id:UInt32;N, t1_name:Utf8;N]", + " TableScan: t1 projection=[t1_id, t1_name] [t1_id:UInt32;N, t1_name:Utf8;N]", + " SubqueryAlias: __correlated_sq_1 [t2_id:UInt32;N]", + " Limit: skip=0, fetch=10 [t2_id:UInt32;N]", + " TableScan: t2 projection=[t2_id], fetch=10 [t2_id:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + Ok(()) +} + #[tokio::test] async fn support_union_subquery() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", true)?; @@ -702,11 +998,10 @@ async fn simple_uncorrelated_scalar_subquery() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ - "Projection: __scalar_sq_1.__value AS b [b:Int64;N]", - " SubqueryAlias: __scalar_sq_1 [__value:Int64;N]", - " Projection: COUNT(UInt8(1)) AS __value [__value:Int64;N]", - " Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]] [COUNT(UInt8(1)):Int64;N]", - " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + "Projection: __scalar_sq_1.COUNT(UInt8(1)) AS b [b:Int64;N]", + " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N]", + " Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]] [COUNT(UInt8(1)):Int64;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -734,16 +1029,14 @@ async fn simple_uncorrelated_scalar_subquery2() -> Result<()> { let plan = dataframe.into_optimized_plan()?; let expected = vec![ - "Projection: __scalar_sq_1.__value AS b, __scalar_sq_2.__value AS c [b:Int64;N, c:Int64;N]", 
- " CrossJoin: [__value:Int64;N, __value:Int64;N]", - " SubqueryAlias: __scalar_sq_1 [__value:Int64;N]", - " Projection: COUNT(UInt8(1)) AS __value [__value:Int64;N]", - " Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]] [COUNT(UInt8(1)):Int64;N]", - " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", - " SubqueryAlias: __scalar_sq_2 [__value:Int64;N]", - " Projection: COUNT(Int64(1)) AS __value [__value:Int64;N]", - " Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]] [COUNT(Int64(1)):Int64;N]", - " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + "Projection: __scalar_sq_1.COUNT(UInt8(1)) AS b, __scalar_sq_2.COUNT(Int64(1)) AS c [b:Int64;N, c:Int64;N]", + " CrossJoin: [COUNT(UInt8(1)):Int64;N, COUNT(Int64(1)):Int64;N]", + " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N]", + " Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]] [COUNT(UInt8(1)):Int64;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + " SubqueryAlias: __scalar_sq_2 [COUNT(Int64(1)):Int64;N]", + " Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]] [COUNT(Int64(1)):Int64;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; let formatted = plan.display_indent_schema().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs index a4875d5cbf338..b6eca18fe5ecb 100644 --- a/datafusion/core/tests/tpcds_planning.rs +++ b/datafusion/core/tests/tpcds_planning.rs @@ -557,7 +557,6 @@ async fn tpcds_physical_q5() -> Result<()> { create_physical_plan(5).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q6() -> Result<()> { create_physical_plan(6).await @@ -568,13 +567,11 @@ async fn tpcds_physical_q7() -> Result<()> { create_physical_plan(7).await } -#[ignore] // The type of Int32 = Int64 of binary physical should be same #[tokio::test] async fn tpcds_physical_q8() -> Result<()> { 
create_physical_plan(8).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q9() -> Result<()> { create_physical_plan(9).await @@ -601,7 +598,6 @@ async fn tpcds_physical_q13() -> Result<()> { create_physical_plan(13).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q14() -> Result<()> { create_physical_plan(14).await @@ -647,7 +643,6 @@ async fn tpcds_physical_q22() -> Result<()> { create_physical_plan(22).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q23() -> Result<()> { create_physical_plan(23).await @@ -755,7 +750,6 @@ async fn tpcds_physical_q43() -> Result<()> { create_physical_plan(43).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q44() -> Result<()> { create_physical_plan(44).await @@ -807,7 +801,7 @@ async fn tpcds_physical_q53() -> Result<()> { create_physical_plan(53).await } -#[ignore] // Physical plan does not support logical expression () +//#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q54() -> Result<()> { create_physical_plan(54).await @@ -828,7 +822,6 @@ async fn tpcds_physical_q57() -> Result<()> { create_physical_plan(57).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q58() -> Result<()> { create_physical_plan(58).await @@ -859,7 +852,6 @@ async fn tpcds_physical_q63() -> Result<()> { create_physical_plan(63).await } -#[ignore] // thread 'q64' has overflowed its stack #[tokio::test] async fn tpcds_physical_q64() -> Result<()> { create_physical_plan(64).await @@ -965,7 +957,6 @@ async fn tpcds_physical_q84() -> Result<()> { create_physical_plan(84).await } -#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn 
tpcds_physical_q85() -> Result<()> { create_physical_plan(85).await diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index d95edb7e7c3a4..f1ef72d3148bc 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -17,6 +17,7 @@ use crate::expr::InSubquery; use crate::expr::{Exists, Placeholder}; +use crate::expr_rewriter::unnormalize_col; ///! Logical plan types use crate::logical_plan::display::{GraphvizVisitor, IndentVisitor}; use crate::logical_plan::extension::UserDefinedLogicalNode; @@ -402,6 +403,78 @@ impl LogicalPlan { Ok(using_columns) } + pub fn head_output_expr(&self) -> Result<Option<Expr>> { + match self { + LogicalPlan::Projection(projection) => { + Ok(Some(projection.expr.as_slice()[0].clone())) + } + LogicalPlan::Aggregate(agg) => { + if agg.group_expr.is_empty() { + Ok(Some(agg.aggr_expr.as_slice()[0].clone())) + } else { + Ok(Some(agg.group_expr.as_slice()[0].clone())) + } + } + LogicalPlan::Filter(filter) => filter.input.head_output_expr(), + LogicalPlan::Distinct(distinct) => distinct.input.head_output_expr(), + LogicalPlan::Sort(sort) => sort.input.head_output_expr(), + LogicalPlan::Limit(limit) => limit.input.head_output_expr(), + LogicalPlan::Join(Join { + left, + right, + join_type, + .. 
+ }) => match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => { + if left.schema().fields().is_empty() { + right.head_output_expr() + } else { + left.head_output_expr() + } + } + JoinType::LeftSemi | JoinType::LeftAnti => left.head_output_expr(), + JoinType::RightSemi | JoinType::RightAnti => right.head_output_expr(), + }, + LogicalPlan::CrossJoin(cross) => { + if cross.left.schema().fields().is_empty() { + cross.right.head_output_expr() + } else { + cross.left.head_output_expr() + } + } + LogicalPlan::Repartition(repartition) => repartition.input.head_output_expr(), + LogicalPlan::Window(window) => window.input.head_output_expr(), + LogicalPlan::Union(union) => Ok(Some(Expr::Column( + union.schema.fields()[0].qualified_column(), + ))), + LogicalPlan::TableScan(table) => Ok(Some(Expr::Column( + table.projected_schema.fields()[0].qualified_column(), + ))), + LogicalPlan::SubqueryAlias(subquery_alias) => { + let expr_opt = subquery_alias.input.head_output_expr()?; + Ok(expr_opt.map(|expr| { + let col_name = format!("{:?}", unnormalize_col(expr)); + Expr::Column(Column::new( + Some(subquery_alias.alias.clone()), + col_name, + )) + })) + } + LogicalPlan::Subquery(_) => Ok(None), + LogicalPlan::EmptyRelation(_) + | LogicalPlan::Prepare(_) + | LogicalPlan::Statement(_) + | LogicalPlan::Values(_) + | LogicalPlan::Explain(_) + | LogicalPlan::Analyze(_) + | LogicalPlan::Extension(_) + | LogicalPlan::Dml(_) + | LogicalPlan::Ddl(_) + | LogicalPlan::DescribeTable(_) + | LogicalPlan::Unnest(_) => Ok(None), + } + } + pub fn with_new_inputs(&self, inputs: &[LogicalPlan]) -> Result<LogicalPlan> { from_plan(self, &self.expressions(), inputs) } diff --git a/datafusion/optimizer/src/analyzer/subquery.rs b/datafusion/optimizer/src/analyzer/subquery.rs index 7cdedc06b4530..e946cc22b7b91 100644 --- a/datafusion/optimizer/src/analyzer/subquery.rs +++ b/datafusion/optimizer/src/analyzer/subquery.rs @@ -100,6 +100,16 @@ pub fn check_subquery_expr( } 
check_correlations_in_subquery(inner_plan, true) } else { + if let Expr::InSubquery(subquery) = expr { + // InSubquery should only return one column + if subquery.subquery.subquery.schema().fields().len() > 1 { + return Err(datafusion_common::DataFusionError::Plan(format!( + "InSubquery should only return one column, but found {}: {}", + subquery.subquery.subquery.schema().fields().len(), + subquery.subquery.subquery.schema().field_names().join(", "), + ))); + } + } match outer_plan { LogicalPlan::Projection(_) | LogicalPlan::Filter(_) diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 8630c606499b3..bfa5bbba32f8b 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -18,19 +18,20 @@ use crate::alias::AliasGenerator; use crate::optimizer::ApplyOrder; use crate::utils::{ - collect_subquery_cols, conjunction, extract_join_filters, only_or_err, - replace_qualified_name, split_conjunction, + conjunction, replace_qualified_name, split_conjunction, PullUpCorrelatedExpr, }; use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::{context, Column, DataFusionError, Result}; +use datafusion_common::tree_node::TreeNode; +use datafusion_common::{Column, DataFusionError, Result}; use datafusion_expr::expr::{Exists, InSubquery}; use datafusion_expr::expr_rewriter::unnormalize_col; -use datafusion_expr::logical_plan::{JoinType, Projection, Subquery}; +use datafusion_expr::logical_plan::{JoinType, Subquery}; use datafusion_expr::{ - exists, in_subquery, not_exists, not_in_subquery, BinaryExpr, Distinct, Expr, Filter, + exists, in_subquery, not_exists, not_in_subquery, BinaryExpr, Expr, Filter, LogicalPlan, LogicalPlanBuilder, Operator, }; use log::debug; +use std::collections::BTreeSet; use std::ops::Deref; use std::sync::Arc; @@ -200,29 +201,82 @@ fn build_join( left: &LogicalPlan, alias: 
&AliasGenerator, ) -> Result<Option<LogicalPlan>> { - let in_predicate = query_info - .where_in_expr + let where_in_expr_opt = &query_info.where_in_expr; + let in_predicate_opt = where_in_expr_opt .clone() - .map(|in_expr| { - let projection = Projection::try_from_plan(&query_info.query.subquery) - .map_err(|e| context!("a projection is required", e))?; - // TODO add the validate logic to Analyzer - let subquery_expr = only_or_err(projection.expr.as_slice()) - .map_err(|e| context!("single expression projection required", e))?; - - // in_predicate may be also include in the join filters - Ok(Expr::eq(in_expr, subquery_expr.clone())) + .map(|where_in_expr| { + query_info.query.subquery.head_output_expr()?.map_or( + Err(DataFusionError::Plan( + "single expression required.".to_string(), + )), + |expr| Ok(Expr::eq(where_in_expr, expr)), + ) }) - .map_or(Ok(None), |v: Result<Expr>| v.map(Some))?; + .map_or(Ok(None), |v| v.map(Some))?; let subquery = query_info.query.subquery.as_ref(); let subquery_alias = alias.next("__correlated_sq"); - if let Some((join_filter, subquery_plan)) = - pull_up_correlated_expr(subquery, in_predicate, &subquery_alias)? - { - let sub_query_alias = LogicalPlanBuilder::from(subquery_plan) - .alias(subquery_alias.clone())? - .build()?; + + let mut pull_up = PullUpCorrelatedExpr { + join_filters: vec![], + correlated_subquery_cols_map: Default::default(), + in_predicate_opt: in_predicate_opt.clone(), + exists_sub_query: in_predicate_opt.is_none(), + can_pull_up: true, + }; + let new_plan = subquery.clone().rewrite(&mut pull_up)?; + if !pull_up.can_pull_up { + return Ok(None); + } + + let sub_query_alias = LogicalPlanBuilder::from(new_plan) + .alias(subquery_alias.to_string())? 
+ .build()?; + let mut all_correlated_cols = BTreeSet::new(); + pull_up + .correlated_subquery_cols_map + .values() + .for_each(|cols| all_correlated_cols.extend(cols.clone())); + + // alias the join filter + let join_filter_opt = + conjunction(pull_up.join_filters).map_or(Ok(None), |filter| { + replace_qualified_name(filter, &all_correlated_cols, &subquery_alias) + .map(Option::Some) + })?; + + if let Some(join_filter) = match (join_filter_opt, in_predicate_opt) { + ( + Some(join_filter), + Some(Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right, + })), + ) => { + let right_expr_name = format!("{:?}", unnormalize_col(right.deref().clone())); + let right_col = + Column::new(Some(subquery_alias), right_expr_name); + let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); + Some(in_predicate.and(join_filter)) + } + (Some(join_filter), _) => Some(join_filter), + ( + _, + Some(Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right, + })), + ) => { + let right_expr_name = format!("{:?}", unnormalize_col(right.deref().clone())); + let right_col = + Column::new(Some(subquery_alias), right_expr_name); + let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); + Some(in_predicate) + } + _ => None, + } { // join our sub query into the main plan let join_type = match query_info.negated { true => JoinType::LeftAnti, @@ -246,141 +300,6 @@ fn build_join( } } -/// This function pull up the correlated expressions(contains outer reference columns) from the inner subquery's [Filter]. -/// It adds the inner reference columns to the [Projection] of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. -/// -/// This function can't handle the non-correlated subquery, and will return None. 
-fn pull_up_correlated_expr( - subquery: &LogicalPlan, - in_predicate_opt: Option, - subquery_alias: &str, -) -> Result> { - match subquery { - LogicalPlan::Distinct(subqry_distinct) => { - let distinct_input = &subqry_distinct.input; - let optimized_plan = pull_up_correlated_expr( - distinct_input, - in_predicate_opt, - subquery_alias, - )? - .map(|(filters, right)| { - ( - filters, - LogicalPlan::Distinct(Distinct { - input: Arc::new(right), - }), - ) - }); - Ok(optimized_plan) - } - LogicalPlan::Projection(projection) => { - // extract join filters from the inner subquery's Filter - let (mut join_filters, subquery_input) = - extract_join_filters(&projection.input)?; - if in_predicate_opt.is_none() && join_filters.is_empty() { - // cannot rewrite non-correlated subquery - return Ok(None); - } - - if let Some(in_predicate) = &in_predicate_opt { - // in_predicate may be already included in the join filters, remove it from the join filters first. - join_filters = remove_duplicated_filter(join_filters, in_predicate); - } - let input_schema = subquery_input.schema(); - let correlated_subquery_cols = - collect_subquery_cols(&join_filters, input_schema.clone())?; - - // add missing columns to projection - let mut project_exprs: Vec = - if let Some(Expr::BinaryExpr(BinaryExpr { - left: _, - op: Operator::Eq, - right, - })) = &in_predicate_opt - { - if !matches!(right.deref(), Expr::Column(_)) { - vec![right.deref().clone().alias(format!( - "{:?}", - unnormalize_col(right.deref().clone()) - ))] - } else { - vec![right.deref().clone()] - } - } else { - vec![] - }; - // the inner reference cols need to added to the projection if they are missing. 
- for col in correlated_subquery_cols.iter() { - let col_expr = Expr::Column(col.clone()); - if !project_exprs.contains(&col_expr) { - project_exprs.push(col_expr) - } - } - - // alias the join filter - let join_filter_opt = - conjunction(join_filters).map_or(Ok(None), |filter| { - replace_qualified_name( - filter, - &correlated_subquery_cols, - subquery_alias, - ) - .map(Option::Some) - })?; - - let join_filter = if let Some(Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right, - })) = in_predicate_opt - { - let right_expr_name = - format!("{:?}", unnormalize_col(right.deref().clone())); - let right_col = - Column::new(Some(subquery_alias.to_string()), right_expr_name); - let in_predicate = - Expr::eq(left.deref().clone(), Expr::Column(right_col)); - join_filter_opt - .map(|filter| in_predicate.clone().and(filter)) - .unwrap_or_else(|| in_predicate) - } else { - join_filter_opt.ok_or_else(|| { - DataFusionError::Internal( - "join filters should not be empty".to_string(), - ) - })? - }; - - let right = LogicalPlanBuilder::from(subquery_input) - .project(project_exprs)? - .build()?; - Ok(Some((join_filter, right))) - } - _ => Ok(None), - } -} - -fn remove_duplicated_filter(filters: Vec, in_predicate: &Expr) -> Vec { - filters - .into_iter() - .filter(|filter| { - if filter == in_predicate { - return false; - } - - // ignore the binary order - !match (filter, in_predicate) { - (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { - (a_expr.op == b_expr.op) - && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) - || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) - } - _ => false, - } - }) - .collect::>() -} - struct SubqueryInfo { query: Subquery, where_in_expr: Option, @@ -914,11 +833,11 @@ mod tests { .build()?; // Maybe okay if the table only has a single column? 
- assert_optimizer_err( - Arc::new(DecorrelatePredicateSubquery::new()), - &plan, - "a projection is required", - ); + let expected = "check_analyzed_plan\ + \ncaused by\ + \nError during planning: InSubquery should only return one column, but found 4"; + assert_analyzer_check_err(vec![], &plan, expected); + Ok(()) } @@ -976,8 +895,8 @@ mod tests { let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey + Int32(1) AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey + Int32(1):Int64, o_custkey:Int64]\ - \n Projection: orders.o_custkey + Int32(1) AS o_custkey + Int32(1), orders.o_custkey [o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey + Int32(1):Int64, orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n Projection: orders.o_custkey + Int32(1) AS o_custkey + Int32(1), orders.o_custkey + Int32(1), orders.o_custkey [o_custkey + Int32(1):Int64, orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_optimized_plan_eq_display_indent( @@ -1009,11 +928,11 @@ mod tests { .project(vec![col("customer.c_custkey")])? 
.build()?; - assert_optimizer_err( - Arc::new(DecorrelatePredicateSubquery::new()), - &plan, - "single expression projection required", - ); + let expected = "check_analyzed_plan\ + \ncaused by\ + \nError during planning: InSubquery should only return one column"; + assert_analyzer_check_err(vec![], &plan, expected); + Ok(()) } @@ -1187,8 +1106,8 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2) [c * UInt32(2):UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32]\ + \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.c * UInt32(2) [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_eq_display_indent( @@ -1221,8 +1140,8 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.a [c * UInt32(2):UInt32, a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.c * UInt32(2), sq.a [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32]\ \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; @@ -1257,8 +1176,8 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a + test.b = __correlated_sq_1.a + 
__correlated_sq_1.b [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.a, sq.b [c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ + \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.c * UInt32(2), sq.a, sq.b [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; @@ -1301,11 +1220,11 @@ mod tests { \n LeftSemi Join: Filter: test.c * UInt32(2) = __correlated_sq_2.c * UInt32(2) AND test.a > __correlated_sq_2.a [a:UInt32, b:UInt32, c:UInt32]\ \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a > __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq1.c * UInt32(2) AS c * UInt32(2), sq1.a [c * UInt32(2):UInt32, a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq1.c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq1.c * UInt32(2) AS c * UInt32(2), sq1.c * UInt32(2), sq1.a [c * UInt32(2):UInt32, sq1.c * UInt32(2):UInt32, a:UInt32]\ \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_2 [c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq2.c * UInt32(2) AS c * UInt32(2), sq2.a [c * UInt32(2):UInt32, a:UInt32]\ + \n SubqueryAlias: __correlated_sq_2 [c * UInt32(2):UInt32, sq2.c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq2.c * UInt32(2) AS c * UInt32(2), sq2.c * UInt32(2), sq2.a [c * UInt32(2):UInt32, sq2.c * UInt32(2):UInt32, a:UInt32]\ \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_eq_display_indent( @@ -1466,12 
+1385,11 @@ mod tests { .build()?; // Other rule will pushdown `customer.c_custkey = 1`, - // TODO revisit the logic, is it a valid physical plan when no cols in projection? let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n LeftSemi Join: Filter: customer.c_custkey = UInt32(1) [c_custkey:Int64, c_name:Utf8]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 []\ - \n Projection: []\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_optimized_plan_equal(&plan, expected) @@ -1585,7 +1503,10 @@ mod tests { fn exists_subquery_no_projection() -> Result<()> { let sq = Arc::new( LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .eq(col("orders.o_custkey")), + )? .build()?, ); @@ -1594,7 +1515,13 @@ mod tests { .project(vec![col("customer.c_custkey")])? 
.build()?; - assert_optimization_skipped(Arc::new(DecorrelatePredicateSubquery::new()), &plan) + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_equal(&plan, expected) } /// Test for correlated exists expressions @@ -1618,8 +1545,8 @@ mod tests { let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n SubqueryAlias: __correlated_sq_1 [orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n Projection: orders.o_custkey + Int32(1), orders.o_custkey [orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_optimized_plan_equal(&plan, expected) @@ -1698,8 +1625,8 @@ mod tests { let expected = "Projection: test.c [c:UInt32]\ \n LeftSemi Join: Filter: test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32, a:UInt32]\ + \n Projection: sq.c, sq.a [c:UInt32, a:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) @@ -1758,11 +1685,11 @@ mod tests { \n LeftSemi Join: Filter: test.a = __correlated_sq_2.a [a:UInt32, b:UInt32, 
c:UInt32]\ \n LeftSemi Join: Filter: test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Projection: sq1.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32, a:UInt32]\ + \n Projection: sq1.c, sq1.a [c:UInt32, a:UInt32]\ \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_2 [a:UInt32]\ - \n Projection: sq2.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_2 [c:UInt32, a:UInt32]\ + \n Projection: sq2.c, sq2.a [c:UInt32, a:UInt32]\ \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) @@ -1787,8 +1714,8 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: UInt32(1) + __correlated_sq_1.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [UInt32(1):UInt32, a:UInt32]\ + \n Projection: UInt32(1), sq.a [UInt32(1):UInt32, a:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) @@ -1840,9 +1767,9 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: UInt32(1) + __correlated_sq_1.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Distinct: [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32, a:UInt32]\ + \n Distinct: [c:UInt32, a:UInt32]\ + \n Projection: sq.c, sq.a [c:UInt32, a:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) @@ -1868,9 +1795,9 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: UInt32(1) + __correlated_sq_1.a > test.a * UInt32(2) 
[a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Distinct: [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [sq.b + sq.c:UInt32, a:UInt32]\ + \n Distinct: [sq.b + sq.c:UInt32, a:UInt32]\ + \n Projection: sq.b + sq.c, sq.a [sq.b + sq.c:UInt32, a:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) @@ -1896,9 +1823,9 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n LeftSemi Join: Filter: UInt32(1) + __correlated_sq_1.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Distinct: [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [UInt32(1):UInt32, c:UInt32, a:UInt32]\ + \n Distinct: [UInt32(1):UInt32, c:UInt32, a:UInt32]\ + \n Projection: UInt32(1), sq.c, sq.a [UInt32(1):UInt32, c:UInt32, a:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(&plan, expected) diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 26f86c607a22b..4d8002c1a9129 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -17,16 +17,13 @@ use crate::alias::AliasGenerator; use crate::optimizer::ApplyOrder; -use crate::utils::{ - collect_subquery_cols, conjunction, extract_join_filters, only_or_err, - replace_qualified_name, -}; +use crate::utils::{conjunction, replace_qualified_name, PullUpCorrelatedExpr}; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}; -use datafusion_common::{context, Column, Result}; +use datafusion_common::{Column, DataFusionError, Result}; use datafusion_expr::logical_plan::{JoinType, Subquery}; 
use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; -use log::debug; +use std::collections::BTreeSet; use std::sync::Arc; /// Optimizer rule for rewriting subquery filters to joins @@ -81,7 +78,7 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut cur_input = filter.input.as_ref().clone(); for (subquery, alias) in subqueries { if let Some(optimized_subquery) = - optimize_scalar(&subquery, &cur_input, &alias)? + build_join(&subquery, &cur_input, &alias)? { cur_input = optimized_subquery; } else { @@ -89,8 +86,9 @@ impl OptimizerRule for ScalarSubqueryToJoin { return Ok(None); } } - let new_plan = LogicalPlanBuilder::from(cur_input); - Ok(Some(new_plan.filter(expr)?.build()?)) + let new_plan = + LogicalPlanBuilder::from(cur_input).filter(expr)?.build()?; + Ok(Some(new_plan)) } LogicalPlan::Projection(projection) => { let mut all_subqueryies = vec![]; @@ -109,7 +107,7 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut cur_input = projection.input.as_ref().clone(); for (subquery, alias) in all_subqueryies { if let Some(optimized_subquery) = - optimize_scalar(&subquery, &cur_input, &alias)? + build_join(&subquery, &cur_input, &alias)? { cur_input = optimized_subquery; } else { @@ -117,8 +115,10 @@ impl OptimizerRule for ScalarSubqueryToJoin { return Ok(None); } } - let new_plan = LogicalPlanBuilder::from(cur_input); - Ok(Some(new_plan.project(rewrite_exprs)?.build()?)) + let new_plan = LogicalPlanBuilder::from(cur_input) + .project(rewrite_exprs)? 
+ .build()?; + Ok(Some(new_plan)) } _ => Ok(None), @@ -153,9 +153,26 @@ impl TreeNodeRewriter for ExtractScalarSubQuery { match expr { Expr::ScalarSubquery(subquery) => { let subqry_alias = self.alias_gen.next("__scalar_sq"); - self.sub_query_info.push((subquery, subqry_alias.clone())); - let scalar_column = "__value"; - Ok(Expr::Column(Column::new(Some(subqry_alias), scalar_column))) + self.sub_query_info + .push((subquery.clone(), subqry_alias.clone())); + let scalar_expr = subquery.subquery.head_output_expr()?.map_or( + Err(DataFusionError::Plan( + "single expression required.".to_string(), + )), + Ok, + )?; + match scalar_expr { + Expr::Alias(_, alias) => { + Ok(Expr::Column(Column::new(Some(subqry_alias), alias))) + } + Expr::Column(Column { relation: _, name }) => { + Ok(Expr::Column(Column::new(Some(subqry_alias), name))) + } + _ => { + let scalar_column = scalar_expr.display_name()?; + Ok(Expr::Column(Column::new(Some(subqry_alias), scalar_column))) + } + } } _ => Ok(expr), } @@ -198,82 +215,51 @@ impl TreeNodeRewriter for ExtractScalarSubQuery { /// * `filter_input` - The non-subquery portion (from customers) /// * `outer_others` - Any additional parts to the `where` expression (and c.x = y) /// * `subquery_alias` - Subquery aliases -fn optimize_scalar( +fn build_join( subquery: &Subquery, filter_input: &LogicalPlan, subquery_alias: &str, ) -> Result> { let subquery_plan = subquery.subquery.as_ref(); - let proj = match &subquery_plan { - LogicalPlan::Projection(proj) => proj, - _ => { - // this rule does not support this type of scalar subquery - // TODO support more types - debug!( - "cannot translate this type of scalar subquery to a join: {}", - subquery_plan.display_indent() - ); - return Ok(None); - } - }; - let proj = only_or_err(proj.expr.as_slice()) - .map_err(|e| context!("exactly one expression should be projected", e))?; - let proj = Expr::Alias(Box::new(proj.clone()), "__value".to_string()); - let sub_inputs = subquery_plan.inputs(); - let 
sub_input = only_or_err(sub_inputs.as_slice()) - .map_err(|e| context!("Exactly one input is expected. Is this a join?", e))?; - - let aggr = match sub_input { - LogicalPlan::Aggregate(aggr) => aggr, - _ => { - // this rule does not support this type of scalar subquery - // TODO support more types - debug!( - "cannot translate this type of scalar subquery to a join: {}", - subquery_plan.display_indent() - ); - return Ok(None); - } + let mut pull_up = PullUpCorrelatedExpr { + join_filters: vec![], + correlated_subquery_cols_map: Default::default(), + in_predicate_opt: None, + exists_sub_query: false, + can_pull_up: true, }; + let new_plan = subquery_plan.clone().rewrite(&mut pull_up)?; + if !pull_up.can_pull_up { + return Ok(None); + } - // extract join filters - let (join_filters, subquery_input) = extract_join_filters(&aggr.input)?; - // Only operate if one column is present and the other closed upon from outside scope - let input_schema = subquery_input.schema(); - let subqry_cols = collect_subquery_cols(&join_filters, input_schema.clone())?; - let join_filter = conjunction(join_filters).map_or(Ok(None), |filter| { - replace_qualified_name(filter, &subqry_cols, subquery_alias).map(Option::Some) - })?; - - let group_by: Vec<_> = subqry_cols - .iter() - .map(|it| Expr::Column(it.clone())) - .collect(); - let subqry_plan = LogicalPlanBuilder::from(subquery_input); - - // project the prior projection + any correlated (and now grouped) columns - let proj: Vec<_> = group_by - .iter() - .cloned() - .chain(vec![proj].iter().cloned()) - .collect(); - let subqry_plan = subqry_plan - .aggregate(group_by, aggr.aggr_expr.clone())? - .project(proj)? + let sub_query_alias = LogicalPlanBuilder::from(new_plan) .alias(subquery_alias.to_string())? 
.build()?; + let mut all_correlated_cols = BTreeSet::new(); + pull_up + .correlated_subquery_cols_map + .values() + .for_each(|cols| all_correlated_cols.extend(cols.clone())); + + // alias the join filter + let join_filter_opt = + conjunction(pull_up.join_filters).map_or(Ok(None), |filter| { + replace_qualified_name(filter, &all_correlated_cols, subquery_alias) + .map(Option::Some) + })?; // join our sub query into the main plan - let new_plan = if join_filter.is_none() { + let new_plan = if join_filter_opt.is_none() { match filter_input { LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: true, schema: _, - }) => subqry_plan, + }) => sub_query_alias, _ => { // if not correlated, group down to 1 row and cross join on that (preserving row count) LogicalPlanBuilder::from(filter_input.clone()) - .cross_join(subqry_plan)? + .cross_join(sub_query_alias)? .build()? } } @@ -281,23 +267,19 @@ fn optimize_scalar( // left join if correlated, grouping by the join keys so we don't change row count LogicalPlanBuilder::from(filter_input.clone()) .join( - subqry_plan, + sub_query_alias, JoinType::Left, (Vec::::new(), Vec::::new()), - join_filter, + join_filter_opt, )? .build()? 
}; - Ok(Some(new_plan)) } #[cfg(test)] mod tests { use super::*; - use crate::eliminate_cross_join::EliminateCrossJoin; - use crate::eliminate_outer_join::EliminateOuterJoin; - use crate::extract_equijoin_predicate::ExtractEquijoinPredicate; use crate::test::*; use arrow::datatypes::DataType; use datafusion_common::Result; @@ -337,24 +319,20 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: Int32(1) < __scalar_sq_1.__value AND Int32(1) < __scalar_sq_2.__value [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N, o_custkey:Int64, __value:Int64;N]\ - \n Inner Join: customer.c_custkey = __scalar_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N, o_custkey:Int64, __value:Int64;N]\ - \n Left Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: Int32(1) < __scalar_sq_1.MAX(orders.o_custkey) AND Int32(1) < __scalar_sq_2.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: __scalar_sq_2.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: __scalar_sq_1.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], 
aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n SubqueryAlias: __scalar_sq_2 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_2 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(EliminateOuterJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -396,25 +374,21 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_acctbal < __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, o_custkey:Int64, __value:Float64;N]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64, __value:Float64;N]\ + \n Filter: customer.c_acctbal < __scalar_sq_1.SUM(orders.o_totalprice) [c_custkey:Int64, c_name:Utf8, SUM(orders.o_totalprice):Float64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: __scalar_sq_1.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8, SUM(orders.o_totalprice):Float64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Float64;N]\ - \n Projection: orders.o_custkey, SUM(orders.o_totalprice) AS __value [o_custkey:Int64, __value:Float64;N]\ + \n SubqueryAlias: __scalar_sq_1 
[SUM(orders.o_totalprice):Float64;N, o_custkey:Int64]\ + \n Projection: SUM(orders.o_totalprice), orders.o_custkey [SUM(orders.o_totalprice):Float64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[SUM(orders.o_totalprice)]] [o_custkey:Int64, SUM(orders.o_totalprice):Float64;N]\ - \n Filter: orders.o_totalprice < __scalar_sq_2.__value [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, l_orderkey:Int64;N, __value:Float64;N]\ - \n Inner Join: orders.o_orderkey = __scalar_sq_2.l_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, l_orderkey:Int64;N, __value:Float64;N]\ + \n Filter: orders.o_totalprice < __scalar_sq_2.SUM(lineitem.l_extendedprice) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, SUM(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N]\ + \n Left Join: Filter: __scalar_sq_2.l_orderkey = orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N, SUM(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n SubqueryAlias: __scalar_sq_2 [l_orderkey:Int64, __value:Float64;N]\ - \n Projection: lineitem.l_orderkey, SUM(lineitem.l_extendedprice) AS __value [l_orderkey:Int64, __value:Float64;N]\ + \n SubqueryAlias: __scalar_sq_2 [SUM(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64]\ + \n Projection: SUM(lineitem.l_extendedprice), lineitem.l_orderkey [SUM(lineitem.l_extendedprice):Float64;N, l_orderkey:Int64]\ \n Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[SUM(lineitem.l_extendedprice)]] [l_orderkey:Int64, SUM(lineitem.l_extendedprice):Float64;N]\ \n TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - 
Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -442,21 +416,17 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey = __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -484,18 +454,15 @@ mod tests { // it will optimize, but fail for the same reason the unoptimized query 
would let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, __value:Int64;N]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [__value:Int64;N]\ - \n Projection: MAX(orders.o_custkey) AS __value [__value:Int64;N]\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ + \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateCrossJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -519,20 +486,17 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, __value:Int64;N]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [__value:Int64;N]\ - \n Projection: MAX(orders.o_custkey) AS __value [__value:Int64;N]\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ - \n Filter: orders.o_custkey = orders.o_custkey [o_orderkey:Int64, 
o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ + \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ + \n Filter: orders.o_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateCrossJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -635,24 +599,6 @@ mod tests { .project(vec![col("customer.c_custkey")])? 
.build()?; - // we expect the plan to be unchanged because this subquery is not supported by this rule - let expected = r#"Projection: customer.c_custkey [c_custkey:Int64] - Filter: customer.c_custkey = () [c_custkey:Int64, c_name:Utf8] - Subquery: [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - Filter: customer.c_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - TableScan: customer [c_custkey:Int64, c_name:Utf8]"#; - - assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], - &plan, - expected, - ); - let expected = "check_analyzed_plan\ \ncaused by\ \nError during planning: Scalar subquery should only return one column"; @@ -680,20 +626,16 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey = __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) + Int32(1) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) + Int32(1) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64]\ + \n 
Projection: MAX(orders.o_custkey) + Int32(1), orders.o_custkey [MAX(orders.o_custkey) + Int32(1):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -750,20 +692,16 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey >= __scalar_sq_1.__value AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: customer.c_custkey >= __scalar_sq_1.MAX(orders.o_custkey) AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; 
assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -793,20 +731,16 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey = __scalar_sq_1.__value AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) AND customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(EliminateOuterJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -837,20 +771,16 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey 
[c_custkey:Int64]\ - \n Filter: customer.c_custkey = __scalar_sq_1.__value OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ - \n Left Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateCrossJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -874,20 +804,16 @@ mod tests { .build()?; let expected = "Projection: test.c [c:UInt32]\ - \n Filter: test.c < __scalar_sq_1.__value [a:UInt32, b:UInt32, c:UInt32, a:UInt32;N, __value:UInt32;N]\ - \n Inner Join: test.a = __scalar_sq_1.a [a:UInt32, b:UInt32, c:UInt32, a:UInt32;N, __value:UInt32;N]\ + \n Filter: test.c < __scalar_sq_1.MIN(sq.c) [a:UInt32, b:UInt32, c:UInt32, MIN(sq.c):UInt32;N, a:UInt32;N]\ + \n Left Join: Filter: test.a = 
__scalar_sq_1.a [a:UInt32, b:UInt32, c:UInt32, MIN(sq.c):UInt32;N, a:UInt32;N]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __scalar_sq_1 [a:UInt32, __value:UInt32;N]\ - \n Projection: sq.a, MIN(sq.c) AS __value [a:UInt32, __value:UInt32;N]\ + \n SubqueryAlias: __scalar_sq_1 [MIN(sq.c):UInt32;N, a:UInt32]\ + \n Projection: MIN(sq.c), sq.a [MIN(sq.c):UInt32;N, a:UInt32]\ \n Aggregate: groupBy=[[sq.a]], aggr=[[MIN(sq.c)]] [a:UInt32, MIN(sq.c):UInt32;N]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -910,20 +836,16 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey < __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, __value:Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, __value:Int64;N]\ + \n Filter: customer.c_custkey < __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [__value:Int64;N]\ - \n Projection: MAX(orders.o_custkey) AS __value [__value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ + \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateCrossJoin::new()), - ], + 
vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -945,19 +867,16 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Inner Join: customer.c_custkey = __scalar_sq_1.__value [c_custkey:Int64, c_name:Utf8, __value:Int64;N]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [__value:Int64;N]\ - \n Projection: MAX(orders.o_custkey) AS __value [__value:Int64;N]\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ + \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateCrossJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -1000,25 +919,21 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey BETWEEN __scalar_sq_1.__value AND __scalar_sq_2.__value [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N, o_custkey:Int64;N, __value:Int64;N]\ - \n Left Join: customer.c_custkey = __scalar_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N, o_custkey:Int64;N, __value:Int64;N]\ - 
\n Left Join: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, o_custkey:Int64;N, __value:Int64;N]\ + \n Filter: customer.c_custkey BETWEEN __scalar_sq_1.MIN(orders.o_custkey) AND __scalar_sq_2.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, o_custkey:Int64;N, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, o_custkey:Int64;N, MAX(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ + \n Left Join: Filter: customer.c_custkey = __scalar_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, o_custkey:Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MIN(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MIN(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MIN(orders.o_custkey), orders.o_custkey [MIN(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MIN(orders.o_custkey)]] [o_custkey:Int64, MIN(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n SubqueryAlias: __scalar_sq_2 [o_custkey:Int64, __value:Int64;N]\ - \n Projection: orders.o_custkey, MAX(orders.o_custkey) AS __value [o_custkey:Int64, __value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_2 [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ + \n Projection: MAX(orders.o_custkey), orders.o_custkey [MAX(orders.o_custkey):Int64;N, o_custkey:Int64]\ \n Aggregate: groupBy=[[orders.o_custkey]], aggr=[[MAX(orders.o_custkey)]] [o_custkey:Int64, MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; 
assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); @@ -1053,25 +968,21 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey BETWEEN __scalar_sq_1.__value AND __scalar_sq_2.__value [c_custkey:Int64, c_name:Utf8, __value:Int64;N, __value:Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, __value:Int64;N, __value:Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, __value:Int64;N]\ + \n Filter: customer.c_custkey BETWEEN __scalar_sq_1.MIN(orders.o_custkey) AND __scalar_sq_2.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, MAX(orders.o_custkey):Int64;N]\ + \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, MAX(orders.o_custkey):Int64;N]\ + \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __scalar_sq_1 [__value:Int64;N]\ - \n Projection: MIN(orders.o_custkey) AS __value [__value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_1 [MIN(orders.o_custkey):Int64;N]\ + \n Projection: MIN(orders.o_custkey) [MIN(orders.o_custkey):Int64;N]\ \n Aggregate: groupBy=[[]], aggr=[[MIN(orders.o_custkey)]] [MIN(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n SubqueryAlias: __scalar_sq_2 [__value:Int64;N]\ - \n Projection: MAX(orders.o_custkey) AS __value [__value:Int64;N]\ + \n SubqueryAlias: __scalar_sq_2 [MAX(orders.o_custkey):Int64;N]\ + \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ \n Aggregate: groupBy=[[]], aggr=[[MAX(orders.o_custkey)]] [MAX(orders.o_custkey):Int64;N]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, 
o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_multi_rules_optimized_plan_eq_display_indent( - vec![ - Arc::new(ScalarSubqueryToJoin::new()), - Arc::new(ExtractEquijoinPredicate::new()), - Arc::new(EliminateOuterJoin::new()), - ], + vec![Arc::new(ScalarSubqueryToJoin::new())], &plan, expected, ); diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs index 266d0a0be7145..fd8dda79c51d1 100644 --- a/datafusion/optimizer/src/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -18,20 +18,23 @@ //! Collection of utility functions that are leveraged by the query optimizer rules use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::tree_node::{TreeNode, TreeNodeRewriter}; +use datafusion_common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}; use datafusion_common::{plan_err, Column, DFSchemaRef}; use datafusion_common::{DFSchema, Result}; use datafusion_expr::expr::{BinaryExpr, Sort}; -use datafusion_expr::expr_rewriter::{replace_col, strip_outer_reference}; +use datafusion_expr::expr_rewriter::{ + replace_col, strip_outer_reference, unnormalize_col, +}; use datafusion_expr::logical_plan::LogicalPlanBuilder; use datafusion_expr::utils::from_plan; use datafusion_expr::{ and, logical_plan::{Filter, LogicalPlan}, - Expr, Operator, + EmptyRelation, Expr, Operator, }; use log::{debug, trace}; use std::collections::{BTreeSet, HashMap}; +use std::ops::Deref; use std::sync::Arc; /// Convenience rule for writing optimizers: recursively invoke @@ -346,29 +349,6 @@ pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema { } } -/// Extract join predicates from the correlated subquery's [Filter] expressions. -/// The join predicate means that the expression references columns -/// from both the subquery and outer table or only from the outer table. -/// -/// Returns join predicates and subquery(extracted). 
-pub(crate) fn extract_join_filters( - maybe_filter: &LogicalPlan, -) -> Result<(Vec, LogicalPlan)> { - if let LogicalPlan::Filter(plan_filter) = maybe_filter { - let subquery_filter_exprs = split_conjunction(&plan_filter.predicate); - let (join_filters, subquery_filters) = find_join_exprs(subquery_filter_exprs)?; - // if the subquery still has filter expressions, restore them. - let mut plan = LogicalPlanBuilder::from((*plan_filter.input).clone()); - if let Some(expr) = conjunction(subquery_filters) { - plan = plan.filter(expr)? - } - - Ok((join_filters, plan.build()?)) - } else { - Ok((vec![], maybe_filter.clone())) - } -} - pub(crate) fn collect_subquery_cols( exprs: &[Expr], subquery_schema: DFSchemaRef, @@ -409,6 +389,235 @@ pub fn log_plan(description: &str, plan: &LogicalPlan) { trace!("{description}::\n{}\n", plan.display_indent_schema()); } +/// This struct rewrite the sub query plan by pull up the correlated expressions(contains outer reference columns) from the inner subquery's [Filter]. +/// It adds the inner reference columns to the [Projection] or [Aggregate] of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. 
+pub struct PullUpCorrelatedExpr { + pub join_filters: Vec, + // map of the plan and its holding correlated columns + pub correlated_subquery_cols_map: HashMap>, + pub in_predicate_opt: Option, + // indicate whether it is Exists(Not Exists) SubQuery + pub exists_sub_query: bool, + // indicate whether the correlated expressions can pull up or not + pub can_pull_up: bool, +} + +impl TreeNodeRewriter for PullUpCorrelatedExpr { + type N = LogicalPlan; + + fn pre_visit(&mut self, plan: &LogicalPlan) -> Result { + match plan { + LogicalPlan::Filter(_) => Ok(RewriteRecursion::Continue), + LogicalPlan::Union(_) | LogicalPlan::Sort(_) | LogicalPlan::Extension(_) => { + let plan_hold_outer = !plan.all_out_ref_exprs().is_empty(); + if plan_hold_outer { + // the unsupported case + self.can_pull_up = false; + Ok(RewriteRecursion::Stop) + } else { + Ok(RewriteRecursion::Continue) + } + } + LogicalPlan::Limit(_) => { + let plan_hold_outer = !plan.all_out_ref_exprs().is_empty(); + match (self.exists_sub_query, plan_hold_outer) { + (false, true) => { + // the unsupported case + self.can_pull_up = false; + Ok(RewriteRecursion::Stop) + } + _ => Ok(RewriteRecursion::Continue), + } + } + _ if plan.expressions().iter().any(|expr| expr.contains_outer()) => { + // the unsupported cases, the plan expressions contain out reference columns(like window expressions or agg expressions) + self.can_pull_up = false; + Ok(RewriteRecursion::Stop) + } + _ => Ok(RewriteRecursion::Continue), + } + } + + fn mutate(&mut self, plan: LogicalPlan) -> Result { + let subquery_schema = plan.schema().clone(); + match &plan { + LogicalPlan::Filter(plan_filter) => { + let subquery_filter_exprs = split_conjunction(&plan_filter.predicate); + let (mut join_filters, subquery_filters) = + find_join_exprs(subquery_filter_exprs)?; + if let Some(in_predicate) = &self.in_predicate_opt { + // in_predicate may be already included in the join filters, remove it from the join filters first. 
+ join_filters = remove_duplicated_filter(join_filters, in_predicate); + } + let correlated_subquery_cols = + collect_subquery_cols(&join_filters, subquery_schema)?; + for expr in join_filters { + if !self.join_filters.contains(&expr) { + self.join_filters.push(expr) + } + } + // if the subquery still has filter expressions, restore them. + let mut plan = LogicalPlanBuilder::from((*plan_filter.input).clone()); + if let Some(expr) = conjunction(subquery_filters) { + plan = plan.filter(expr)? + } + let new_plan = plan.build()?; + self.correlated_subquery_cols_map + .insert(new_plan.clone(), correlated_subquery_cols); + Ok(new_plan) + } + LogicalPlan::Projection(projection) + if self.in_predicate_opt.is_some() || !self.join_filters.is_empty() => + { + let mut local_correlated_cols = BTreeSet::new(); + collect_local_correlated_cols( + &plan, + &self.correlated_subquery_cols_map, + &mut local_correlated_cols, + ); + // add missing columns to Projection + let missing_exprs = + self.collect_missing_exprs(&projection.expr, &local_correlated_cols)?; + let new_plan = LogicalPlanBuilder::from((*projection.input).clone()) + .project(missing_exprs)? + .build()?; + Ok(new_plan) + } + LogicalPlan::Aggregate(aggregate) + if self.in_predicate_opt.is_some() || !self.join_filters.is_empty() => + { + let mut local_correlated_cols = BTreeSet::new(); + collect_local_correlated_cols( + &plan, + &self.correlated_subquery_cols_map, + &mut local_correlated_cols, + ); + // add missing columns to Aggregation's group expression + let missing_exprs = self.collect_missing_exprs( + &aggregate.group_expr, + &local_correlated_cols, + )?; + let new_plan = LogicalPlanBuilder::from((*aggregate.input).clone()) + .aggregate(missing_exprs, aggregate.aggr_expr.to_vec())? 
+ .build()?; + Ok(new_plan) + } + LogicalPlan::SubqueryAlias(alias) => { + let mut local_correlated_cols = BTreeSet::new(); + collect_local_correlated_cols( + &plan, + &self.correlated_subquery_cols_map, + &mut local_correlated_cols, + ); + let mut new_correlated_cols = BTreeSet::new(); + for col in local_correlated_cols.iter() { + new_correlated_cols + .insert(Column::new(Some(alias.alias.clone()), col.name.clone())); + } + self.correlated_subquery_cols_map + .insert(plan.clone(), new_correlated_cols); + Ok(plan) + } + LogicalPlan::Limit(limit) => { + // handling the limit clause in the subquery + match (self.exists_sub_query, self.join_filters.is_empty()) { + // un-correlated exist subquery, keep the limit + (true, true) => Ok(plan), + // Correlated exist subquery, remove the limit(so that correlated expressions can pull up) + (true, false) => { + if limit.fetch.filter(|limit_row| *limit_row == 0).is_some() { + Ok(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: limit.input.schema().clone(), + })) + } else { + LogicalPlanBuilder::from((*limit.input).clone()).build() + } + } + _ => Ok(plan), + } + } + _ => Ok(plan), + } + } +} + +impl PullUpCorrelatedExpr { + fn collect_missing_exprs( + &self, + exprs: &[Expr], + correlated_subquery_cols: &BTreeSet, + ) -> Result> { + let mut missing_exprs = vec![]; + if let Some(Expr::BinaryExpr(BinaryExpr { + left: _, + op: Operator::Eq, + right, + })) = &self.in_predicate_opt + { + if !matches!(right.deref(), Expr::Column(_)) + && !matches!(right.deref(), Expr::Literal(_)) + && !matches!(right.deref(), Expr::Alias(_, _)) + { + let alias_expr = right + .deref() + .clone() + .alias(format!("{:?}", unnormalize_col(right.deref().clone()))); + missing_exprs.push(alias_expr) + } + } + for expr in exprs { + if !missing_exprs.contains(expr) { + missing_exprs.push(expr.clone()) + } + } + for col in correlated_subquery_cols.iter() { + let col_expr = Expr::Column(col.clone()); + if 
!missing_exprs.contains(&col_expr) { + missing_exprs.push(col_expr) + } + } + Ok(missing_exprs) + } +} + +fn collect_local_correlated_cols( + plan: &LogicalPlan, + all_cols_map: &HashMap>, + local_cols: &mut BTreeSet, +) { + for child in plan.inputs() { + if let Some(cols) = all_cols_map.get(child) { + local_cols.extend(cols.clone()); + } + // SubqueryAlias is treated as the leaf node + if !matches!(child, LogicalPlan::SubqueryAlias(_)) { + collect_local_correlated_cols(child, all_cols_map, local_cols); + } + } +} + +fn remove_duplicated_filter(filters: Vec, in_predicate: &Expr) -> Vec { + filters + .into_iter() + .filter(|filter| { + if filter == in_predicate { + return false; + } + + // ignore the binary order + !match (filter, in_predicate) { + (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { + (a_expr.op == b_expr.op) + && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) + || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) + } + _ => false, + } + }) + .collect::>() +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/optimizer/tests/integration-test.rs b/datafusion/optimizer/tests/integration-test.rs index 761d6539b23c0..350e6e3a75448 100644 --- a/datafusion/optimizer/tests/integration-test.rs +++ b/datafusion/optimizer/tests/integration-test.rs @@ -66,15 +66,14 @@ fn subquery_filter_with_cast() -> Result<()> { )"; let plan = test_sql(sql)?; let expected = "Projection: test.col_int32\ - \n Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.__value\ + \n Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.AVG(test.col_int32)\ \n CrossJoin:\ \n TableScan: test projection=[col_int32]\ \n SubqueryAlias: __scalar_sq_1\ - \n Projection: AVG(test.col_int32) AS __value\ - \n Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]]\ - \n Projection: test.col_int32\ - \n Filter: test.col_utf8 >= Utf8(\"2002-05-08\") AND test.col_utf8 <= Utf8(\"2002-05-13\")\ - \n TableScan: test projection=[col_int32, col_utf8]"; + 
\n Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]]\ + \n Projection: test.col_int32\ + \n Filter: test.col_utf8 >= Utf8(\"2002-05-08\") AND test.col_utf8 <= Utf8(\"2002-05-13\")\ + \n TableScan: test projection=[col_int32, col_utf8]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } From ea27c706a5c05626d53c2032fc3a94a2d763efa5 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Fri, 26 May 2023 18:17:19 +0800 Subject: [PATCH 02/13] fix comment --- datafusion/core/tests/sql/subqueries.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs index 5b55b2e031ca4..d68433a1fc92b 100644 --- a/datafusion/core/tests/sql/subqueries.rs +++ b/datafusion/core/tests/sql/subqueries.rs @@ -770,7 +770,6 @@ async fn exists_subquery_with_select_null() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let plan = dataframe.into_optimized_plan()?; - // decorrelated, limit is removed let expected = vec![ "Filter: EXISTS () [t1_id:UInt32;N, t1_name:Utf8;N]", " Subquery: [NULL:Null;N]", From d11a0ef5fb31ccbcfe119fb382c11dab9ddfb2c8 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Fri, 26 May 2023 18:26:02 +0800 Subject: [PATCH 03/13] fix fmt --- datafusion/core/tests/tpcds_planning.rs | 1 - datafusion/optimizer/src/decorrelate_predicate_subquery.rs | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs index b6eca18fe5ecb..3b6ec0ad3214f 100644 --- a/datafusion/core/tests/tpcds_planning.rs +++ b/datafusion/core/tests/tpcds_planning.rs @@ -801,7 +801,6 @@ async fn tpcds_physical_q53() -> Result<()> { create_physical_plan(53).await } -//#[ignore] // Physical plan does not support logical expression () #[tokio::test] async fn tpcds_physical_q54() -> Result<()> { create_physical_plan(54).await diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs 
b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index bfa5bbba32f8b..8514a95014266 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -255,8 +255,7 @@ fn build_join( })), ) => { let right_expr_name = format!("{:?}", unnormalize_col(right.deref().clone())); - let right_col = - Column::new(Some(subquery_alias), right_expr_name); + let right_col = Column::new(Some(subquery_alias), right_expr_name); let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); Some(in_predicate.and(join_filter)) } @@ -270,8 +269,7 @@ fn build_join( })), ) => { let right_expr_name = format!("{:?}", unnormalize_col(right.deref().clone())); - let right_col = - Column::new(Some(subquery_alias), right_expr_name); + let right_col = Column::new(Some(subquery_alias), right_expr_name); let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); Some(in_predicate) } From 81286590803844d724beda6e067d8d23d4b38217 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Fri, 26 May 2023 23:01:40 +0800 Subject: [PATCH 04/13] q64 still overflow stack --- datafusion/core/tests/tpcds_planning.rs | 1 + datafusion/optimizer/src/utils.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs index 3b6ec0ad3214f..3f55049ecd3cc 100644 --- a/datafusion/core/tests/tpcds_planning.rs +++ b/datafusion/core/tests/tpcds_planning.rs @@ -851,6 +851,7 @@ async fn tpcds_physical_q63() -> Result<()> { create_physical_plan(63).await } +#[ignore] // thread 'q64' has overflowed its stack #[tokio::test] async fn tpcds_physical_q64() -> Result<()> { create_physical_plan(64).await diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs index fd8dda79c51d1..2d072b1e12429 100644 --- a/datafusion/optimizer/src/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -390,7 +390,7 @@ 
pub fn log_plan(description: &str, plan: &LogicalPlan) { } /// This struct rewrite the sub query plan by pull up the correlated expressions(contains outer reference columns) from the inner subquery's [Filter]. -/// It adds the inner reference columns to the [Projection] or [Aggregate] of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. +/// It adds the inner reference columns to the 'Projection' or 'Aggregate' of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. pub struct PullUpCorrelatedExpr { pub join_filters: Vec, // map of the plan and its holding correlated columns From 27c449b0a4f3c3fdf7bb0984756a7f69a2f0959c Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Tue, 30 May 2023 15:50:56 +0800 Subject: [PATCH 05/13] fix count agg bug --- benchmarks/expected-plans/q11.txt | 175 ++++----- benchmarks/expected-plans/q22.txt | 106 +++-- datafusion/core/tests/sql/subqueries.rs | 367 +++++++++++++++++- .../src/decorrelate_predicate_subquery.rs | 3 + .../optimizer/src/scalar_subquery_to_join.rs | 146 +++++-- datafusion/optimizer/src/utils.rs | 126 +++++- .../optimizer/tests/integration-test.rs | 15 +- .../physical-expr/src/aggregate/average.rs | 17 +- 8 files changed, 754 insertions(+), 201 deletions(-) diff --git a/benchmarks/expected-plans/q11.txt b/benchmarks/expected-plans/q11.txt index fae9e0ea7f133..c5b1d6a0d3925 100644 --- a/benchmarks/expected-plans/q11.txt +++ b/benchmarks/expected-plans/q11.txt @@ -1,89 +1,86 @@ -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | 
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: value DESC NULLS FIRST | -| | Projection: partsupp.ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS value | -| | Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001) | -| | CrossJoin: | -| | Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | -| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost | -| | Inner Join: supplier.s_nationkey = nation.n_nationkey | -| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | -| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | -| | TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] | -| | TableScan: supplier projection=[s_suppkey, s_nationkey] | -| | Projection: nation.n_nationkey | -| | Filter: nation.n_name = Utf8("GERMANY") | -| | TableScan: nation projection=[n_nationkey, n_name] | -| | SubqueryAlias: __scalar_sq_1 | -| | Projection: CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) | -| | Aggregate: groupBy=[[]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | -| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost | -| | Inner Join: supplier.s_nationkey = nation.n_nationkey | -| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | -| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | -| | TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] | -| | TableScan: supplier 
projection=[s_suppkey, s_nationkey] | -| | Projection: nation.n_nationkey | -| | Filter: nation.n_name = Utf8("GERMANY") | -| | TableScan: nation projection=[n_nationkey, n_name] | -| physical_plan | SortExec: expr=[value@1 DESC] | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 AS Decimal128(38, 15)) > SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@2 | -| | CrossJoinExec | -| | CoalescePartitionsExec | -| | AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=2 | -| | AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@5 as s_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 1 }, Column { name: "s_suppkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 
| -| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: n_name@1 = GERMANY | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | ProjectionExec: expr=[CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] | -| | AggregateExec: mode=Final, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | CoalescePartitionsExec | -| | AggregateExec: mode=Partial, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | -| | ProjectionExec: expr=[ps_availqty@0 as ps_availqty, ps_supplycost@1 as ps_supplycost] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 2 }, Column { name: "n_nationkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | -| | ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@4 as s_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, 
join_type=Inner, on=[(Column { name: "ps_suppkey", index: 0 }, Column { name: "s_suppkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: n_name@1 = GERMANY | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: value DESC NULLS FIRST | +| | Projection: partsupp.ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS value | +| | Inner Join: Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001) | +| | Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | +| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost | +| | Inner Join: supplier.s_nationkey = nation.n_nationkey | +| | Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | +| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | +| | TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] | +| | TableScan: supplier projection=[s_suppkey, s_nationkey] | +| | Projection: nation.n_nationkey | +| | Filter: nation.n_name = Utf8("GERMANY") | +| | TableScan: nation projection=[n_nationkey, n_name] | +| | SubqueryAlias: __scalar_sq_1 | +| | Projection: CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) | +| | Aggregate: groupBy=[[]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] | +| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost | +| | Inner 
Join: supplier.s_nationkey = nation.n_nationkey | +| | Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey | +| | Inner Join: partsupp.ps_suppkey = supplier.s_suppkey | +| | TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] | +| | TableScan: supplier projection=[s_suppkey, s_nationkey] | +| | Projection: nation.n_nationkey | +| | Filter: nation.n_name = Utf8("GERMANY") | +| | TableScan: nation projection=[n_nationkey, n_name] | +| physical_plan | SortPreservingMergeExec: [value@1 DESC] | +| | SortExec: expr=[value@1 DESC] | +| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] | +| | NestedLoopJoinExec: join_type=Inner, filter=BinaryExpr { left: CastExpr { expr: Column { name: "SUM(partsupp.ps_supplycost * partsupp.ps_availqty)", index: 0 }, cast_type: Decimal128(38, 15), cast_options: CastOptions { safe: false, format_options: FormatOptions { safe: true, null: "", date_format: None, datetime_format: None, timestamp_format: None, timestamp_tz_format: None, time_format: None } } }, op: Gt, right: Column { name: "SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)", index: 1 } } | +| | AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2), input_partitions=2 | +| | AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] | +| | CoalesceBatchesExec: 
target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@5 as s_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 1 }, Column { name: "s_suppkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: n_name@1 = GERMANY | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | ProjectionExec: expr=[CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] | +| | AggregateExec: mode=Final, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=Partial, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] | +| | ProjectionExec: expr=[ps_availqty@0 as ps_availqty, ps_supplycost@1 as ps_supplycost] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| 
| HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 2 }, Column { name: "n_nationkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=2 | +| | ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@4 as s_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 0 }, Column { name: "s_suppkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[n_nationkey@0 as n_nationkey] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: n_name@1 = GERMANY | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | 
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/benchmarks/expected-plans/q22.txt b/benchmarks/expected-plans/q22.txt index 16aebfe90abf3..479727b8fd555 100644 --- a/benchmarks/expected-plans/q22.txt +++ b/benchmarks/expected-plans/q22.txt @@ -1,55 +1,51 @@ -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: custsale.cntrycode ASC NULLS LAST | -| | Projection: custsale.cntrycode, COUNT(UInt8(1)) AS numcust, SUM(custsale.c_acctbal) AS totacctbal | -| | Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[COUNT(UInt8(1)), SUM(custsale.c_acctbal)]] | -| | SubqueryAlias: custsale | -| | Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal | -| | Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > 
__scalar_sq_1.AVG(customer.c_acctbal) | -| | CrossJoin: | -| | Projection: customer.c_phone, customer.c_acctbal | -| | LeftAnti Join: customer.c_custkey = __correlated_sq_1.o_custkey | -| | Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | -| | TableScan: customer projection=[c_custkey, c_phone, c_acctbal] | -| | SubqueryAlias: __correlated_sq_1 | -| | TableScan: orders projection=[o_custkey] | -| | SubqueryAlias: __scalar_sq_1 | -| | Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] | -| | Projection: customer.c_acctbal | -| | Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | -| | TableScan: customer projection=[c_phone, c_acctbal] | -| physical_plan | SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] | -| | SortExec: expr=[cntrycode@0 ASC NULLS LAST] | -| | ProjectionExec: expr=[cntrycode@0 as cntrycode, COUNT(UInt8(1))@1 as numcust, SUM(custsale.c_acctbal)@2 as totacctbal] | -| | AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2), input_partitions=1 | -| | AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | -| | ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: CAST(c_acctbal@1 AS Decimal128(19, 6)) > AVG(customer.c_acctbal)@2 | -| | CrossJoinExec | -| | CoalescePartitionsExec | -| | ProjectionExec: expr=[c_phone@1 as c_phone, c_acctbal@2 as c_acctbal] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | HashJoinExec: mode=Partitioned, 
join_type=LeftAnti, on=[(Column { name: "c_custkey", index: 0 }, Column { name: "o_custkey", index: 0 })] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2), input_partitions=2 | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: Use substr(c_phone@1, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2), input_partitions=0 | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | AggregateExec: mode=Final, gby=[], aggr=[AVG(customer.c_acctbal)] | -| | CoalescePartitionsExec | -| | AggregateExec: mode=Partial, gby=[], aggr=[AVG(customer.c_acctbal)] | -| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | -| | ProjectionExec: expr=[c_acctbal@1 as c_acctbal] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | FilterExec: c_acctbal@1 > Some(0),15,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | -| | MemoryExec: partitions=0, partition_sizes=[] | -| | | 
-+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: custsale.cntrycode ASC NULLS LAST | +| | Projection: custsale.cntrycode, COUNT(UInt8(1)) AS numcust, SUM(custsale.c_acctbal) AS totacctbal | +| | Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[COUNT(UInt8(1)), SUM(custsale.c_acctbal)]] | +| | SubqueryAlias: custsale | +| | Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal | +| | Inner Join: Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __scalar_sq_1.AVG(customer.c_acctbal) | +| | Projection: customer.c_phone, customer.c_acctbal | +| | LeftAnti Join: customer.c_custkey = __correlated_sq_1.o_custkey | +| | Filter: 
substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | +| | TableScan: customer projection=[c_custkey, c_phone, c_acctbal] | +| | SubqueryAlias: __correlated_sq_1 | +| | TableScan: orders projection=[o_custkey] | +| | SubqueryAlias: __scalar_sq_1 | +| | Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] | +| | Projection: customer.c_acctbal | +| | Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) | +| | TableScan: customer projection=[c_phone, c_acctbal] | +| physical_plan | SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] | +| | SortExec: expr=[cntrycode@0 ASC NULLS LAST] | +| | ProjectionExec: expr=[cntrycode@0 as cntrycode, COUNT(UInt8(1))@1 as numcust, SUM(custsale.c_acctbal)@2 as totacctbal] | +| | AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2), input_partitions=2 | +| | AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] | +| | ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] | +| | NestedLoopJoinExec: join_type=Inner, filter=BinaryExpr { left: CastExpr { expr: Column { name: "c_acctbal", index: 0 }, cast_type: Decimal128(19, 6), cast_options: CastOptions { safe: false, format_options: FormatOptions { safe: true, null: "", date_format: None, datetime_format: None, timestamp_format: None, timestamp_tz_format: None, time_format: None } } }, op: Gt, right: Column { name: "AVG(customer.c_acctbal)", index: 1 } } | +| | ProjectionExec: expr=[c_phone@1 as c_phone, c_acctbal@2 as c_acctbal] | +| | CoalesceBatchesExec: 
target_batch_size=8192 | +| | HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(Column { name: "c_custkey", index: 0 }, Column { name: "o_custkey", index: 0 })] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2), input_partitions=2 | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: Use substr(c_phone@1, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2), input_partitions=0 | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | AggregateExec: mode=Final, gby=[], aggr=[AVG(customer.c_acctbal)] | +| | CoalescePartitionsExec | +| | AggregateExec: mode=Partial, gby=[], aggr=[AVG(customer.c_acctbal)] | +| | RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=0 | +| | ProjectionExec: expr=[c_acctbal@1 as c_acctbal] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: c_acctbal@1 > Some(0),15,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) | +| | MemoryExec: partitions=0, partition_sizes=[] | +| | | 
++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs index d68433a1fc92b..f3da709f88b86 100644 --- a/datafusion/core/tests/sql/subqueries.rs +++ b/datafusion/core/tests/sql/subqueries.rs @@ -821,7 +821,7 @@ async fn exists_subquery_with_limit() -> Result<()> { "| 44 | d |", "+-------+---------+", ]; - assert_batches_eq!(expected, &results); + assert_batches_sorted_eq!(expected, &results); Ok(()) } @@ -890,7 +890,7 @@ async fn not_exists_subquery_with_limit0() -> Result<()> { "| 44 | d |", "+-------+---------+", ]; - assert_batches_eq!(expected, &results); + assert_batches_sorted_eq!(expected, &results); Ok(()) } @@ -952,6 +952,46 @@ async fn in_non_correlated_subquery_with_limit() -> Result<()> { Ok(()) } +#[tokio::test] +async fn uncorrelated_scalar_subquery_with_limit0() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT t2_id FROM t2 limit 0) FROM t1"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // not de-correlated + let expected = vec![ + "Projection: t1.t1_id, __scalar_sq_1.t2_id AS t2_id [t1_id:UInt32;N, t2_id:UInt32;N]", + " Left Join: [t1_id:UInt32;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", + " EmptyRelation [t2_id:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = 
formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+-------+", + "| t1_id | t2_id |", + "+-------+-------+", + "| 11 | |", + "| 22 | |", + "| 33 | |", + "| 44 | |", + "+-------+-------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + #[tokio::test] async fn support_union_subquery() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", true)?; @@ -1021,15 +1061,15 @@ async fn simple_uncorrelated_scalar_subquery() -> Result<()> { async fn simple_uncorrelated_scalar_subquery2() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", true)?; - let sql = "select (select count(*) from t1) as b, (select count(1) from t2) as c"; + let sql = "select (select count(*) from t1) as b, (select count(1) from t2)"; let msg = format!("Creating logical plan for '{sql}'"); let dataframe = ctx.sql(sql).await.expect(&msg); let plan = dataframe.into_optimized_plan()?; let expected = vec![ - "Projection: __scalar_sq_1.COUNT(UInt8(1)) AS b, __scalar_sq_2.COUNT(Int64(1)) AS c [b:Int64;N, c:Int64;N]", - " CrossJoin: [COUNT(UInt8(1)):Int64;N, COUNT(Int64(1)):Int64;N]", + "Projection: __scalar_sq_1.COUNT(UInt8(1)) AS b, __scalar_sq_2.COUNT(Int64(1)) AS COUNT(Int64(1)) [b:Int64;N, COUNT(Int64(1)):Int64;N]", + " Left Join: [COUNT(UInt8(1)):Int64;N, COUNT(Int64(1)):Int64;N]", " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N]", " Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]] [COUNT(UInt8(1)):Int64;N]", " TableScan: t1 projection=[t1_id] [t1_id:UInt32;N]", @@ -1047,13 +1087,318 @@ async fn simple_uncorrelated_scalar_subquery2() -> Result<()> { // assert data let results = execute_to_batches(&ctx, sql).await; let expected = vec![ - "+---+---+", - "| b | c |", - "+---+---+", - "| 4 | 4 |", - "+---+---+", + "+---+-----------------+", + "| b | COUNT(Int64(1)) 
|", + "+---+-----------------+", + "| 4 | 4 |", + "+---+-----------------+", ]; - assert_batches_eq!(expected, &results); + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = + "SELECT t1_id, (SELECT count(*) FROM t2 WHERE t2.t2_int = t1.t1_int) from t1"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.COUNT(UInt8(1)) END AS COUNT(UInt8(1)) [t1_id:UInt32;N, COUNT(UInt8(1)):Int64;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_id:UInt32;N, t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)), t2.t2_int, __always_true [COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+-----------------+", + "| t1_id | COUNT(UInt8(1)) |", + "+-------+-----------------+", + "| 33 | 3 |", + "| 22 | 0 |", + "| 11 | 1 |", + "| 44 | 0 |", + "+-------+-----------------+", + ]; + 
assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg2() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT count(*) FROM t2 WHERE t2.t2_int = t1.t1_int) as cnt from t1"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.COUNT(UInt8(1)) END AS cnt [t1_id:UInt32;N, cnt:Int64;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_id:UInt32;N, t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)), t2.t2_int, __always_true [COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+-----+", + "| t1_id | cnt |", + "+-------+-----+", + "| 33 | 3 |", + "| 22 | 0 |", + "| 11 | 1 |", + "| 44 | 0 |", + "+-------+-----+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_with_alias() -> Result<()> { + let ctx = 
create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT count(*) as _cnt FROM t2 WHERE t2.t2_int = t1.t1_int) as cnt from t1"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) AS _cnt ELSE __scalar_sq_1._cnt END AS cnt [t1_id:UInt32;N, cnt:Int64;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_id:UInt32;N, t1_int:UInt32;N, _cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [_cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)) AS _cnt, t2.t2_int, __always_true [_cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+-----+", + "| t1_id | cnt |", + "+-------+-----+", + "| 33 | 3 |", + "| 22 | 0 |", + "| 11 | 1 |", + "| 44 | 0 |", + "+-------+-----+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_complex_expr() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT count(*) + 2 as _cnt FROM t2 WHERE t2.t2_int = t1.t1_int) from t1"; + + let msg = format!("Creating logical plan for 
'{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) AS _cnt ELSE __scalar_sq_1._cnt END AS _cnt [t1_id:UInt32;N, _cnt:Int64;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_id:UInt32;N, t1_int:UInt32;N, _cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [_cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)) + Int64(2) AS _cnt, t2.t2_int, __always_true [_cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+------+", + "| t1_id | _cnt |", + "+-------+------+", + "| 11 | 3 |", + "| 22 | 2 |", + "| 33 | 5 |", + "| 44 | 2 |", + "+-------+------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_where_clause() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "select t1.t1_int from t1 where (select count(*) from t2 where t1.t1_id = t2.t2_id) < t1.t1_int"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_int [t1_int:UInt32;N]", 
+ " Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.COUNT(UInt8(1)) END < CAST(t1.t1_int AS Int64) [t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " Projection: t1.t1_int, __scalar_sq_1.COUNT(UInt8(1)), __scalar_sq_1.__always_true [t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " Left Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, t2_id:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N, t2_id:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)), t2.t2_id, __always_true [COUNT(UInt8(1)):Int64;N, t2_id:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_id, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_id:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+--------+", + "| t1_int |", + "+--------+", + "| 2 |", + "| 4 |", + "| 3 |", + "+--------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +#[ignore] +async fn correlated_scalar_subquery_sum_agg_bug() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "select t1.t1_int from t1 where (select sum(t2_int) is null from t2 where t1.t1_id = t2.t2_id)"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_int 
[t1_int:UInt32;N]", + " Inner Join: t1.t1_id = __scalar_sq_1.t2_id [t1_id:UInt32;N, t1_int:UInt32;N, t2_id:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [t2_id:UInt32;N]", + " Projection: t2.t2_id [t2_id:UInt32;N]", + " Filter: SUM(t2.t2_int) IS NULL [t2_id:UInt32;N, SUM(t2.t2_int):UInt64;N]", + " Aggregate: groupBy=[[t2.t2_id]], aggr=[[SUM(t2.t2_int)]] [t2_id:UInt32;N, SUM(t2.t2_int):UInt64;N]", + " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+--------+", + "| t1_int |", + "+--------+", + "| 2 |", + "| 4 |", + "| 3 |", + "+--------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_in_having() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "select t1.t1_int from t1 group by t1.t1_int having (select count(*) from t2 where t1.t1_int = t2.t2_int) = 0"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_int [t1_int:UInt32;N]", + " Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.COUNT(UInt8(1)) END = Int64(0) [t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " Projection: t1.t1_int, __scalar_sq_1.COUNT(UInt8(1)), __scalar_sq_1.__always_true [t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_int:UInt32;N, COUNT(UInt8(1)):Int64;N, 
t2_int:UInt32;N, __always_true:Boolean;N]", + " Aggregate: groupBy=[[t1.t1_int]], aggr=[[]] [t1_int:UInt32;N]", + " TableScan: t1 projection=[t1_int] [t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)), t2.t2_int, __always_true [COUNT(UInt8(1)):Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+--------+", + "| t1_int |", + "+--------+", + "| 2 |", + "| 4 |", + "+--------+", + ]; + assert_batches_sorted_eq!(expected, &results); Ok(()) } diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 8514a95014266..449bec48b051f 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -223,6 +223,9 @@ fn build_join( in_predicate_opt: in_predicate_opt.clone(), exists_sub_query: in_predicate_opt.is_none(), can_pull_up: true, + need_collect_count_expr_map: false, + collected_count_expr_map: Default::default(), + expr_check_map: Default::default(), }; let new_plan = subquery.clone().rewrite(&mut pull_up)?; if !pull_up.can_pull_up { diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 4d8002c1a9129..46ad5f2a80fac 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ 
b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -17,13 +17,17 @@ use crate::alias::AliasGenerator; use crate::optimizer::ApplyOrder; -use crate::utils::{conjunction, replace_qualified_name, PullUpCorrelatedExpr}; +use crate::utils::{ + conjunction, replace_qualified_name, ExprCheckMap, PullUpCorrelatedExpr, +}; use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}; +use datafusion_common::tree_node::{ + RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, +}; use datafusion_common::{Column, DataFusionError, Result}; use datafusion_expr::logical_plan::{JoinType, Subquery}; -use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; -use std::collections::BTreeSet; +use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; +use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; /// Optimizer rule for rewriting subquery filters to joins @@ -66,7 +70,7 @@ impl OptimizerRule for ScalarSubqueryToJoin { ) -> Result> { match plan { LogicalPlan::Filter(filter) => { - let (subqueries, expr) = + let (subqueries, mut rewrite_expr) = self.extract_subquery_exprs(&filter.predicate, self.alias.clone())?; if subqueries.is_empty() { @@ -77,27 +81,62 @@ impl OptimizerRule for ScalarSubqueryToJoin { // iterate through all subqueries in predicate, turning each into a left join let mut cur_input = filter.input.as_ref().clone(); for (subquery, alias) in subqueries { - if let Some(optimized_subquery) = - build_join(&subquery, &cur_input, &alias)? + if let Some((optimized_subquery, expr_check_map)) = + build_join(&subquery, &cur_input, &alias, true)? 
{ + if !expr_check_map.is_empty() { + rewrite_expr = + rewrite_expr.clone().transform_up(&|expr| { + if let Expr::Column(col) = &expr { + if let Some((expr1, expr2)) = + expr_check_map.get(&col.name) + { + let new_expr = Expr::Case(expr::Case { + expr: None, + when_then_expr: vec![( + Box::new(Expr::IsNull(Box::new( + Expr::Column( + Column::new_unqualified( + "__always_true", + ), + ), + ))), + Box::new(expr2.clone()), + )], + else_expr: Some(Box::new(expr1.clone())), + }); + Ok(Transformed::Yes(new_expr)) + } else { + Ok(Transformed::No(expr)) + } + } else { + Ok(Transformed::No(expr)) + } + })?; + } cur_input = optimized_subquery; } else { // if we can't handle all of the subqueries then bail for now return Ok(None); } } - let new_plan = - LogicalPlanBuilder::from(cur_input).filter(expr)?.build()?; + let new_plan = LogicalPlanBuilder::from(cur_input) + .filter(rewrite_expr)? + .build()?; Ok(Some(new_plan)) } LogicalPlan::Projection(projection) => { let mut all_subqueryies = vec![]; - let mut rewrite_exprs = vec![]; + let mut expr_to_rewrite_expr_map = HashMap::new(); + let mut subquery_to_expr_map = HashMap::new(); for expr in projection.expr.iter() { - let (subqueries, expr) = + let (subqueries, rewrite_exprs) = self.extract_subquery_exprs(expr, self.alias.clone())?; + for (subquery, _) in &subqueries { + subquery_to_expr_map.insert(subquery.clone(), expr.clone()); + } all_subqueryies.extend(subqueries); - rewrite_exprs.push(expr); + expr_to_rewrite_expr_map.insert(expr, rewrite_exprs); } if all_subqueryies.is_empty() { // regular projection, no subquery exists clause here @@ -106,17 +145,62 @@ impl OptimizerRule for ScalarSubqueryToJoin { // iterate through all subqueries in predicate, turning each into a left join let mut cur_input = projection.input.as_ref().clone(); for (subquery, alias) in all_subqueryies { - if let Some(optimized_subquery) = - build_join(&subquery, &cur_input, &alias)? 
+ if let Some((optimized_subquery, expr_check_map)) = + build_join(&subquery, &cur_input, &alias, true)? { cur_input = optimized_subquery; + if !expr_check_map.is_empty() { + if let Some(expr) = subquery_to_expr_map.get(&subquery) { + if let Some(rewrite_expr) = + expr_to_rewrite_expr_map.get(expr) + { + let new_expr = rewrite_expr.clone().transform_up(&|expr| { + if let Expr::Column(col) = &expr { + if let Some((expr1, expr2)) = expr_check_map.get(&col.name) + { + let new_expr = Expr::Case(expr::Case { + expr: None, + when_then_expr: vec![( + Box::new(Expr::IsNull(Box::new( + Expr::Column(Column::new_unqualified("__always_true")), + ))), + Box::new(expr2.clone()), + )], + else_expr: Some(Box::new(expr1.clone())), + }); + Ok(Transformed::Yes(new_expr)) + } else { + Ok(Transformed::No(expr)) + } + } else { + Ok(Transformed::No(expr)) + } + + })?; + expr_to_rewrite_expr_map.insert(expr, new_expr); + } + } + } } else { // if we can't handle all of the subqueries then bail for now return Ok(None); } } + + let mut proj_exprs = vec![]; + for expr in projection.expr.iter() { + let old_expr_name = expr.display_name()?; + let new_expr = expr_to_rewrite_expr_map.get(expr).unwrap(); + let new_expr_name = new_expr.display_name()?; + if new_expr_name != old_expr_name { + proj_exprs + .push(Expr::Alias(Box::new(new_expr.clone()), old_expr_name)) + } else { + proj_exprs.push(new_expr.clone()); + } + } let new_plan = LogicalPlanBuilder::from(cur_input) - .project(rewrite_exprs)? + .project(proj_exprs)? 
.build()?; Ok(Some(new_plan)) } @@ -205,7 +289,7 @@ impl TreeNodeRewriter for ExtractScalarSubQuery { /// /// ```text /// select c.id from customers c -/// cross join (select avg(total) as val from orders) a +/// left join (select avg(total) as val from orders) a /// where c.balance > a.val /// ``` /// @@ -219,7 +303,8 @@ fn build_join( subquery: &Subquery, filter_input: &LogicalPlan, subquery_alias: &str, -) -> Result> { + need_collect_count_expr_map: bool, +) -> Result> { let subquery_plan = subquery.subquery.as_ref(); let mut pull_up = PullUpCorrelatedExpr { join_filters: vec![], @@ -227,6 +312,9 @@ fn build_join( in_predicate_opt: None, exists_sub_query: false, can_pull_up: true, + need_collect_count_expr_map, + collected_count_expr_map: Default::default(), + expr_check_map: Default::default(), }; let new_plan = subquery_plan.clone().rewrite(&mut pull_up)?; if !pull_up.can_pull_up { @@ -236,6 +324,7 @@ fn build_join( let sub_query_alias = LogicalPlanBuilder::from(new_plan) .alias(subquery_alias.to_string())? .build()?; + let mut all_correlated_cols = BTreeSet::new(); pull_up .correlated_subquery_cols_map @@ -257,9 +346,14 @@ fn build_join( schema: _, }) => sub_query_alias, _ => { - // if not correlated, group down to 1 row and cross join on that (preserving row count) + // if not correlated, group down to 1 row and left join on that (preserving row count) LogicalPlanBuilder::from(filter_input.clone()) - .cross_join(sub_query_alias)? + .join( + sub_query_alias, + JoinType::Left, + (Vec::::new(), Vec::::new()), + None, + )? .build()? } } @@ -274,7 +368,7 @@ fn build_join( )? .build()? 
}; - Ok(Some(new_plan)) + Ok(Some((new_plan, pull_up.expr_check_map.clone()))) } #[cfg(test)] @@ -455,7 +549,7 @@ mod tests { // it will optimize, but fail for the same reason the unoptimized query would let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n Left Join: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ @@ -487,7 +581,7 @@ mod tests { let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n Left Join: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ @@ -837,7 +931,7 @@ mod tests { let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n Filter: customer.c_custkey < __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n Left Join: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ @@ -868,7 +962,7 @@ mod tests { let expected = "Projection: customer.c_custkey 
[c_custkey:Int64]\ \n Filter: customer.c_custkey = __scalar_sq_1.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ + \n Left Join: [c_custkey:Int64, c_name:Utf8, MAX(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ \n SubqueryAlias: __scalar_sq_1 [MAX(orders.o_custkey):Int64;N]\ \n Projection: MAX(orders.o_custkey) [MAX(orders.o_custkey):Int64;N]\ @@ -969,8 +1063,8 @@ mod tests { let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ \n Filter: customer.c_custkey BETWEEN __scalar_sq_1.MIN(orders.o_custkey) AND __scalar_sq_2.MAX(orders.o_custkey) [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, MAX(orders.o_custkey):Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, MAX(orders.o_custkey):Int64;N]\ - \n CrossJoin: [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N]\ + \n Left Join: [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N, MAX(orders.o_custkey):Int64;N]\ + \n Left Join: [c_custkey:Int64, c_name:Utf8, MIN(orders.o_custkey):Int64;N]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ \n SubqueryAlias: __scalar_sq_1 [MIN(orders.o_custkey):Int64;N]\ \n Projection: MIN(orders.o_custkey) [MIN(orders.o_custkey):Int64;N]\ diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs index 2d072b1e12429..177e9bf4ab545 100644 --- a/datafusion/optimizer/src/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -17,9 +17,12 @@ //! 
Collection of utility functions that are leveraged by the query optimizer rules +use crate::simplify_expressions::{ExprSimplifier, SimplifyContext}; use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}; -use datafusion_common::{plan_err, Column, DFSchemaRef}; +use datafusion_common::tree_node::{ + RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, +}; +use datafusion_common::{plan_err, Column, DFSchemaRef, ScalarValue}; use datafusion_common::{DFSchema, Result}; use datafusion_expr::expr::{BinaryExpr, Sort}; use datafusion_expr::expr_rewriter::{ @@ -30,8 +33,9 @@ use datafusion_expr::utils::from_plan; use datafusion_expr::{ and, logical_plan::{Filter, LogicalPlan}, - EmptyRelation, Expr, Operator, + AggregateFunction, EmptyRelation, Expr, Operator, }; +use datafusion_physical_expr::execution_props::ExecutionProps; use log::{debug, trace}; use std::collections::{BTreeSet, HashMap}; use std::ops::Deref; @@ -393,15 +397,22 @@ pub fn log_plan(description: &str, plan: &LogicalPlan) { /// It adds the inner reference columns to the 'Projection' or 'Aggregate' of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. 
pub struct PullUpCorrelatedExpr { pub join_filters: Vec, - // map of the plan and its holding correlated columns + // mapping from the plan to its holding correlated columns pub correlated_subquery_cols_map: HashMap>, pub in_predicate_opt: Option, // indicate whether it is Exists(Not Exists) SubQuery pub exists_sub_query: bool, // indicate whether the correlated expressions can pull up or not pub can_pull_up: bool, + // indicate whether the subquery need to collect count expr mapping + pub need_collect_count_expr_map: bool, + // mapping from expr name to the pair of agg expr and its evaluation result on empty record batch + pub collected_count_expr_map: HashMap, + pub expr_check_map: ExprCheckMap, } +pub type ExprCheckMap = HashMap; + impl TreeNodeRewriter for PullUpCorrelatedExpr { type N = LogicalPlan; @@ -430,7 +441,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { } } _ if plan.expressions().iter().any(|expr| expr.contains_outer()) => { - // the unsupported cases, the plan expressions contain out reference columns(like window expressions or agg expressions) + // the unsupported cases, the plan expressions contain out reference columns(like window expressions) self.can_pull_up = false; Ok(RewriteRecursion::Stop) } @@ -476,8 +487,48 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { &mut local_correlated_cols, ); // add missing columns to Projection - let missing_exprs = + let mut missing_exprs = self.collect_missing_exprs(&projection.expr, &local_correlated_cols)?; + if !self.collected_count_expr_map.is_empty() { + let head_expr = missing_exprs.get(0); + if let Some(expr) = head_expr { + let result_expr = expr.clone().transform_up(&|expr| { + if let Expr::Column(Column { name, .. 
}) = &expr { + if let Some((_, result_expr)) = + self.collected_count_expr_map.get(name) + { + Ok(Transformed::Yes(result_expr.clone())) + } else { + Ok(Transformed::No(expr)) + } + } else { + Ok(Transformed::No(expr)) + } + })?; + let scalar_expr = match expr { + Expr::Alias(_, alias) => ( + alias.to_string(), + Expr::Column(Column::new_unqualified(alias)), + ), + Expr::Column(Column { relation: _, name }) => { + (name.to_string(), expr.clone()) + } + _ => { + let scalar_column = expr.display_name()?; + ( + scalar_column.clone(), + Expr::Column(Column::new_unqualified(scalar_column)), + ) + } + }; + self.expr_check_map + .insert(scalar_expr.0, (scalar_expr.1, result_expr)); + missing_exprs.push(Expr::Column(Column::new_unqualified( + "__always_true".to_string(), + ))); + } + } + let new_plan = LogicalPlanBuilder::from((*projection.input).clone()) .project(missing_exprs)? .build()?; @@ -493,10 +544,37 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { &mut local_correlated_cols, ); // add missing columns to Aggregation's group expression - let missing_exprs = self.collect_missing_exprs( + let mut missing_exprs = self.collect_missing_exprs( &aggregate.group_expr, &local_correlated_cols, )?; + + if self.need_collect_count_expr_map && aggregate.group_expr.is_empty() { + let agg_result_exprs = agg_exprs_eva_result_on_empty_batch( + &aggregate.aggr_expr, + subquery_schema, + )?; + if !missing_exprs.is_empty() { + let scalar_agg = !agg_result_exprs.values().any(|result_expr| { + matches!(result_expr, Expr::Literal(ScalarValue::Null)) + }); + if scalar_agg { + let internal_always_true_col = Expr::Alias( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(true)))), + "__always_true".to_string(), + ); + missing_exprs.push(internal_always_true_col); + for (agg_expr, result_expr_on_empty) in agg_result_exprs { + let agg_expr_name = agg_expr.display_name()?; + self.collected_count_expr_map.insert( + agg_expr_name, + (agg_expr, result_expr_on_empty), + ); + } + } + } + } + let 
new_plan = LogicalPlanBuilder::from((*aggregate.input).clone()) .aggregate(missing_exprs, aggregate.aggr_expr.to_vec())? .build()?; @@ -618,6 +696,40 @@ fn remove_duplicated_filter(filters: Vec, in_predicate: &Expr) -> Vec>() } +fn agg_exprs_eva_result_on_empty_batch( + agg_expr: &[Expr], + schema: DFSchemaRef, +) -> Result> { + let mut result_expr_map = HashMap::new(); + for e in agg_expr.iter() { + let new_expr = e.clone().transform_up(&|expr| { + let new_expr = match expr { + Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction { + fun, + .. + }) => { + if matches!(fun, AggregateFunction::Count) { + Transformed::Yes(Expr::Literal(ScalarValue::Int64(Some(0)))) + } else { + Transformed::Yes(Expr::Literal(ScalarValue::Null)) + } + } + Expr::AggregateUDF(_) => { + Transformed::Yes(Expr::Literal(ScalarValue::Null)) + } + _ => Transformed::No(expr), + }; + Ok(new_expr) + })?; + + let props = ExecutionProps::new(); + let info = SimplifyContext::new(&props).with_schema(schema.clone()); + let simplifier = ExprSimplifier::new(info); + result_expr_map.insert(e.clone(), simplifier.simplify(new_expr)?); + } + Ok(result_expr_map) +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/optimizer/tests/integration-test.rs b/datafusion/optimizer/tests/integration-test.rs index 350e6e3a75448..c101eff9de9e6 100644 --- a/datafusion/optimizer/tests/integration-test.rs +++ b/datafusion/optimizer/tests/integration-test.rs @@ -66,14 +66,13 @@ fn subquery_filter_with_cast() -> Result<()> { )"; let plan = test_sql(sql)?; let expected = "Projection: test.col_int32\ - \n Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.AVG(test.col_int32)\ - \n CrossJoin:\ - \n TableScan: test projection=[col_int32]\ - \n SubqueryAlias: __scalar_sq_1\ - \n Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]]\ - \n Projection: test.col_int32\ - \n Filter: test.col_utf8 >= Utf8(\"2002-05-08\") AND test.col_utf8 <= Utf8(\"2002-05-13\")\ - \n TableScan: test 
projection=[col_int32, col_utf8]"; + \n Inner Join: Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.AVG(test.col_int32)\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: __scalar_sq_1\ + \n Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]]\ + \n Projection: test.col_int32\ + \n Filter: test.col_utf8 >= Utf8(\"2002-05-08\") AND test.col_utf8 <= Utf8(\"2002-05-13\")\ + \n TableScan: test projection=[col_int32, col_utf8]"; assert_eq!(expected, format!("{plan:?}")); Ok(()) } diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 2fe44602d831a..607572862290b 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -237,9 +237,16 @@ impl Accumulator for AvgAccumulator { ScalarValue::Float64(e) => { Ok(ScalarValue::Float64(e.map(|f| f / self.count as f64))) } - ScalarValue::Decimal128(value, precision, scale) => { - Ok(match value { - None => ScalarValue::Decimal128(None, precision, scale), + ScalarValue::Decimal128(value, _, scale) => { + match value { + None => match &self.return_data_type { + DataType::Decimal128(p, s) => { + Ok(ScalarValue::Decimal128(None, *p, *s)) + } + other => Err(DataFusionError::Internal(format!( + "Error returned data type in AvgAccumulator {other:?}" + ))), + }, Some(value) => { // now the sum_type and return type is not the same, need to convert the sum type to return type calculate_result_decimal_for_avg( @@ -247,9 +254,9 @@ impl Accumulator for AvgAccumulator { self.count as i128, scale, &self.return_data_type, - )? 
+ ) } - }) + } } _ => Err(DataFusionError::Internal( "Sum should be f64 or decimal128 on average".to_string(), From f64473e086fc98a7d0ebafab911b28dbb8586eb8 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Tue, 30 May 2023 18:04:01 +0800 Subject: [PATCH 06/13] resolve review comments --- datafusion/expr/src/logical_plan/plan.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index f1ef72d3148bc..b38ec4af74e12 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -403,6 +403,7 @@ impl LogicalPlan { Ok(using_columns) } + /// returns the first output expression of this `LogicalPlan` node. pub fn head_output_expr(&self) -> Result> { match self { LogicalPlan::Projection(projection) => { @@ -415,10 +416,12 @@ impl LogicalPlan { Ok(Some(agg.group_expr.as_slice()[0].clone())) } } - LogicalPlan::Filter(filter) => filter.input.head_output_expr(), - LogicalPlan::Distinct(distinct) => distinct.input.head_output_expr(), - LogicalPlan::Sort(sort) => sort.input.head_output_expr(), - LogicalPlan::Limit(limit) => limit.input.head_output_expr(), + LogicalPlan::Filter(Filter { input, .. }) + | LogicalPlan::Distinct(Distinct { input, .. }) + | LogicalPlan::Sort(Sort { input, .. }) + | LogicalPlan::Limit(Limit { input, .. }) + | LogicalPlan::Repartition(Repartition { input, .. }) + | LogicalPlan::Window(Window { input, .. 
}) => input.head_output_expr(), LogicalPlan::Join(Join { left, right, @@ -442,8 +445,6 @@ impl LogicalPlan { cross.left.head_output_expr() } } - LogicalPlan::Repartition(repartition) => repartition.input.head_output_expr(), - LogicalPlan::Window(window) => window.input.head_output_expr(), LogicalPlan::Union(union) => Ok(Some(Expr::Column( union.schema.fields()[0].qualified_column(), ))), From df94f18291e490eda3aea94c320338f846a4fd42 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 1 Jun 2023 10:53:13 +0800 Subject: [PATCH 07/13] refine the count bug handling logic --- datafusion/core/tests/sql/subqueries.rs | 225 ++++++++ datafusion/optimizer/src/decorrelate.rs | 505 ++++++++++++++++++ .../src/decorrelate_predicate_subquery.rs | 9 +- datafusion/optimizer/src/lib.rs | 1 + .../optimizer/src/scalar_subquery_to_join.rs | 111 ++-- datafusion/optimizer/src/utils.rs | 353 +----------- 6 files changed, 803 insertions(+), 401 deletions(-) create mode 100644 datafusion/optimizer/src/decorrelate.rs diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs index f3da709f88b86..d5d136c6f8339 100644 --- a/datafusion/core/tests/sql/subqueries.rs +++ b/datafusion/core/tests/sql/subqueries.rs @@ -1359,6 +1359,94 @@ async fn correlated_scalar_subquery_sum_agg_bug() -> Result<()> { Ok(()) } +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_with_having() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT count(*) + 2 as cnt_plus_2 FROM t2 WHERE t2.t2_int = t1.t1_int having count(*) >1) from t1"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // the having condition is kept as the normal filter condition, no need to pull up + let expected = vec![ + "Projection: t1.t1_id, __scalar_sq_1.cnt_plus_2 AS cnt_plus_2 [t1_id:UInt32;N, cnt_plus_2:Int64;N]", + " 
Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_id:UInt32;N, t1_int:UInt32;N, cnt_plus_2:Int64;N, t2_int:UInt32;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [cnt_plus_2:Int64;N, t2_int:UInt32;N]", + " Projection: COUNT(UInt8(1)) + Int64(2) AS cnt_plus_2, t2.t2_int [cnt_plus_2:Int64;N, t2_int:UInt32;N]", + " Filter: COUNT(UInt8(1)) > Int64(1) [t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N]", + " Projection: t2.t2_int, COUNT(UInt8(1)) [t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+------------+", + "| t1_id | cnt_plus_2 |", + "+-------+------------+", + "| 11 | |", + "| 22 | |", + "| 33 | 5 |", + "| 44 | |", + "+-------+------------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_with_pull_up_having() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "SELECT t1_id, (SELECT count(*) + 2 as cnt_plus_2 FROM t2 WHERE t2.t2_int = t1.t1_int having count(*) = 0) from t1"; + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // the having condition need to pull up and evaluated after the left out join + let expected = vec![ + "Projection: t1.t1_id, CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) AS cnt_plus_2 WHEN 
__scalar_sq_1.COUNT(UInt8(1)) != Int64(0) THEN NULL ELSE __scalar_sq_1.cnt_plus_2 END AS cnt_plus_2 [t1_id:UInt32;N, cnt_plus_2:Int64;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_id:UInt32;N, t1_int:UInt32;N, cnt_plus_2:Int64;N, t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_id, t1_int] [t1_id:UInt32;N, t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [cnt_plus_2:Int64;N, t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)) + Int64(2) AS cnt_plus_2, t2.t2_int, COUNT(UInt8(1)), __always_true [cnt_plus_2:Int64;N, t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+-------+------------+", + "| t1_id | cnt_plus_2 |", + "+-------+------------+", + "| 11 | |", + "| 22 | 2 |", + "| 33 | |", + "| 44 | 2 |", + "+-------+------------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + #[tokio::test] async fn correlated_scalar_subquery_count_agg_in_having() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id", true)?; @@ -1402,3 +1490,140 @@ async fn correlated_scalar_subquery_count_agg_in_having() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_in_nested_projection() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "select t1.t1_int from t1 where (select cnt from (select count(*) as cnt, 
sum(t2_int) from t2 where t1.t1_int = t2.t2_int)) = 0"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_int [t1_int:UInt32;N]", + " Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.cnt END = Int64(0) [t1_int:UInt32;N, cnt:Int64;N, __always_true:Boolean;N]", + " Projection: t1.t1_int, __scalar_sq_1.cnt, __scalar_sq_1.__always_true [t1_int:UInt32;N, cnt:Int64;N, __always_true:Boolean;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_int:UInt32;N, cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_int] [t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)) AS cnt, t2.t2_int, __always_true [cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+--------+", + "| t1_int |", + "+--------+", + "| 2 |", + "| 4 |", + "+--------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_in_nested_subquery() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "select t1.t1_int from t1 where \ + (select cnt_plus_one + 1 as cnt_plus_two from \ + (select cnt + 1 as cnt_plus_one from \ + (select count(*) as cnt, 
sum(t2_int) s from t2 where t1.t1_int = t2.t2_int having cnt = 0)\ + )\ + ) = 2"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + // pull up the deeply nested having condition + let expected = vec![ + "Projection: t1.t1_int [t1_int:UInt32;N]", + " Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(2) WHEN __scalar_sq_1.COUNT(UInt8(1)) != Int64(0) THEN NULL ELSE __scalar_sq_1.cnt_plus_two END = Int64(2) [t1_int:UInt32;N, cnt_plus_two:Int64;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " Projection: t1.t1_int, __scalar_sq_1.cnt_plus_two, __scalar_sq_1.COUNT(UInt8(1)), __scalar_sq_1.__always_true [t1_int:UInt32;N, cnt_plus_two:Int64;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_int:UInt32;N, cnt_plus_two:Int64;N, t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_int] [t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [cnt_plus_two:Int64;N, t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean]", + " Projection: COUNT(UInt8(1)) + Int64(1) + Int64(1) AS cnt_plus_two, t2.t2_int, COUNT(UInt8(1)), __always_true [cnt_plus_two:Int64;N, t2_int:UInt32;N, COUNT(UInt8(1)):Int64;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+--------+", + "| t1_int |", + "+--------+", + "| 2 |", + "| 4 |", + 
"+--------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} + +#[tokio::test] +async fn correlated_scalar_subquery_count_agg_in_case_when() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id", true)?; + + let sql = "select t1.t1_int from t1 where \ + (select case when count(*) = 1 then null else count(*) end as cnt from t2 where t2.t2_int = t1.t1_int)\ + = 0"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let plan = dataframe.into_optimized_plan()?; + + let expected = vec![ + "Projection: t1.t1_int [t1_int:UInt32;N]", + " Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.cnt END = Int64(0) [t1_int:UInt32;N, cnt:Int64;N, __always_true:Boolean;N]", + " Projection: t1.t1_int, __scalar_sq_1.cnt, __scalar_sq_1.__always_true [t1_int:UInt32;N, cnt:Int64;N, __always_true:Boolean;N]", + " Left Join: t1.t1_int = __scalar_sq_1.t2_int [t1_int:UInt32;N, cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean;N]", + " TableScan: t1 projection=[t1_int] [t1_int:UInt32;N]", + " SubqueryAlias: __scalar_sq_1 [cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Projection: CASE WHEN COUNT(UInt8(1)) = Int64(1) THEN Int64(NULL) ELSE COUNT(UInt8(1)) END AS cnt, t2.t2_int, __always_true [cnt:Int64;N, t2_int:UInt32;N, __always_true:Boolean]", + " Aggregate: groupBy=[[t2.t2_int, Boolean(true) AS __always_true]], aggr=[[COUNT(UInt8(1))]] [t2_int:UInt32;N, __always_true:Boolean, COUNT(UInt8(1)):Int64;N]", + " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + // assert data + let results = execute_to_batches(&ctx, sql).await; + let expected = vec![ + "+--------+", + "| t1_int |", + "+--------+", + "| 2 |", + "| 4 |", + 
"+--------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) +} diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs new file mode 100644 index 0000000000000..3e1a97cf88232 --- /dev/null +++ b/datafusion/optimizer/src/decorrelate.rs @@ -0,0 +1,505 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::simplify_expressions::{ExprSimplifier, SimplifyContext}; +use crate::utils::{ + collect_subquery_cols, conjunction, find_join_exprs, split_conjunction, +}; +use datafusion_common::tree_node::{ + RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, +}; +use datafusion_common::Result; +use datafusion_common::{Column, DFSchemaRef, DataFusionError, ScalarValue}; +use datafusion_expr::expr_rewriter::unnormalize_col; +use datafusion_expr::{ + expr, BinaryExpr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder, Operator, +}; +use datafusion_physical_expr::execution_props::ExecutionProps; +use std::collections::{BTreeSet, HashMap}; +use std::ops::Deref; + +/// This struct rewrite the sub query plan by pull up the correlated expressions(contains outer reference columns) from the inner subquery's [Filter]. 
+/// It adds the inner reference columns to the 'Projection' or 'Aggregate' of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. +pub struct PullUpCorrelatedExpr { + pub join_filters: Vec, + // mapping from the plan to its holding correlated columns + pub correlated_subquery_cols_map: HashMap>, + pub in_predicate_opt: Option, + // indicate whether it is Exists(Not Exists) SubQuery + pub exists_sub_query: bool, + // indicate whether the correlated expressions can pull up or not + pub can_pull_up: bool, + // indicate whether need to handle the Count bug during the pull up process + pub need_handle_count_bug: bool, + // mapping from the plan to its expressions' evaluation result on empty batch + pub collected_count_expr_map: HashMap, + // pull up having expr, which must be evaluated after the Join + pub pull_up_having_expr: Option, +} + +/// Used to indicate the unmatched rows from the inner(subquery) table after the left out Join +/// This is used to handle the Count bug +pub const UN_MATCHED_ROW_INDICATOR: &str = "__always_true"; + +/// Mapping from expr display name to its evaluation result on empty record batch (for example: 'count(*)' is 'ScalarValue(0)', 'count(*) + 2' is 'ScalarValue(2)') +pub type ExprResultMap = HashMap; + +impl TreeNodeRewriter for PullUpCorrelatedExpr { + type N = LogicalPlan; + + fn pre_visit(&mut self, plan: &LogicalPlan) -> Result { + match plan { + LogicalPlan::Filter(_) => Ok(RewriteRecursion::Continue), + LogicalPlan::Union(_) | LogicalPlan::Sort(_) | LogicalPlan::Extension(_) => { + let plan_hold_outer = !plan.all_out_ref_exprs().is_empty(); + if plan_hold_outer { + // the unsupported case + self.can_pull_up = false; + Ok(RewriteRecursion::Stop) + } else { + Ok(RewriteRecursion::Continue) + } + } + LogicalPlan::Limit(_) => { + let plan_hold_outer = !plan.all_out_ref_exprs().is_empty(); + match (self.exists_sub_query, plan_hold_outer) { + (false, true) => { + // the 
unsupported case + self.can_pull_up = false; + Ok(RewriteRecursion::Stop) + } + _ => Ok(RewriteRecursion::Continue), + } + } + _ if plan.expressions().iter().any(|expr| expr.contains_outer()) => { + // the unsupported cases, the plan expressions contain out reference columns(like window expressions) + self.can_pull_up = false; + Ok(RewriteRecursion::Stop) + } + _ => Ok(RewriteRecursion::Continue), + } + } + + fn mutate(&mut self, plan: LogicalPlan) -> Result { + let subquery_schema = plan.schema().clone(); + match &plan { + LogicalPlan::Filter(plan_filter) => { + let subquery_filter_exprs = split_conjunction(&plan_filter.predicate); + let (mut join_filters, subquery_filters) = + find_join_exprs(subquery_filter_exprs)?; + if let Some(in_predicate) = &self.in_predicate_opt { + // in_predicate may be already included in the join filters, remove it from the join filters first. + join_filters = remove_duplicated_filter(join_filters, in_predicate); + } + let correlated_subquery_cols = + collect_subquery_cols(&join_filters, subquery_schema)?; + for expr in join_filters { + if !self.join_filters.contains(&expr) { + self.join_filters.push(expr) + } + } + + let mut expr_result_map_for_count_bug = HashMap::new(); + let pull_up_expr_opt = if let Some(expr_result_map) = + self.collected_count_expr_map.get(plan_filter.input.deref()) + { + if let Some(expr) = conjunction(subquery_filters.clone()) { + filter_exprs_evaluation_result_on_empty_batch( + &expr, + plan_filter.input.schema().clone(), + expr_result_map, + &mut expr_result_map_for_count_bug, + )? 
+ } else { + None + } + } else { + None + }; + + match (&pull_up_expr_opt, &self.pull_up_having_expr) { + (Some(_), Some(_)) => { + // Error path + Err(DataFusionError::Plan( + "Unsupported Subquery plan".to_string(), + )) + } + (Some(_), None) => { + self.pull_up_having_expr = pull_up_expr_opt; + let new_plan = + LogicalPlanBuilder::from((*plan_filter.input).clone()) + .build()?; + self.correlated_subquery_cols_map + .insert(new_plan.clone(), correlated_subquery_cols); + Ok(new_plan) + } + (None, _) => { + // if the subquery still has filter expressions, restore them. + let mut plan = + LogicalPlanBuilder::from((*plan_filter.input).clone()); + if let Some(expr) = conjunction(subquery_filters) { + plan = plan.filter(expr)? + } + let new_plan = plan.build()?; + self.correlated_subquery_cols_map + .insert(new_plan.clone(), correlated_subquery_cols); + Ok(new_plan) + } + } + } + LogicalPlan::Projection(projection) + if self.in_predicate_opt.is_some() || !self.join_filters.is_empty() => + { + let mut local_correlated_cols = BTreeSet::new(); + collect_local_correlated_cols( + &plan, + &self.correlated_subquery_cols_map, + &mut local_correlated_cols, + ); + // add missing columns to Projection + let mut missing_exprs = + self.collect_missing_exprs(&projection.expr, &local_correlated_cols)?; + + let mut expr_result_map_for_count_bug = HashMap::new(); + if let Some(expr_result_map) = + self.collected_count_expr_map.get(projection.input.deref()) + { + proj_exprs_evaluation_result_on_empty_batch( + &projection.expr, + projection.input.schema().clone(), + expr_result_map, + &mut expr_result_map_for_count_bug, + )?; + if !expr_result_map_for_count_bug.is_empty() { + // has count bug + let un_matched_row = Expr::Column(Column::new_unqualified( + UN_MATCHED_ROW_INDICATOR.to_string(), + )); + // add the unmatched rows indicator to the Projection expressions + missing_exprs.push(un_matched_row); + } + } + + let new_plan = LogicalPlanBuilder::from((*projection.input).clone()) + 
.project(missing_exprs)? + .build()?; + if !expr_result_map_for_count_bug.is_empty() { + self.collected_count_expr_map + .insert(new_plan.clone(), expr_result_map_for_count_bug); + } + Ok(new_plan) + } + LogicalPlan::Aggregate(aggregate) + if self.in_predicate_opt.is_some() || !self.join_filters.is_empty() => + { + let mut local_correlated_cols = BTreeSet::new(); + collect_local_correlated_cols( + &plan, + &self.correlated_subquery_cols_map, + &mut local_correlated_cols, + ); + // add missing columns to Aggregation's group expressions + let mut missing_exprs = self.collect_missing_exprs( + &aggregate.group_expr, + &local_correlated_cols, + )?; + + // if the original group expressions are empty, need to handle the Count bug + let mut expr_result_map_for_count_bug = HashMap::new(); + if self.need_handle_count_bug + && aggregate.group_expr.is_empty() + && !missing_exprs.is_empty() + { + agg_exprs_evaluation_result_on_empty_batch( + &aggregate.aggr_expr, + aggregate.input.schema().clone(), + &mut expr_result_map_for_count_bug, + )?; + if !expr_result_map_for_count_bug.is_empty() { + // has count bug + let un_matched_row = Expr::Alias( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(true)))), + UN_MATCHED_ROW_INDICATOR.to_string(), + ); + // add the unmatched rows indicator to the Aggregation's group expressions + missing_exprs.push(un_matched_row); + } + } + let new_plan = LogicalPlanBuilder::from((*aggregate.input).clone()) + .aggregate(missing_exprs, aggregate.aggr_expr.to_vec())? 
+ .build()?; + if !expr_result_map_for_count_bug.is_empty() { + self.collected_count_expr_map + .insert(new_plan.clone(), expr_result_map_for_count_bug); + } + Ok(new_plan) + } + LogicalPlan::SubqueryAlias(alias) => { + let mut local_correlated_cols = BTreeSet::new(); + collect_local_correlated_cols( + &plan, + &self.correlated_subquery_cols_map, + &mut local_correlated_cols, + ); + let mut new_correlated_cols = BTreeSet::new(); + for col in local_correlated_cols.iter() { + new_correlated_cols + .insert(Column::new(Some(alias.alias.clone()), col.name.clone())); + } + self.correlated_subquery_cols_map + .insert(plan.clone(), new_correlated_cols); + Ok(plan) + } + LogicalPlan::Limit(limit) => { + // handling the limit clause in the subquery + match (self.exists_sub_query, self.join_filters.is_empty()) { + // un-correlated exist subquery, keep the limit + (true, true) => Ok(plan), + // Correlated exist subquery, remove the limit(so that correlated expressions can pull up) + (true, false) => { + if limit.fetch.filter(|limit_row| *limit_row == 0).is_some() { + Ok(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: limit.input.schema().clone(), + })) + } else { + LogicalPlanBuilder::from((*limit.input).clone()).build() + } + } + _ => Ok(plan), + } + } + _ => Ok(plan), + } + } +} + +impl PullUpCorrelatedExpr { + fn collect_missing_exprs( + &self, + exprs: &[Expr], + correlated_subquery_cols: &BTreeSet, + ) -> Result> { + let mut missing_exprs = vec![]; + if let Some(Expr::BinaryExpr(BinaryExpr { + left: _, + op: Operator::Eq, + right, + })) = &self.in_predicate_opt + { + if !matches!(right.deref(), Expr::Column(_)) + && !matches!(right.deref(), Expr::Literal(_)) + && !matches!(right.deref(), Expr::Alias(_, _)) + { + let alias_expr = right + .deref() + .clone() + .alias(format!("{:?}", unnormalize_col(right.deref().clone()))); + missing_exprs.push(alias_expr) + } + } + for expr in exprs { + if !missing_exprs.contains(expr) { + 
missing_exprs.push(expr.clone()) + } + } + for col in correlated_subquery_cols.iter() { + let col_expr = Expr::Column(col.clone()); + if !missing_exprs.contains(&col_expr) { + missing_exprs.push(col_expr) + } + } + if let Some(pull_up_having) = &self.pull_up_having_expr { + let filter_apply_columns = pull_up_having.to_columns()?; + for col in filter_apply_columns { + let col_expr = Expr::Column(col); + if !missing_exprs.contains(&col_expr) { + missing_exprs.push(col_expr) + } + } + } + + Ok(missing_exprs) + } +} + +fn collect_local_correlated_cols( + plan: &LogicalPlan, + all_cols_map: &HashMap>, + local_cols: &mut BTreeSet, +) { + for child in plan.inputs() { + if let Some(cols) = all_cols_map.get(child) { + local_cols.extend(cols.clone()); + } + // SubqueryAlias is treated as the leaf node + if !matches!(child, LogicalPlan::SubqueryAlias(_)) { + collect_local_correlated_cols(child, all_cols_map, local_cols); + } + } +} + +fn remove_duplicated_filter(filters: Vec, in_predicate: &Expr) -> Vec { + filters + .into_iter() + .filter(|filter| { + if filter == in_predicate { + return false; + } + + // ignore the binary order + !match (filter, in_predicate) { + (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { + (a_expr.op == b_expr.op) + && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) + || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) + } + _ => false, + } + }) + .collect::>() +} + +fn agg_exprs_evaluation_result_on_empty_batch( + agg_expr: &[Expr], + schema: DFSchemaRef, + expr_result_map_for_count_bug: &mut ExprResultMap, +) -> Result<()> { + for e in agg_expr.iter() { + let result_expr = e.clone().transform_up(&|expr| { + let new_expr = match expr { + Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction { + fun, + .. 
+ }) => { + if matches!(fun, datafusion_expr::AggregateFunction::Count) { + Transformed::Yes(Expr::Literal(ScalarValue::Int64(Some(0)))) + } else { + Transformed::Yes(Expr::Literal(ScalarValue::Null)) + } + } + Expr::AggregateUDF(_) => { + Transformed::Yes(Expr::Literal(ScalarValue::Null)) + } + _ => Transformed::No(expr), + }; + Ok(new_expr) + })?; + + let props = ExecutionProps::new(); + let info = SimplifyContext::new(&props).with_schema(schema.clone()); + let simplifier = ExprSimplifier::new(info); + let result_expr = simplifier.simplify(result_expr)?; + if matches!(result_expr, Expr::Literal(ScalarValue::Int64(_))) { + expr_result_map_for_count_bug.insert(e.display_name()?, result_expr); + } + } + Ok(()) +} + +fn proj_exprs_evaluation_result_on_empty_batch( + proj_expr: &[Expr], + schema: DFSchemaRef, + input_expr_result_map_for_count_bug: &ExprResultMap, + expr_result_map_for_count_bug: &mut ExprResultMap, +) -> Result<()> { + for expr in proj_expr.iter() { + let result_expr = expr.clone().transform_up(&|expr| { + if let Expr::Column(Column { name, .. 
}) = &expr { + if let Some(result_expr) = input_expr_result_map_for_count_bug.get(name) { + Ok(Transformed::Yes(result_expr.clone())) + } else { + Ok(Transformed::No(expr)) + } + } else { + Ok(Transformed::No(expr)) + } + })?; + if result_expr.ne(expr) { + let props = ExecutionProps::new(); + let info = SimplifyContext::new(&props).with_schema(schema.clone()); + let simplifier = ExprSimplifier::new(info); + let result_expr = simplifier.simplify(result_expr)?; + let expr_name = match expr { + Expr::Alias(_, alias) => alias.to_string(), + Expr::Column(Column { relation: _, name }) => name.to_string(), + _ => expr.display_name()?, + }; + expr_result_map_for_count_bug.insert(expr_name, result_expr); + } + } + Ok(()) +} + +fn filter_exprs_evaluation_result_on_empty_batch( + filter_expr: &Expr, + schema: DFSchemaRef, + input_expr_result_map_for_count_bug: &ExprResultMap, + expr_result_map_for_count_bug: &mut ExprResultMap, +) -> Result> { + let result_expr = filter_expr.clone().transform_up(&|expr| { + if let Expr::Column(Column { name, .. 
}) = &expr { + if let Some(result_expr) = input_expr_result_map_for_count_bug.get(name) { + Ok(Transformed::Yes(result_expr.clone())) + } else { + Ok(Transformed::No(expr)) + } + } else { + Ok(Transformed::No(expr)) + } + })?; + let pull_up_expr = if result_expr.ne(filter_expr) { + let props = ExecutionProps::new(); + let info = SimplifyContext::new(&props).with_schema(schema); + let simplifier = ExprSimplifier::new(info); + let result_expr = simplifier.simplify(result_expr)?; + match &result_expr { + // evaluate to false or null on empty batch, no need to pull up + Expr::Literal(ScalarValue::Null) + | Expr::Literal(ScalarValue::Boolean(Some(false))) => None, + // evaluate to true on empty batch, need to pull up the expr + Expr::Literal(ScalarValue::Boolean(Some(true))) => { + for (name, exprs) in input_expr_result_map_for_count_bug { + expr_result_map_for_count_bug.insert(name.clone(), exprs.clone()); + } + Some(filter_expr.clone()) + } + // can not evaluate statically + _ => { + for input_expr in input_expr_result_map_for_count_bug.values() { + let new_expr = Expr::Case(expr::Case { + expr: None, + when_then_expr: vec![( + Box::new(result_expr.clone()), + Box::new(input_expr.clone()), + )], + else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null))), + }); + expr_result_map_for_count_bug + .insert(new_expr.display_name()?, new_expr); + } + None + } + } + } else { + for (name, exprs) in input_expr_result_map_for_count_bug { + expr_result_map_for_count_bug.insert(name.clone(), exprs.clone()); + } + None + }; + Ok(pull_up_expr) +} diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 449bec48b051f..cda921c188429 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -16,10 +16,9 @@ // under the License. 
use crate::alias::AliasGenerator; +use crate::decorrelate::PullUpCorrelatedExpr; use crate::optimizer::ApplyOrder; -use crate::utils::{ - conjunction, replace_qualified_name, split_conjunction, PullUpCorrelatedExpr, -}; +use crate::utils::{conjunction, replace_qualified_name, split_conjunction}; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::TreeNode; use datafusion_common::{Column, DataFusionError, Result}; @@ -223,9 +222,9 @@ fn build_join( in_predicate_opt: in_predicate_opt.clone(), exists_sub_query: in_predicate_opt.is_none(), can_pull_up: true, - need_collect_count_expr_map: false, + need_handle_count_bug: false, collected_count_expr_map: Default::default(), - expr_check_map: Default::default(), + pull_up_having_expr: None, }; let new_plan = subquery.clone().rewrite(&mut pull_up)?; if !pull_up.can_pull_up { diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs index 2af5edbd9fd34..ec971b7fbd622 100644 --- a/datafusion/optimizer/src/lib.rs +++ b/datafusion/optimizer/src/lib.rs @@ -18,6 +18,7 @@ pub mod alias; pub mod analyzer; pub mod common_subexpr_eliminate; +pub mod decorrelate; pub mod decorrelate_predicate_subquery; pub mod eliminate_cross_join; pub mod eliminate_duplicated_expr; diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 46ad5f2a80fac..97cc25768db17 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -16,15 +16,14 @@ // under the License. 
use crate::alias::AliasGenerator; +use crate::decorrelate::{PullUpCorrelatedExpr, UN_MATCHED_ROW_INDICATOR}; use crate::optimizer::ApplyOrder; -use crate::utils::{ - conjunction, replace_qualified_name, ExprCheckMap, PullUpCorrelatedExpr, -}; +use crate::utils::{conjunction, replace_qualified_name}; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::{ RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, }; -use datafusion_common::{Column, DataFusionError, Result}; +use datafusion_common::{Column, DataFusionError, Result, ScalarValue}; use datafusion_expr::logical_plan::{JoinType, Subquery}; use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; use std::collections::{BTreeSet, HashMap}; @@ -82,30 +81,16 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut cur_input = filter.input.as_ref().clone(); for (subquery, alias) in subqueries { if let Some((optimized_subquery, expr_check_map)) = - build_join(&subquery, &cur_input, &alias, true)? + build_join(&subquery, &cur_input, &alias)? { if !expr_check_map.is_empty() { rewrite_expr = rewrite_expr.clone().transform_up(&|expr| { if let Expr::Column(col) = &expr { - if let Some((expr1, expr2)) = + if let Some(map_expr) = expr_check_map.get(&col.name) { - let new_expr = Expr::Case(expr::Case { - expr: None, - when_then_expr: vec![( - Box::new(Expr::IsNull(Box::new( - Expr::Column( - Column::new_unqualified( - "__always_true", - ), - ), - ))), - Box::new(expr2.clone()), - )], - else_expr: Some(Box::new(expr1.clone())), - }); - Ok(Transformed::Yes(new_expr)) + Ok(Transformed::Yes(map_expr.clone())) } else { Ok(Transformed::No(expr)) } @@ -146,7 +131,7 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut cur_input = projection.input.as_ref().clone(); for (subquery, alias) in all_subqueryies { if let Some((optimized_subquery, expr_check_map)) = - build_join(&subquery, &cur_input, &alias, true)? + build_join(&subquery, &cur_input, &alias)? 
{ cur_input = optimized_subquery; if !expr_check_map.is_empty() { @@ -154,29 +139,20 @@ impl OptimizerRule for ScalarSubqueryToJoin { if let Some(rewrite_expr) = expr_to_rewrite_expr_map.get(expr) { - let new_expr = rewrite_expr.clone().transform_up(&|expr| { - if let Expr::Column(col) = &expr { - if let Some((expr1, expr2)) = expr_check_map.get(&col.name) - { - let new_expr = Expr::Case(expr::Case { - expr: None, - when_then_expr: vec![( - Box::new(Expr::IsNull(Box::new( - Expr::Column(Column::new_unqualified("__always_true")), - ))), - Box::new(expr2.clone()), - )], - else_expr: Some(Box::new(expr1.clone())), - }); - Ok(Transformed::Yes(new_expr)) + let new_expr = + rewrite_expr.clone().transform_up(&|expr| { + if let Expr::Column(col) = &expr { + if let Some(map_expr) = + expr_check_map.get(&col.name) + { + Ok(Transformed::Yes(map_expr.clone())) + } else { + Ok(Transformed::No(expr)) + } } else { Ok(Transformed::No(expr)) } - } else { - Ok(Transformed::No(expr)) - } - - })?; + })?; expr_to_rewrite_expr_map.insert(expr, new_expr); } } @@ -303,8 +279,7 @@ fn build_join( subquery: &Subquery, filter_input: &LogicalPlan, subquery_alias: &str, - need_collect_count_expr_map: bool, -) -> Result> { +) -> Result)>> { let subquery_plan = subquery.subquery.as_ref(); let mut pull_up = PullUpCorrelatedExpr { join_filters: vec![], @@ -312,15 +287,17 @@ fn build_join( in_predicate_opt: None, exists_sub_query: false, can_pull_up: true, - need_collect_count_expr_map, + need_handle_count_bug: true, collected_count_expr_map: Default::default(), - expr_check_map: Default::default(), + pull_up_having_expr: None, }; let new_plan = subquery_plan.clone().rewrite(&mut pull_up)?; if !pull_up.can_pull_up { return Ok(None); } + let collected_count_expr_map = + pull_up.collected_count_expr_map.get(&new_plan).cloned(); let sub_query_alias = LogicalPlanBuilder::from(new_plan) .alias(subquery_alias.to_string())? .build()?; @@ -368,7 +345,47 @@ fn build_join( )? .build()? 
}; - Ok(Some((new_plan, pull_up.expr_check_map.clone()))) + let mut computation_project_expr = HashMap::new(); + if let Some(expr_map) = collected_count_expr_map { + for (name, result) in expr_map { + let computer_expr = if let Some(filter) = &pull_up.pull_up_having_expr { + Expr::Case(expr::Case { + expr: None, + when_then_expr: vec![ + ( + Box::new(Expr::IsNull(Box::new(Expr::Column( + Column::new_unqualified(UN_MATCHED_ROW_INDICATOR), + )))), + Box::new(result), + ), + ( + Box::new(Expr::Not(Box::new(filter.clone()))), + Box::new(Expr::Literal(ScalarValue::Null)), + ), + ], + else_expr: Some(Box::new(Expr::Column(Column::new_unqualified( + name.clone(), + )))), + }) + } else { + Expr::Case(expr::Case { + expr: None, + when_then_expr: vec![( + Box::new(Expr::IsNull(Box::new(Expr::Column( + Column::new_unqualified(UN_MATCHED_ROW_INDICATOR), + )))), + Box::new(result), + )], + else_expr: Some(Box::new(Expr::Column(Column::new_unqualified( + name.clone(), + )))), + }) + }; + computation_project_expr.insert(name, computer_expr); + } + } + + Ok(Some((new_plan, computation_project_expr))) } #[cfg(test)] diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs index 177e9bf4ab545..32ef4e087923d 100644 --- a/datafusion/optimizer/src/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -17,28 +17,20 @@ //! 
Collection of utility functions that are leveraged by the query optimizer rules -use crate::simplify_expressions::{ExprSimplifier, SimplifyContext}; use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::tree_node::{ - RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, -}; -use datafusion_common::{plan_err, Column, DFSchemaRef, ScalarValue}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRewriter}; +use datafusion_common::{plan_err, Column, DFSchemaRef}; use datafusion_common::{DFSchema, Result}; use datafusion_expr::expr::{BinaryExpr, Sort}; -use datafusion_expr::expr_rewriter::{ - replace_col, strip_outer_reference, unnormalize_col, -}; -use datafusion_expr::logical_plan::LogicalPlanBuilder; +use datafusion_expr::expr_rewriter::{replace_col, strip_outer_reference}; use datafusion_expr::utils::from_plan; use datafusion_expr::{ and, logical_plan::{Filter, LogicalPlan}, - AggregateFunction, EmptyRelation, Expr, Operator, + Expr, Operator, }; -use datafusion_physical_expr::execution_props::ExecutionProps; use log::{debug, trace}; use std::collections::{BTreeSet, HashMap}; -use std::ops::Deref; use std::sync::Arc; /// Convenience rule for writing optimizers: recursively invoke @@ -393,343 +385,6 @@ pub fn log_plan(description: &str, plan: &LogicalPlan) { trace!("{description}::\n{}\n", plan.display_indent_schema()); } -/// This struct rewrite the sub query plan by pull up the correlated expressions(contains outer reference columns) from the inner subquery's [Filter]. -/// It adds the inner reference columns to the 'Projection' or 'Aggregate' of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. 
-pub struct PullUpCorrelatedExpr { - pub join_filters: Vec, - // mapping from the plan to its holding correlated columns - pub correlated_subquery_cols_map: HashMap>, - pub in_predicate_opt: Option, - // indicate whether it is Exists(Not Exists) SubQuery - pub exists_sub_query: bool, - // indicate whether the correlated expressions can pull up or not - pub can_pull_up: bool, - // indicate whether the subquery need to collect count expr mapping - pub need_collect_count_expr_map: bool, - // mapping from expr name to the pair of agg expr and its evaluation result on empty record batch - pub collected_count_expr_map: HashMap, - pub expr_check_map: ExprCheckMap, -} - -pub type ExprCheckMap = HashMap; - -impl TreeNodeRewriter for PullUpCorrelatedExpr { - type N = LogicalPlan; - - fn pre_visit(&mut self, plan: &LogicalPlan) -> Result { - match plan { - LogicalPlan::Filter(_) => Ok(RewriteRecursion::Continue), - LogicalPlan::Union(_) | LogicalPlan::Sort(_) | LogicalPlan::Extension(_) => { - let plan_hold_outer = !plan.all_out_ref_exprs().is_empty(); - if plan_hold_outer { - // the unsupported case - self.can_pull_up = false; - Ok(RewriteRecursion::Stop) - } else { - Ok(RewriteRecursion::Continue) - } - } - LogicalPlan::Limit(_) => { - let plan_hold_outer = !plan.all_out_ref_exprs().is_empty(); - match (self.exists_sub_query, plan_hold_outer) { - (false, true) => { - // the unsupported case - self.can_pull_up = false; - Ok(RewriteRecursion::Stop) - } - _ => Ok(RewriteRecursion::Continue), - } - } - _ if plan.expressions().iter().any(|expr| expr.contains_outer()) => { - // the unsupported cases, the plan expressions contain out reference columns(like window expressions) - self.can_pull_up = false; - Ok(RewriteRecursion::Stop) - } - _ => Ok(RewriteRecursion::Continue), - } - } - - fn mutate(&mut self, plan: LogicalPlan) -> Result { - let subquery_schema = plan.schema().clone(); - match &plan { - LogicalPlan::Filter(plan_filter) => { - let subquery_filter_exprs = 
split_conjunction(&plan_filter.predicate); - let (mut join_filters, subquery_filters) = - find_join_exprs(subquery_filter_exprs)?; - if let Some(in_predicate) = &self.in_predicate_opt { - // in_predicate may be already included in the join filters, remove it from the join filters first. - join_filters = remove_duplicated_filter(join_filters, in_predicate); - } - let correlated_subquery_cols = - collect_subquery_cols(&join_filters, subquery_schema)?; - for expr in join_filters { - if !self.join_filters.contains(&expr) { - self.join_filters.push(expr) - } - } - // if the subquery still has filter expressions, restore them. - let mut plan = LogicalPlanBuilder::from((*plan_filter.input).clone()); - if let Some(expr) = conjunction(subquery_filters) { - plan = plan.filter(expr)? - } - let new_plan = plan.build()?; - self.correlated_subquery_cols_map - .insert(new_plan.clone(), correlated_subquery_cols); - Ok(new_plan) - } - LogicalPlan::Projection(projection) - if self.in_predicate_opt.is_some() || !self.join_filters.is_empty() => - { - let mut local_correlated_cols = BTreeSet::new(); - collect_local_correlated_cols( - &plan, - &self.correlated_subquery_cols_map, - &mut local_correlated_cols, - ); - // add missing columns to Projection - let mut missing_exprs = - self.collect_missing_exprs(&projection.expr, &local_correlated_cols)?; - if !self.collected_count_expr_map.is_empty() { - let head_expr = missing_exprs.get(0); - if let Some(expr) = head_expr { - let result_expr = expr.clone().transform_up(&|expr| { - if let Expr::Column(Column { name, .. 
}) = &expr { - if let Some((_, result_expr)) = - self.collected_count_expr_map.get(name) - { - Ok(Transformed::Yes(result_expr.clone())) - } else { - Ok(Transformed::No(expr)) - } - } else { - Ok(Transformed::No(expr)) - } - })?; - let scalar_expr = match expr { - Expr::Alias(_, alias) => ( - alias.to_string(), - Expr::Column(Column::new_unqualified(alias)), - ), - Expr::Column(Column { relation: _, name }) => { - (name.to_string(), expr.clone()) - } - _ => { - let scalar_column = expr.display_name()?; - ( - scalar_column.clone(), - Expr::Column(Column::new_unqualified(scalar_column)), - ) - } - }; - self.expr_check_map - .insert(scalar_expr.0, (scalar_expr.1, result_expr)); - missing_exprs.push(Expr::Column(Column::new_unqualified( - "__always_true".to_string(), - ))); - } - } - - let new_plan = LogicalPlanBuilder::from((*projection.input).clone()) - .project(missing_exprs)? - .build()?; - Ok(new_plan) - } - LogicalPlan::Aggregate(aggregate) - if self.in_predicate_opt.is_some() || !self.join_filters.is_empty() => - { - let mut local_correlated_cols = BTreeSet::new(); - collect_local_correlated_cols( - &plan, - &self.correlated_subquery_cols_map, - &mut local_correlated_cols, - ); - // add missing columns to Aggregation's group expression - let mut missing_exprs = self.collect_missing_exprs( - &aggregate.group_expr, - &local_correlated_cols, - )?; - - if self.need_collect_count_expr_map && aggregate.group_expr.is_empty() { - let agg_result_exprs = agg_exprs_eva_result_on_empty_batch( - &aggregate.aggr_expr, - subquery_schema, - )?; - if !missing_exprs.is_empty() { - let scalar_agg = !agg_result_exprs.values().any(|result_expr| { - matches!(result_expr, Expr::Literal(ScalarValue::Null)) - }); - if scalar_agg { - let internal_always_true_col = Expr::Alias( - Box::new(Expr::Literal(ScalarValue::Boolean(Some(true)))), - "__always_true".to_string(), - ); - missing_exprs.push(internal_always_true_col); - for (agg_expr, result_expr_on_empty) in agg_result_exprs { - let 
agg_expr_name = agg_expr.display_name()?; - self.collected_count_expr_map.insert( - agg_expr_name, - (agg_expr, result_expr_on_empty), - ); - } - } - } - } - - let new_plan = LogicalPlanBuilder::from((*aggregate.input).clone()) - .aggregate(missing_exprs, aggregate.aggr_expr.to_vec())? - .build()?; - Ok(new_plan) - } - LogicalPlan::SubqueryAlias(alias) => { - let mut local_correlated_cols = BTreeSet::new(); - collect_local_correlated_cols( - &plan, - &self.correlated_subquery_cols_map, - &mut local_correlated_cols, - ); - let mut new_correlated_cols = BTreeSet::new(); - for col in local_correlated_cols.iter() { - new_correlated_cols - .insert(Column::new(Some(alias.alias.clone()), col.name.clone())); - } - self.correlated_subquery_cols_map - .insert(plan.clone(), new_correlated_cols); - Ok(plan) - } - LogicalPlan::Limit(limit) => { - // handling the limit clause in the subquery - match (self.exists_sub_query, self.join_filters.is_empty()) { - // un-correlated exist subquery, keep the limit - (true, true) => Ok(plan), - // Correlated exist subquery, remove the limit(so that correlated expressions can pull up) - (true, false) => { - if limit.fetch.filter(|limit_row| *limit_row == 0).is_some() { - Ok(LogicalPlan::EmptyRelation(EmptyRelation { - produce_one_row: false, - schema: limit.input.schema().clone(), - })) - } else { - LogicalPlanBuilder::from((*limit.input).clone()).build() - } - } - _ => Ok(plan), - } - } - _ => Ok(plan), - } - } -} - -impl PullUpCorrelatedExpr { - fn collect_missing_exprs( - &self, - exprs: &[Expr], - correlated_subquery_cols: &BTreeSet, - ) -> Result> { - let mut missing_exprs = vec![]; - if let Some(Expr::BinaryExpr(BinaryExpr { - left: _, - op: Operator::Eq, - right, - })) = &self.in_predicate_opt - { - if !matches!(right.deref(), Expr::Column(_)) - && !matches!(right.deref(), Expr::Literal(_)) - && !matches!(right.deref(), Expr::Alias(_, _)) - { - let alias_expr = right - .deref() - .clone() - .alias(format!("{:?}", 
unnormalize_col(right.deref().clone()))); - missing_exprs.push(alias_expr) - } - } - for expr in exprs { - if !missing_exprs.contains(expr) { - missing_exprs.push(expr.clone()) - } - } - for col in correlated_subquery_cols.iter() { - let col_expr = Expr::Column(col.clone()); - if !missing_exprs.contains(&col_expr) { - missing_exprs.push(col_expr) - } - } - Ok(missing_exprs) - } -} - -fn collect_local_correlated_cols( - plan: &LogicalPlan, - all_cols_map: &HashMap>, - local_cols: &mut BTreeSet, -) { - for child in plan.inputs() { - if let Some(cols) = all_cols_map.get(child) { - local_cols.extend(cols.clone()); - } - // SubqueryAlias is treated as the leaf node - if !matches!(child, LogicalPlan::SubqueryAlias(_)) { - collect_local_correlated_cols(child, all_cols_map, local_cols); - } - } -} - -fn remove_duplicated_filter(filters: Vec, in_predicate: &Expr) -> Vec { - filters - .into_iter() - .filter(|filter| { - if filter == in_predicate { - return false; - } - - // ignore the binary order - !match (filter, in_predicate) { - (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { - (a_expr.op == b_expr.op) - && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) - || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) - } - _ => false, - } - }) - .collect::>() -} - -fn agg_exprs_eva_result_on_empty_batch( - agg_expr: &[Expr], - schema: DFSchemaRef, -) -> Result> { - let mut result_expr_map = HashMap::new(); - for e in agg_expr.iter() { - let new_expr = e.clone().transform_up(&|expr| { - let new_expr = match expr { - Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction { - fun, - .. 
- }) => { - if matches!(fun, AggregateFunction::Count) { - Transformed::Yes(Expr::Literal(ScalarValue::Int64(Some(0)))) - } else { - Transformed::Yes(Expr::Literal(ScalarValue::Null)) - } - } - Expr::AggregateUDF(_) => { - Transformed::Yes(Expr::Literal(ScalarValue::Null)) - } - _ => Transformed::No(expr), - }; - Ok(new_expr) - })?; - - let props = ExecutionProps::new(); - let info = SimplifyContext::new(&props).with_schema(schema.clone()); - let simplifier = ExprSimplifier::new(info); - result_expr_map.insert(e.clone(), simplifier.simplify(new_expr)?); - } - Ok(result_expr_map) -} - #[cfg(test)] mod tests { use super::*; From 75eeb88f745b06e2eb30cda6a026c90acd799b09 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 1 Jun 2023 11:30:16 +0800 Subject: [PATCH 08/13] tiny fix --- datafusion/core/tests/sql/subqueries.rs | 1 - datafusion/optimizer/src/decorrelate.rs | 28 ++++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/datafusion/core/tests/sql/subqueries.rs b/datafusion/core/tests/sql/subqueries.rs index d5d136c6f8339..30dfdeb3eaaf2 100644 --- a/datafusion/core/tests/sql/subqueries.rs +++ b/datafusion/core/tests/sql/subqueries.rs @@ -961,7 +961,6 @@ async fn uncorrelated_scalar_subquery_with_limit0() -> Result<()> { let dataframe = ctx.sql(sql).await.expect(&msg); let plan = dataframe.into_optimized_plan()?; - // not de-correlated let expected = vec![ "Projection: t1.t1_id, __scalar_sq_1.t2_id AS t2_id [t1_id:UInt32;N, t2_id:UInt32;N]", " Left Join: [t1_id:UInt32;N, t2_id:UInt32;N]", diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 3e1a97cf88232..b40642abd3fd7 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -262,26 +262,40 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { } self.correlated_subquery_cols_map .insert(plan.clone(), new_correlated_cols); + if let Some(input_map) = + 
self.collected_count_expr_map.get(alias.input.deref()) + { + self.collected_count_expr_map + .insert(plan.clone(), input_map.clone()); + } Ok(plan) } LogicalPlan::Limit(limit) => { + let input_expr_map = self + .collected_count_expr_map + .get(limit.input.deref()) + .cloned(); // handling the limit clause in the subquery - match (self.exists_sub_query, self.join_filters.is_empty()) { - // un-correlated exist subquery, keep the limit - (true, true) => Ok(plan), + let new_plan = match (self.exists_sub_query, self.join_filters.is_empty()) + { // Correlated exist subquery, remove the limit(so that correlated expressions can pull up) (true, false) => { if limit.fetch.filter(|limit_row| *limit_row == 0).is_some() { - Ok(LogicalPlan::EmptyRelation(EmptyRelation { + LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: limit.input.schema().clone(), - })) + }) } else { - LogicalPlanBuilder::from((*limit.input).clone()).build() + LogicalPlanBuilder::from((*limit.input).clone()).build()? } } - _ => Ok(plan), + _ => plan, + }; + if let Some(input_map) = input_expr_map { + self.collected_count_expr_map + .insert(new_plan.clone(), input_map); } + Ok(new_plan) } _ => Ok(plan), } From 717f51dbc7d49316455c2074aa7884e1b19d3729 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 1 Jun 2023 13:08:49 +0800 Subject: [PATCH 09/13] fix doc --- datafusion/optimizer/src/decorrelate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index b40642abd3fd7..db4a1f2c409a2 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -32,7 +32,7 @@ use datafusion_physical_expr::execution_props::ExecutionProps; use std::collections::{BTreeSet, HashMap}; use std::ops::Deref; -/// This struct rewrite the sub query plan by pull up the correlated expressions(contains outer reference columns) from the inner subquery's [Filter]. 
+/// This struct rewrite the sub query plan by pull up the correlated expressions(contains outer reference columns) from the inner subquery's 'Filter'. /// It adds the inner reference columns to the 'Projection' or 'Aggregate' of the subquery if they are missing, so that they can be evaluated by the parent operator as the join condition. pub struct PullUpCorrelatedExpr { pub join_filters: Vec, From a752ee8f351a51606260e37fc8e781ffec3ff74b Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 1 Jun 2023 17:40:56 +0800 Subject: [PATCH 10/13] fix tpch planning change --- .../test_files/tpch/q11.slt.part | 157 +++++++++--------- .../test_files/tpch/q15.slt.part | 53 +++--- .../test_files/tpch/q17.slt.part | 8 +- .../sqllogictests/test_files/tpch/q2.slt.part | 10 +- .../test_files/tpch/q20.slt.part | 8 +- .../test_files/tpch/q22.slt.part | 75 ++++----- 6 files changed, 149 insertions(+), 162 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch/q11.slt.part b/datafusion/core/tests/sqllogictests/test_files/tpch/q11.slt.part index bc6d166b8680f..b3a462baaa0d3 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch/q11.slt.part +++ b/datafusion/core/tests/sqllogictests/test_files/tpch/q11.slt.part @@ -50,92 +50,89 @@ logical_plan Limit: skip=0, fetch=10 --Sort: value DESC NULLS FIRST, fetch=10 ----Projection: partsupp.ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS value -------Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.__value ---------CrossJoin: -----------Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] -------------Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost ---------------Inner Join: supplier.s_nationkey = nation.n_nationkey -----------------Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, 
supplier.s_nationkey -------------------Inner Join: partsupp.ps_suppkey = supplier.s_suppkey ---------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] ---------------------TableScan: supplier projection=[s_suppkey, s_nationkey] -----------------Projection: nation.n_nationkey -------------------Filter: nation.n_name = Utf8("GERMANY") ---------------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("GERMANY")] -----------SubqueryAlias: __scalar_sq_1 -------------Projection: CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) AS __value ---------------Aggregate: groupBy=[[]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] -----------------Projection: partsupp.ps_availqty, partsupp.ps_supplycost -------------------Inner Join: supplier.s_nationkey = nation.n_nationkey ---------------------Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey -----------------------Inner Join: partsupp.ps_suppkey = supplier.s_suppkey -------------------------TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] -------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] ---------------------Projection: nation.n_nationkey -----------------------Filter: nation.n_name = Utf8("GERMANY") -------------------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("GERMANY")] +------Inner Join: Filter: CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001) +--------Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] +----------Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost 
+------------Inner Join: supplier.s_nationkey = nation.n_nationkey +--------------Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey +----------------Inner Join: partsupp.ps_suppkey = supplier.s_suppkey +------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] +------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +--------------Projection: nation.n_nationkey +----------------Filter: nation.n_name = Utf8("GERMANY") +------------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("GERMANY")] +--------SubqueryAlias: __scalar_sq_1 +----------Projection: CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) +------------Aggregate: groupBy=[[]], aggr=[[SUM(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] +--------------Projection: partsupp.ps_availqty, partsupp.ps_supplycost +----------------Inner Join: supplier.s_nationkey = nation.n_nationkey +------------------Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey +--------------------Inner Join: partsupp.ps_suppkey = supplier.s_suppkey +----------------------TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] +----------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +------------------Projection: nation.n_nationkey +--------------------Filter: nation.n_name = Utf8("GERMANY") +----------------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("GERMANY")] physical_plan GlobalLimitExec: skip=0, fetch=10 ---SortExec: fetch=10, expr=[value@1 DESC] -----ProjectionExec: expr=[ps_partkey@0 as ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] -------CoalesceBatchesExec: target_batch_size=8192 ---------FilterExec: CAST(SUM(partsupp.ps_supplycost * 
partsupp.ps_availqty)@1 AS Decimal128(38, 15)) > __value@2 -----------CrossJoinExec -------------CoalescePartitionsExec ---------------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] -----------------CoalesceBatchesExec: target_batch_size=8192 -------------------RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 4), input_partitions=4 ---------------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] -----------------------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost] +--SortPreservingMergeExec: [value@1 DESC] +----SortExec: fetch=10, expr=[value@1 DESC] +------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@1 as value] +--------NestedLoopJoinExec: join_type=Inner, filter=BinaryExpr { left: CastExpr { expr: Column { name: "SUM(partsupp.ps_supplycost * partsupp.ps_availqty)", index: 0 }, cast_type: Decimal128(38, 15), cast_options: CastOptions { safe: false, format_options: FormatOptions { safe: true, null: "", date_format: None, datetime_format: None, timestamp_format: None, timestamp_tz_format: None, time_format: None } } }, op: Gt, right: Column { name: "SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)", index: 1 } } +----------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] +------------CoalesceBatchesExec: target_batch_size=8192 +--------------RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 4), input_partitions=4 +----------------AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] +------------------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@1 as 
ps_availqty, ps_supplycost@2 as ps_supplycost] +--------------------CoalesceBatchesExec: target_batch_size=8192 +----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] ------------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 3 }, Column { name: "n_nationkey", index: 0 })] -----------------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------------RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 4), input_partitions=4 ---------------------------------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@5 as s_nationkey] +--------------------------RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 4), input_partitions=4 +----------------------------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@5 as s_nationkey] +------------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 1 }, Column { name: "s_suppkey", index: 0 })] ----------------------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 1 }, Column { name: "s_suppkey", index: 0 })] ---------------------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------------------RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 4), input_partitions=4 -------------------------------------------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 ---------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/partsupp.tbl]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], has_header=false ---------------------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------------------RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 4), input_partitions=4 -------------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ---------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], has_header=false -----------------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------------RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 4), input_partitions=4 ---------------------------------ProjectionExec: expr=[n_nationkey@0 as n_nationkey] +------------------------------------RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 4), input_partitions=4 +--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/partsupp.tbl]]}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost], has_header=false +----------------------------------CoalesceBatchesExec: target_batch_size=8192 +------------------------------------RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 4), input_partitions=4 +--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 
+----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], has_header=false +------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 4), input_partitions=4 +----------------------------ProjectionExec: expr=[n_nationkey@0 as n_nationkey] +------------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------------FilterExec: n_name@1 = GERMANY +----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false +----------ProjectionExec: expr=[CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as SUM(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] +------------AggregateExec: mode=Final, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] +--------------CoalescePartitionsExec +----------------AggregateExec: mode=Partial, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] +------------------ProjectionExec: expr=[ps_availqty@0 as ps_availqty, ps_supplycost@1 as ps_supplycost] +--------------------CoalesceBatchesExec: target_batch_size=8192 +----------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 2 }, Column { name: "n_nationkey", index: 0 })] +------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 4), input_partitions=4 +----------------------------ProjectionExec: 
expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@4 as s_nationkey] +------------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 0 }, Column { name: "s_suppkey", index: 0 })] +----------------------------------CoalesceBatchesExec: target_batch_size=8192 +------------------------------------RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 4), input_partitions=4 +--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/partsupp.tbl]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], has_header=false ----------------------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------------------FilterExec: n_name@1 = GERMANY +------------------------------------RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 4), input_partitions=4 --------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false -------------ProjectionExec: expr=[CAST(CAST(SUM(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as __value] ---------------AggregateExec: mode=Final, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] -----------------CoalescePartitionsExec -------------------AggregateExec: mode=Partial, gby=[], aggr=[SUM(partsupp.ps_supplycost * partsupp.ps_availqty)] ---------------------ProjectionExec: expr=[ps_availqty@0 as ps_availqty, 
ps_supplycost@1 as ps_supplycost] -----------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "s_nationkey", index: 2 }, Column { name: "n_nationkey", index: 0 })] ---------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------RepartitionExec: partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 4), input_partitions=4 -------------------------------ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@4 as s_nationkey] ---------------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_suppkey", index: 0 }, Column { name: "s_suppkey", index: 0 })] -------------------------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------------------------RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 4), input_partitions=4 -----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/partsupp.tbl]]}, projection=[ps_suppkey, ps_availqty, ps_supplycost], has_header=false -------------------------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------------------------RepartitionExec: partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 4), input_partitions=4 -----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], has_header=false 
---------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 4), input_partitions=4 -------------------------------ProjectionExec: expr=[n_nationkey@0 as n_nationkey] ---------------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------------FilterExec: n_name@1 = GERMANY -------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ---------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false +----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/supplier.tbl]]}, projection=[s_suppkey, s_nationkey], has_header=false +------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------RepartitionExec: partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 4), input_partitions=4 +----------------------------ProjectionExec: expr=[n_nationkey@0 as n_nationkey] +------------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------------FilterExec: n_name@1 = GERMANY +----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/nation.tbl]]}, projection=[n_nationkey, n_name], has_header=false diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch/q15.slt.part b/datafusion/core/tests/sqllogictests/test_files/tpch/q15.slt.part index f7e428dcfb9d6..0406b4f70f7a0 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch/q15.slt.part +++ 
b/datafusion/core/tests/sqllogictests/test_files/tpch/q15.slt.part @@ -52,7 +52,7 @@ order by logical_plan Sort: supplier.s_suppkey ASC NULLS LAST --Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, total_revenue -----Inner Join: total_revenue = __scalar_sq_3.__value +----Inner Join: total_revenue = __scalar_sq_3.MAX(total_revenue) ------Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address, supplier.s_phone, total_revenue --------Inner Join: supplier.s_suppkey = supplier_no ----------TableScan: supplier projection=[s_suppkey, s_name, s_address, s_phone] @@ -63,21 +63,20 @@ Sort: supplier.s_suppkey ASC NULLS LAST ------------------Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") --------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("9496"), lineitem.l_shipdate < Date32("9587")] ------SubqueryAlias: __scalar_sq_3 ---------Projection: MAX(total_revenue) AS __value -----------Aggregate: groupBy=[[]], aggr=[[MAX(total_revenue)]] -------------Projection: revenue0.SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue ---------------SubqueryAlias: revenue0 -----------------Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) -------------------Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] ---------------------Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount -----------------------Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") -------------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("9496"), lineitem.l_shipdate < 
Date32("9587")] +--------Aggregate: groupBy=[[]], aggr=[[MAX(total_revenue)]] +----------Projection: revenue0.SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS total_revenue +------------SubqueryAlias: revenue0 +--------------Projection: SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) +----------------Aggregate: groupBy=[[lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] +------------------Projection: lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount +--------------------Filter: lineitem.l_shipdate >= Date32("9496") AND lineitem.l_shipdate < Date32("9587") +----------------------TableScan: lineitem projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("9496"), lineitem.l_shipdate < Date32("9587")] physical_plan SortPreservingMergeExec: [s_suppkey@0 ASC NULLS LAST] --SortExec: expr=[s_suppkey@0 ASC NULLS LAST] ----ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address, s_phone@3 as s_phone, total_revenue@4 as total_revenue] ------CoalesceBatchesExec: target_batch_size=8192 ---------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "total_revenue", index: 4 }, Column { name: "__value", index: 0 })] +--------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "total_revenue", index: 4 }, Column { name: "MAX(total_revenue)", index: 0 })] ----------CoalesceBatchesExec: target_batch_size=8192 ------------RepartitionExec: partitioning=Hash([Column { name: "total_revenue", index: 4 }], 4), input_partitions=4 --------------ProjectionExec: expr=[s_suppkey@0 as s_suppkey, s_name@1 as s_name, s_address@2 as s_address, s_phone@3 as s_phone, total_revenue@5 as total_revenue] @@ -98,23 +97,21 @@ SortPreservingMergeExec: [s_suppkey@0 ASC NULLS LAST] 
------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 --------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/lineitem.tbl]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false ----------CoalesceBatchesExec: target_batch_size=8192 -------------RepartitionExec: partitioning=Hash([Column { name: "__value", index: 0 }], 4), input_partitions=1 ---------------ProjectionExec: expr=[MAX(total_revenue)@0 as __value] -----------------AggregateExec: mode=Final, gby=[], aggr=[MAX(total_revenue)] -------------------CoalescePartitionsExec ---------------------AggregateExec: mode=Partial, gby=[], aggr=[MAX(total_revenue)] -----------------------ProjectionExec: expr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@0 as total_revenue] -------------------------ProjectionExec: expr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] ---------------------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -----------------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------------RepartitionExec: partitioning=Hash([Column { name: "l_suppkey", index: 0 }], 4), input_partitions=4 ---------------------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -----------------------------------ProjectionExec: expr=[l_suppkey@0 as l_suppkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] -------------------------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------------------------FilterExec: l_shipdate@3 >= 9496 AND l_shipdate@3 < 9587 
-----------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -------------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/lineitem.tbl]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false - +------------RepartitionExec: partitioning=Hash([Column { name: "MAX(total_revenue)", index: 0 }], 4), input_partitions=1 +--------------AggregateExec: mode=Final, gby=[], aggr=[MAX(total_revenue)] +----------------CoalescePartitionsExec +------------------AggregateExec: mode=Partial, gby=[], aggr=[MAX(total_revenue)] +--------------------ProjectionExec: expr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@0 as total_revenue] +----------------------ProjectionExec: expr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +------------------------AggregateExec: mode=FinalPartitioned, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +--------------------------CoalesceBatchesExec: target_batch_size=8192 +----------------------------RepartitionExec: partitioning=Hash([Column { name: "l_suppkey", index: 0 }], 4), input_partitions=4 +------------------------------AggregateExec: mode=Partial, gby=[l_suppkey@0 as l_suppkey], aggr=[SUM(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +--------------------------------ProjectionExec: expr=[l_suppkey@0 as l_suppkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] +----------------------------------CoalesceBatchesExec: target_batch_size=8192 +------------------------------------FilterExec: l_shipdate@3 >= 9496 AND l_shipdate@3 < 9587 +--------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------------------------------------CsvExec: 
file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/lineitem.tbl]]}, projection=[l_suppkey, l_extendedprice, l_discount, l_shipdate], has_header=false query ITTTR diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch/q17.slt.part b/datafusion/core/tests/sqllogictests/test_files/tpch/q17.slt.part index 522d67811aac9..4f52711f2985d 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch/q17.slt.part +++ b/datafusion/core/tests/sqllogictests/test_files/tpch/q17.slt.part @@ -39,7 +39,7 @@ logical_plan Projection: CAST(SUM(lineitem.l_extendedprice) AS Float64) / Float64(7) AS avg_yearly --Aggregate: groupBy=[[]], aggr=[[SUM(lineitem.l_extendedprice)]] ----Projection: lineitem.l_extendedprice -------Inner Join: part.p_partkey = __scalar_sq_5.l_partkey Filter: CAST(lineitem.l_quantity AS Decimal128(30, 15)) < __scalar_sq_5.__value +------Inner Join: part.p_partkey = __scalar_sq_5.l_partkey Filter: CAST(lineitem.l_quantity AS Decimal128(30, 15)) < __scalar_sq_5.Float64(0.2) * AVG(lineitem.l_quantity) --------Projection: lineitem.l_quantity, lineitem.l_extendedprice, part.p_partkey ----------Inner Join: lineitem.l_partkey = part.p_partkey ------------TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice] @@ -47,7 +47,7 @@ Projection: CAST(SUM(lineitem.l_extendedprice) AS Float64) / Float64(7) AS avg_y --------------Filter: part.p_brand = Utf8("Brand#23") AND part.p_container = Utf8("MED BOX") ----------------TableScan: part projection=[p_partkey, p_brand, p_container], partial_filters=[part.p_brand = Utf8("Brand#23"), part.p_container = Utf8("MED BOX")] --------SubqueryAlias: __scalar_sq_5 -----------Projection: lineitem.l_partkey, CAST(Float64(0.2) * CAST(AVG(lineitem.l_quantity) AS Float64) AS Decimal128(30, 15)) AS __value +----------Projection: CAST(Float64(0.2) * CAST(AVG(lineitem.l_quantity) AS Float64) AS Decimal128(30, 15)), lineitem.l_partkey ------------Aggregate: 
groupBy=[[lineitem.l_partkey]], aggr=[[AVG(lineitem.l_quantity)]] --------------TableScan: lineitem projection=[l_partkey, l_quantity] physical_plan @@ -57,7 +57,7 @@ ProjectionExec: expr=[CAST(SUM(lineitem.l_extendedprice)@0 AS Float64) / 7 as av ------AggregateExec: mode=Partial, gby=[], aggr=[SUM(lineitem.l_extendedprice)] --------ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice] ----------CoalesceBatchesExec: target_batch_size=8192 -------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 2 }, Column { name: "l_partkey", index: 0 })], filter=CAST(l_quantity@0 AS Decimal128(30, 15)) < __value@1 +------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 2 }, Column { name: "l_partkey", index: 1 })], filter=CAST(l_quantity@0 AS Decimal128(30, 15)) < Float64(0.2) * AVG(lineitem.l_quantity)@1 --------------CoalesceBatchesExec: target_batch_size=8192 ----------------RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 2 }], 4), input_partitions=4 ------------------ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, p_partkey@3 as p_partkey] @@ -74,7 +74,7 @@ ProjectionExec: expr=[CAST(SUM(lineitem.l_extendedprice)@0 AS Float64) / 7 as av --------------------------------FilterExec: p_brand@1 = Brand#23 AND p_container@2 = MED BOX ----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_brand, p_container], has_header=false ---------------ProjectionExec: expr=[l_partkey@0 as l_partkey, CAST(0.2 * CAST(AVG(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as __value] +--------------ProjectionExec: expr=[CAST(0.2 * CAST(AVG(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as 
Float64(0.2) * AVG(lineitem.l_quantity), l_partkey@0 as l_partkey] ----------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[AVG(lineitem.l_quantity)] ------------------CoalesceBatchesExec: target_batch_size=8192 --------------------RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }], 4), input_partitions=4 diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch/q2.slt.part b/datafusion/core/tests/sqllogictests/test_files/tpch/q2.slt.part index fe125c2b3b0cc..8203642869c07 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch/q2.slt.part +++ b/datafusion/core/tests/sqllogictests/test_files/tpch/q2.slt.part @@ -66,7 +66,7 @@ logical_plan Limit: skip=0, fetch=10 --Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST, fetch=10 ----Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment -------Inner Join: part.p_partkey = __scalar_sq_7.ps_partkey, partsupp.ps_supplycost = __scalar_sq_7.__value +------Inner Join: part.p_partkey = __scalar_sq_7.ps_partkey, partsupp.ps_supplycost = __scalar_sq_7.MIN(partsupp.ps_supplycost) --------Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name ----------Inner Join: nation.n_regionkey = region.r_regionkey ------------Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name, nation.n_regionkey @@ -85,7 +85,7 @@ Limit: skip=0, fetch=10 --------------Filter: region.r_name = Utf8("EUROPE") ----------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("EUROPE")] --------SubqueryAlias: __scalar_sq_7 
-----------Projection: partsupp.ps_partkey, MIN(partsupp.ps_supplycost) AS __value +----------Projection: MIN(partsupp.ps_supplycost), partsupp.ps_partkey ------------Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[MIN(partsupp.ps_supplycost)]] --------------Projection: partsupp.ps_partkey, partsupp.ps_supplycost ----------------Inner Join: nation.n_regionkey = region.r_regionkey @@ -105,7 +105,7 @@ GlobalLimitExec: skip=0, fetch=10 ----SortExec: fetch=10, expr=[s_acctbal@0 DESC,n_name@2 ASC NULLS LAST,s_name@1 ASC NULLS LAST,p_partkey@3 ASC NULLS LAST] ------ProjectionExec: expr=[s_acctbal@5 as s_acctbal, s_name@2 as s_name, n_name@8 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment] --------CoalesceBatchesExec: target_batch_size=8192 -----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 0 }, Column { name: "ps_partkey", index: 0 }), (Column { name: "ps_supplycost", index: 7 }, Column { name: "__value", index: 1 })] +----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "p_partkey", index: 0 }, Column { name: "ps_partkey", index: 1 }), (Column { name: "ps_supplycost", index: 7 }, Column { name: "MIN(partsupp.ps_supplycost)", index: 0 })] ------------CoalesceBatchesExec: target_batch_size=8192 --------------RepartitionExec: partitioning=Hash([Column { name: "p_partkey", index: 0 }, Column { name: "ps_supplycost", index: 7 }], 4), input_partitions=4 ----------------ProjectionExec: expr=[p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_name@2 as s_name, s_address@3 as s_address, s_phone@4 as s_phone, s_acctbal@5 as s_acctbal, s_comment@6 as s_comment, ps_supplycost@7 as ps_supplycost, n_name@8 as n_name] @@ -153,8 +153,8 @@ GlobalLimitExec: skip=0, fetch=10 --------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ----------------------------------CsvExec: file_groups={1 
group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/region.tbl]]}, projection=[r_regionkey, r_name], has_header=false ------------CoalesceBatchesExec: target_batch_size=8192 ---------------RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "__value", index: 1 }], 4), input_partitions=4 -----------------ProjectionExec: expr=[ps_partkey@0 as ps_partkey, MIN(partsupp.ps_supplycost)@1 as __value] +--------------RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 1 }, Column { name: "MIN(partsupp.ps_supplycost)", index: 0 }], 4), input_partitions=4 +----------------ProjectionExec: expr=[MIN(partsupp.ps_supplycost)@1 as MIN(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] ------------------AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[MIN(partsupp.ps_supplycost)] --------------------CoalesceBatchesExec: target_batch_size=8192 ----------------------RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 4), input_partitions=4 diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch/q20.slt.part b/datafusion/core/tests/sqllogictests/test_files/tpch/q20.slt.part index f6d343d4db30d..8e2af2c340d20 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch/q20.slt.part +++ b/datafusion/core/tests/sqllogictests/test_files/tpch/q20.slt.part @@ -67,7 +67,7 @@ Sort: supplier.s_name ASC NULLS LAST --------------TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8("CANADA")] ------SubqueryAlias: __correlated_sq_5 --------Projection: partsupp.ps_suppkey -----------Inner Join: partsupp.ps_partkey = __scalar_sq_9.l_partkey, partsupp.ps_suppkey = __scalar_sq_9.l_suppkey Filter: CAST(partsupp.ps_availqty AS Float64) > __scalar_sq_9.__value +----------Inner Join: partsupp.ps_partkey = __scalar_sq_9.l_partkey, partsupp.ps_suppkey = __scalar_sq_9.l_suppkey Filter: CAST(partsupp.ps_availqty AS 
Float64) > __scalar_sq_9.Float64(0.5) * SUM(lineitem.l_quantity) ------------LeftSemi Join: partsupp.ps_partkey = __correlated_sq_6.p_partkey --------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty] --------------SubqueryAlias: __correlated_sq_6 @@ -75,7 +75,7 @@ Sort: supplier.s_name ASC NULLS LAST ------------------Filter: part.p_name LIKE Utf8("forest%") --------------------TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8("forest%")] ------------SubqueryAlias: __scalar_sq_9 ---------------Projection: lineitem.l_partkey, lineitem.l_suppkey, Float64(0.5) * CAST(SUM(lineitem.l_quantity) AS Float64) AS __value +--------------Projection: Float64(0.5) * CAST(SUM(lineitem.l_quantity) AS Float64), lineitem.l_partkey, lineitem.l_suppkey ----------------Aggregate: groupBy=[[lineitem.l_partkey, lineitem.l_suppkey]], aggr=[[SUM(lineitem.l_quantity)]] ------------------Projection: lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity --------------------Filter: lineitem.l_shipdate >= Date32("8766") AND lineitem.l_shipdate < Date32("9131") @@ -106,7 +106,7 @@ SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] ------------RepartitionExec: partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 4), input_partitions=4 --------------ProjectionExec: expr=[ps_suppkey@1 as ps_suppkey] ----------------CoalesceBatchesExec: target_batch_size=8192 -------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_partkey", index: 0 }, Column { name: "l_partkey", index: 0 }), (Column { name: "ps_suppkey", index: 1 }, Column { name: "l_suppkey", index: 1 })], filter=CAST(ps_availqty@0 AS Float64) > __value@1 +------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "ps_partkey", index: 0 }, Column { name: "l_partkey", index: 1 }), (Column { name: "ps_suppkey", index: 1 }, Column { name: "l_suppkey", index: 2 })], filter=CAST(ps_availqty@0 AS Float64) > 
Float64(0.5) * SUM(lineitem.l_quantity)@1 --------------------CoalesceBatchesExec: target_batch_size=8192 ----------------------RepartitionExec: partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "ps_suppkey", index: 1 }], 4), input_partitions=4 ------------------------CoalesceBatchesExec: target_batch_size=8192 @@ -122,7 +122,7 @@ SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] ------------------------------------FilterExec: p_name@1 LIKE forest% --------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_name], has_header=false ---------------------ProjectionExec: expr=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey, 0.5 * CAST(SUM(lineitem.l_quantity)@2 AS Float64) as __value] +--------------------ProjectionExec: expr=[0.5 * CAST(SUM(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * SUM(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey] ----------------------AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[SUM(lineitem.l_quantity)] ------------------------CoalesceBatchesExec: target_batch_size=8192 --------------------------RepartitionExec: partitioning=Hash([Column { name: "l_partkey", index: 0 }, Column { name: "l_suppkey", index: 1 }], 4), input_partitions=4 diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch/q22.slt.part b/datafusion/core/tests/sqllogictests/test_files/tpch/q22.slt.part index 9c7dd85ccd82f..9f8b651f5386b 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch/q22.slt.part +++ b/datafusion/core/tests/sqllogictests/test_files/tpch/q22.slt.part @@ -61,56 +61,49 @@ Sort: custsale.cntrycode ASC NULLS LAST ----Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[COUNT(UInt8(1)), 
SUM(custsale.c_acctbal)]] ------SubqueryAlias: custsale --------Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal -----------Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __scalar_sq_11.__value -------------CrossJoin: ---------------Projection: customer.c_phone, customer.c_acctbal -----------------LeftAnti Join: customer.c_custkey = __correlated_sq_13.o_custkey -------------------Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) ---------------------TableScan: customer projection=[c_custkey, c_phone, c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])] -------------------SubqueryAlias: __correlated_sq_13 ---------------------TableScan: orders projection=[o_custkey] ---------------SubqueryAlias: __scalar_sq_11 -----------------Projection: AVG(customer.c_acctbal) AS __value -------------------Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] ---------------------Projection: customer.c_acctbal -----------------------Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) -------------------------TableScan: customer projection=[c_phone, c_acctbal], partial_filters=[customer.c_acctbal > Decimal128(Some(0),15,2) AS customer.c_acctbal > Decimal128(Some(0),30,15), substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]), customer.c_acctbal > Decimal128(Some(0),15,2)] +----------Inner Join: Filter: CAST(customer.c_acctbal AS Decimal128(19, 6)) > __scalar_sq_11.AVG(customer.c_acctbal) +------------Projection: customer.c_phone, customer.c_acctbal +--------------LeftAnti Join: customer.c_custkey = 
__correlated_sq_13.o_custkey +----------------Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) +------------------TableScan: customer projection=[c_custkey, c_phone, c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")])] +----------------SubqueryAlias: __correlated_sq_13 +------------------TableScan: orders projection=[o_custkey] +------------SubqueryAlias: __scalar_sq_11 +--------------Aggregate: groupBy=[[]], aggr=[[AVG(customer.c_acctbal)]] +----------------Projection: customer.c_acctbal +------------------Filter: customer.c_acctbal > Decimal128(Some(0),15,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]) +--------------------TableScan: customer projection=[c_phone, c_acctbal], partial_filters=[customer.c_acctbal > Decimal128(Some(0),15,2) AS customer.c_acctbal > Decimal128(Some(0),30,15), substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8("13"), Utf8("31"), Utf8("23"), Utf8("29"), Utf8("30"), Utf8("18"), Utf8("17")]), customer.c_acctbal > Decimal128(Some(0),15,2)] physical_plan SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] --SortExec: expr=[cntrycode@0 ASC NULLS LAST] ----ProjectionExec: expr=[cntrycode@0 as cntrycode, COUNT(UInt8(1))@1 as numcust, SUM(custsale.c_acctbal)@2 as totacctbal] ------AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] --------CoalesceBatchesExec: target_batch_size=8192 -----------RepartitionExec: partitioning=Hash([Column { name: "cntrycode", index: 0 }], 4), input_partitions=1 +----------RepartitionExec: partitioning=Hash([Column { name: "cntrycode", index: 0 }], 4), input_partitions=4 ------------AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], 
aggr=[COUNT(UInt8(1)), SUM(custsale.c_acctbal)] --------------ProjectionExec: expr=[substr(c_phone@0, 1, 2) as cntrycode, c_acctbal@1 as c_acctbal] -----------------CoalesceBatchesExec: target_batch_size=8192 -------------------FilterExec: CAST(c_acctbal@1 AS Decimal128(19, 6)) > __value@2 ---------------------CrossJoinExec -----------------------CoalescePartitionsExec -------------------------ProjectionExec: expr=[c_phone@1 as c_phone, c_acctbal@2 as c_acctbal] +----------------NestedLoopJoinExec: join_type=Inner, filter=BinaryExpr { left: CastExpr { expr: Column { name: "c_acctbal", index: 0 }, cast_type: Decimal128(19, 6), cast_options: CastOptions { safe: false, format_options: FormatOptions { safe: true, null: "", date_format: None, datetime_format: None, timestamp_format: None, timestamp_tz_format: None, time_format: None } } }, op: Gt, right: Column { name: "AVG(customer.c_acctbal)", index: 1 } } +------------------ProjectionExec: expr=[c_phone@1 as c_phone, c_acctbal@2 as c_acctbal] +--------------------CoalesceBatchesExec: target_batch_size=8192 +----------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(Column { name: "c_custkey", index: 0 }, Column { name: "o_custkey", index: 0 })] +------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------RepartitionExec: partitioning=Hash([Column { name: "c_custkey", index: 0 }], 4), input_partitions=4 +----------------------------CoalesceBatchesExec: target_batch_size=8192 +------------------------------FilterExec: Use substr(c_phone@1, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) +--------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +----------------------------------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], has_header=false +------------------------CoalesceBatchesExec: target_batch_size=8192 +--------------------------RepartitionExec: partitioning=Hash([Column { name: "o_custkey", index: 0 }], 4), input_partitions=4 +----------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/orders.tbl]]}, projection=[o_custkey], has_header=false +------------------AggregateExec: mode=Final, gby=[], aggr=[AVG(customer.c_acctbal)] +--------------------CoalescePartitionsExec +----------------------AggregateExec: mode=Partial, gby=[], aggr=[AVG(customer.c_acctbal)] +------------------------ProjectionExec: expr=[c_acctbal@1 as c_acctbal] --------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(Column { name: "c_custkey", index: 0 }, Column { name: "o_custkey", index: 0 })] -------------------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------------------RepartitionExec: partitioning=Hash([Column { name: "c_custkey", index: 0 }], 4), input_partitions=4 -----------------------------------CoalesceBatchesExec: target_batch_size=8192 -------------------------------------FilterExec: Use substr(c_phone@1, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) ---------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -----------------------------------------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/customer.tbl]]}, projection=[c_custkey, c_phone, c_acctbal], has_header=false -------------------------------CoalesceBatchesExec: target_batch_size=8192 ---------------------------------RepartitionExec: partitioning=Hash([Column { name: "o_custkey", index: 0 }], 4), input_partitions=4 -----------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/orders.tbl]]}, projection=[o_custkey], has_header=false -----------------------ProjectionExec: expr=[AVG(customer.c_acctbal)@0 as __value] -------------------------AggregateExec: mode=Final, gby=[], aggr=[AVG(customer.c_acctbal)] ---------------------------CoalescePartitionsExec -----------------------------AggregateExec: mode=Partial, gby=[], aggr=[AVG(customer.c_acctbal)] -------------------------------ProjectionExec: expr=[c_acctbal@1 as c_acctbal] ---------------------------------CoalesceBatchesExec: target_batch_size=8192 -----------------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) -------------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 ---------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], has_header=false - +----------------------------FilterExec: c_acctbal@1 > Some(0),15,2 AND Use substr(c_phone@0, 1, 2) IN (SET) ([Literal { value: Utf8("13") }, Literal { value: Utf8("31") }, Literal { value: Utf8("23") }, 
Literal { value: Utf8("29") }, Literal { value: Utf8("30") }, Literal { value: Utf8("18") }, Literal { value: Utf8("17") }]) +------------------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/sqllogictests/test_files/tpch/data/customer.tbl]]}, projection=[c_phone, c_acctbal], has_header=false query TIR From c549b6a1e5119497c3503baf8dcf916a8e92b303 Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Thu, 1 Jun 2023 18:58:24 +0800 Subject: [PATCH 11/13] Avoid unnecessary alias in the InSubquery rewriting --- datafusion/core/tests/sql/joins.rs | 48 +++++++++---------- datafusion/expr/src/expr_rewriter/mod.rs | 17 +++++++ datafusion/expr/src/logical_plan/plan.rs | 17 +++---- datafusion/optimizer/src/decorrelate.rs | 23 +-------- .../src/decorrelate_predicate_subquery.rs | 44 ++++++++--------- .../optimizer/src/scalar_subquery_to_join.rs | 17 ++----- 6 files changed, 77 insertions(+), 89 deletions(-) diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 1ab28d683f45e..3f4da9f658aae 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -2027,10 +2027,10 @@ async fn subquery_to_join_with_both_side_expr() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.t2.t2_id + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + 
Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " SubqueryAlias: __correlated_sq_1 [t2.t2_id + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) [t2.t2_id + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; @@ -2071,10 +2071,10 @@ async fn subquery_to_join_with_muti_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.t2.t2_id + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N]", + " SubqueryAlias: __correlated_sq_1 [t2.t2_id + Int64(1):Int64;N, t2_int:UInt32;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1), t2.t2_int [t2.t2_id + Int64(1):Int64;N, t2_int:UInt32;N]", " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_int:UInt32;N]", " TableScan: t2 projection=[t2_id, t2_int] [t2_id:UInt32;N, t2_int:UInt32;N]", ]; @@ -2115,10 +2115,10 @@ async fn three_projection_exprs_subquery_to_join() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = 
__correlated_sq_1.t2.t2_id + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " SubqueryAlias: __correlated_sq_1 [t2.t2_id + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [t2.t2_id + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", " Filter: t2.t2_int > UInt32(0) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; @@ -2158,11 +2158,11 @@ async fn in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { let plan = dataframe.into_optimized_plan().unwrap(); let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.t2.t2_id + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " SubqueryAlias: 
__correlated_sq_1 [t2.t2_id + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) [t2.t2_id + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; @@ -2188,10 +2188,10 @@ async fn not_in_subquery_to_join_with_correlated_outer_filter() -> Result<()> { let plan = dataframe.into_optimized_plan().unwrap(); let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftAnti Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftAnti Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.t2.t2_id + Int64(1) Filter: t1.t1_int > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " SubqueryAlias: __correlated_sq_1 [t2.t2_id + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) [t2.t2_id + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", ]; @@ -2218,11 +2218,11 @@ async fn in_subquery_to_join_with_outer_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.t2.t2_id + Int64(1) Filter: t1.t1_int <= __correlated_sq_1.t2_int AND t1.t1_name != __correlated_sq_1.t2_name [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " Filter: t1.t1_id > UInt32(0) 
[t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [CAST(t2_id AS Int64) + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " SubqueryAlias: __correlated_sq_1 [t2.t2_id + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1), t2.t2_int, t2.t2_name [t2.t2_id + Int64(1):Int64;N, t2_int:UInt32;N, t2_name:Utf8;N]", " TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]", ]; @@ -2264,15 +2264,15 @@ async fn two_in_subquery_to_join_with_outer_filter() -> Result<()> { let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " LeftSemi Join: CAST(t1.t1_int AS Int64) = __correlated_sq_2.CAST(t2_int AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.CAST(t2_id AS Int64) + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_int AS Int64) = __correlated_sq_2.t2.t2_int + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", + " LeftSemi Join: CAST(t1.t1_id AS Int64) + Int64(12) = __correlated_sq_1.t2.t2_id + Int64(1) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " Filter: t1.t1_id > UInt32(0) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", " TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]", - " SubqueryAlias: __correlated_sq_1 [CAST(t2_id AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_id AS Int64) + Int64(1) AS t2.t2_id + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) [CAST(t2_id AS Int64) + Int64(1):Int64;N]", + " 
SubqueryAlias: __correlated_sq_1 [t2.t2_id + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_id AS Int64) + Int64(1) [t2.t2_id + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_id] [t2_id:UInt32;N]", - " SubqueryAlias: __correlated_sq_2 [CAST(t2_int AS Int64) + Int64(1):Int64;N]", - " Projection: CAST(t2.t2_int AS Int64) + Int64(1) AS t2.t2_int + Int64(1) AS CAST(t2_int AS Int64) + Int64(1) [CAST(t2_int AS Int64) + Int64(1):Int64;N]", + " SubqueryAlias: __correlated_sq_2 [t2.t2_int + Int64(1):Int64;N]", + " Projection: CAST(t2.t2_int AS Int64) + Int64(1) [t2.t2_int + Int64(1):Int64;N]", " TableScan: t2 projection=[t2_int] [t2_int:UInt32;N]", ]; diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 63b5d3ed67308..6a47d951617c4 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -137,6 +137,23 @@ pub fn unnormalize_col(expr: Expr) -> Expr { .expect("Unnormalize is infallable") } +/// Create a Column from the Scalar Expr +pub fn create_col_from_scalar_expr( + scalar_expr: &Expr, + subqry_alias: String, +) -> Result { + match scalar_expr { + Expr::Alias(_, alias) => Ok(Column::new(Some(subqry_alias), alias)), + Expr::Column(Column { relation: _, name }) => { + Ok(Column::new(Some(subqry_alias), name)) + } + _ => { + let scalar_column = scalar_expr.display_name()?; + Ok(Column::new(Some(subqry_alias), scalar_column)) + } + } +} + /// Recursively un-normalize all [`Column`] expressions in a list of expression trees #[inline] pub fn unnormalize_cols(exprs: impl IntoIterator) -> Vec { diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0f00b2bbc990c..e53f5fa250e47 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -17,7 +17,7 @@ use crate::expr::InSubquery; use crate::expr::{Exists, Placeholder}; -use crate::expr_rewriter::unnormalize_col; +use 
crate::expr_rewriter::create_col_from_scalar_expr; ///! Logical plan types use crate::logical_plan::display::{GraphvizVisitor, IndentVisitor}; use crate::logical_plan::extension::UserDefinedLogicalNode; @@ -453,13 +453,14 @@ impl LogicalPlan { ))), LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; - Ok(expr_opt.map(|expr| { - let col_name = format!("{:?}", unnormalize_col(expr)); - Expr::Column(Column::new( - Some(subquery_alias.alias.clone()), - col_name, - )) - })) + expr_opt + .map(|expr| { + Ok(Expr::Column(create_col_from_scalar_expr( + &expr, + subquery_alias.alias.to_string(), + )?)) + }) + .map_or(Ok(None), |v| v.map(Some)) } LogicalPlan::Subquery(_) => Ok(None), LogicalPlan::EmptyRelation(_) diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index db4a1f2c409a2..fa96ede3c8e47 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -24,10 +24,7 @@ use datafusion_common::tree_node::{ }; use datafusion_common::Result; use datafusion_common::{Column, DFSchemaRef, DataFusionError, ScalarValue}; -use datafusion_expr::expr_rewriter::unnormalize_col; -use datafusion_expr::{ - expr, BinaryExpr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder, Operator, -}; +use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; use datafusion_physical_expr::execution_props::ExecutionProps; use std::collections::{BTreeSet, HashMap}; use std::ops::Deref; @@ -309,23 +306,6 @@ impl PullUpCorrelatedExpr { correlated_subquery_cols: &BTreeSet, ) -> Result> { let mut missing_exprs = vec![]; - if let Some(Expr::BinaryExpr(BinaryExpr { - left: _, - op: Operator::Eq, - right, - })) = &self.in_predicate_opt - { - if !matches!(right.deref(), Expr::Column(_)) - && !matches!(right.deref(), Expr::Literal(_)) - && !matches!(right.deref(), Expr::Alias(_, _)) - { - let alias_expr = right - .deref() - .clone() - 
.alias(format!("{:?}", unnormalize_col(right.deref().clone()))); - missing_exprs.push(alias_expr) - } - } for expr in exprs { if !missing_exprs.contains(expr) { missing_exprs.push(expr.clone()) @@ -346,7 +326,6 @@ impl PullUpCorrelatedExpr { } } } - Ok(missing_exprs) } } diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 926e671926f87..80ceeb11e269c 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -23,7 +23,7 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::TreeNode; use datafusion_common::{Column, DataFusionError, Result}; use datafusion_expr::expr::{Exists, InSubquery}; -use datafusion_expr::expr_rewriter::unnormalize_col; +use datafusion_expr::expr_rewriter::create_col_from_scalar_expr; use datafusion_expr::logical_plan::{JoinType, Subquery}; use datafusion_expr::{ exists, in_subquery, not_exists, not_in_subquery, BinaryExpr, Expr, Filter, @@ -256,8 +256,7 @@ fn build_join( right, })), ) => { - let right_expr_name = format!("{:?}", unnormalize_col(right.deref().clone())); - let right_col = Column::new(Some(subquery_alias), right_expr_name); + let right_col = create_col_from_scalar_expr(right.deref(), subquery_alias)?; let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); Some(in_predicate.and(join_filter)) } @@ -270,8 +269,7 @@ fn build_join( right, })), ) => { - let right_expr_name = format!("{:?}", unnormalize_col(right.deref().clone())); - let right_col = Column::new(Some(subquery_alias), right_expr_name); + let right_col = create_col_from_scalar_expr(right.deref(), subquery_alias)?; let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); Some(in_predicate) } @@ -887,10 +885,10 @@ mod tests { .build()?; let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: 
customer.c_custkey = __correlated_sq_1.o_custkey + Int32(1) AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.orders.o_custkey + Int32(1) AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey + Int32(1):Int64, orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ - \n Projection: orders.o_custkey + Int32(1) AS o_custkey + Int32(1), orders.o_custkey + Int32(1), orders.o_custkey [o_custkey + Int32(1):Int64, orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n SubqueryAlias: __correlated_sq_1 [orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n Projection: orders.o_custkey + Int32(1), orders.o_custkey [orders.o_custkey + Int32(1):Int64, o_custkey:Int64]\ \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; assert_optimized_plan_eq_display_indent( @@ -1098,10 +1096,10 @@ mod tests { .build()?; let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.sq.c * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.c * UInt32(2) [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [sq.c * UInt32(2):UInt32]\ + \n Projection: sq.c * UInt32(2) [sq.c * UInt32(2):UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_eq_display_indent( @@ -1132,10 +1130,10 @@ mod tests { .build()?; let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = 
__correlated_sq_1.c * UInt32(2) AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.sq.c * UInt32(2) AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.c * UInt32(2), sq.a [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [sq.c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq.c * UInt32(2), sq.a [sq.c * UInt32(2):UInt32, a:UInt32]\ \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; @@ -1168,10 +1166,10 @@ mod tests { .build()?; let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a + test.b = __correlated_sq_1.a + __correlated_sq_1.b [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.sq.c * UInt32(2) AND test.a + test.b = __correlated_sq_1.a + __correlated_sq_1.b [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.c * UInt32(2), sq.a, sq.b [c * UInt32(2):UInt32, sq.c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [sq.c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ + \n Projection: sq.c * UInt32(2), sq.a, sq.b [sq.c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; @@ -1211,14 +1209,14 @@ mod tests { let expected = "Projection: test.b [b:UInt32]\ \n Filter: test.c > UInt32(1) [a:UInt32, b:UInt32, c:UInt32]\ - 
\n LeftSemi Join: Filter: test.c * UInt32(2) = __correlated_sq_2.c * UInt32(2) AND test.a > __correlated_sq_2.a [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a > __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c * UInt32(2) = __correlated_sq_2.sq2.c * UInt32(2) AND test.a > __correlated_sq_2.a [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.sq1.c * UInt32(2) AND test.a > __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, sq1.c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq1.c * UInt32(2) AS c * UInt32(2), sq1.c * UInt32(2), sq1.a [c * UInt32(2):UInt32, sq1.c * UInt32(2):UInt32, a:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [sq1.c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq1.c * UInt32(2), sq1.a [sq1.c * UInt32(2):UInt32, a:UInt32]\ \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_2 [c * UInt32(2):UInt32, sq2.c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq2.c * UInt32(2) AS c * UInt32(2), sq2.c * UInt32(2), sq2.a [c * UInt32(2):UInt32, sq2.c * UInt32(2):UInt32, a:UInt32]\ + \n SubqueryAlias: __correlated_sq_2 [sq2.c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq2.c * UInt32(2), sq2.a [sq2.c * UInt32(2):UInt32, a:UInt32]\ \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_eq_display_indent( diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 21e217249e444..fa06e1ddbfafb 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -24,6 +24,7 @@ use datafusion_common::tree_node::{ RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, }; use datafusion_common::{Column, 
DataFusionError, Result, ScalarValue}; +use datafusion_expr::expr_rewriter::create_col_from_scalar_expr; use datafusion_expr::logical_plan::{JoinType, Subquery}; use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; use std::collections::{BTreeSet, HashMap}; @@ -221,18 +222,10 @@ impl TreeNodeRewriter for ExtractScalarSubQuery { )), Ok, )?; - match scalar_expr { - Expr::Alias(_, alias) => { - Ok(Expr::Column(Column::new(Some(subqry_alias), alias))) - } - Expr::Column(Column { relation: _, name }) => { - Ok(Expr::Column(Column::new(Some(subqry_alias), name))) - } - _ => { - let scalar_column = scalar_expr.display_name()?; - Ok(Expr::Column(Column::new(Some(subqry_alias), scalar_column))) - } - } + Ok(Expr::Column(create_col_from_scalar_expr( + &scalar_expr, + subqry_alias, + )?)) } _ => Ok(expr), } From 7662a3af4ed7e2eca1f17ac90faa55c0f5eaefcf Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Tue, 13 Jun 2023 16:49:03 +0800 Subject: [PATCH 12/13] fix joins.slt --- .../tests/sqllogictests/test_files/joins.slt | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/joins.slt b/datafusion/core/tests/sqllogictests/test_files/joins.slt index 4486d7c47b491..c4b4e9ab68a12 100644 --- a/datafusion/core/tests/sqllogictests/test_files/joins.slt +++ b/datafusion/core/tests/sqllogictests/test_files/joins.slt @@ -1700,10 +1700,10 @@ from join_t1 where join_t1.t1_id + 12 in (select join_t2.t2_id + 1 from join_t2) ---- logical_plan -LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_5.CAST(t2_id AS Int64) + Int64(1) +LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_5.join_t2.t2_id + Int64(1) --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_5 -----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) +----Projection: CAST(join_t2.t2_id AS Int64) + 
Int64(1) ------TableScan: join_t2 projection=[t2_id] query ITI rowsort @@ -1729,10 +1729,10 @@ where join_t1.t1_id + 12 in ) ---- logical_plan -LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_7.CAST(t2_id AS Int64) + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_7.t2_int +LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_7.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_7.t2_int --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_7 -----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), join_t2.t2_int +----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int ------Filter: join_t2.t2_int > UInt32(0) --------TableScan: join_t2 projection=[t2_id, t2_int] @@ -1765,10 +1765,10 @@ where join_t1.t1_id + 12 in ) ---- logical_plan -LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_9.CAST(t2_id AS Int64) + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_9.t2_int AND join_t1.t1_name != __correlated_sq_9.t2_name +LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_9.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_9.t2_int AND join_t1.t1_name != __correlated_sq_9.t2_name --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_9 -----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name +----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name ------Filter: join_t2.t2_int > UInt32(0) --------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] @@ -1797,11 +1797,11 @@ where join_t1.t1_id + 12 in (select join_t2.t2_id + 1 from join_t2 where join_t1.t1_int > 0) ---- logical_plan -LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_11.CAST(t2_id AS Int64) + Int64(1) +LeftSemi Join: CAST(join_t1.t1_id 
AS Int64) + Int64(12) = __correlated_sq_11.join_t2.t2_id + Int64(1) --Filter: join_t1.t1_int > UInt32(0) ----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_11 -----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) +----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) ------TableScan: join_t2 projection=[t2_id] # Not in subquery to join with correlated outer filter @@ -1814,10 +1814,10 @@ where join_t1.t1_id + 12 not in (select join_t2.t2_id + 1 from join_t2 where join_t1.t1_int > 0) ---- logical_plan -LeftAnti Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_12.CAST(t2_id AS Int64) + Int64(1) Filter: join_t1.t1_int > UInt32(0) +LeftAnti Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_12.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int > UInt32(0) --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_12 -----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) +----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) ------TableScan: join_t2 projection=[t2_id] # In subquery to join with outer filter @@ -1836,11 +1836,11 @@ where join_t1.t1_id + 12 in and join_t1.t1_id > 0 ---- logical_plan -LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_13.CAST(t2_id AS Int64) + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_13.t2_int AND join_t1.t1_name != __correlated_sq_13.t2_name +LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_13.join_t2.t2_id + Int64(1) Filter: join_t1.t1_int <= __correlated_sq_13.t2_int AND join_t1.t1_name != __correlated_sq_13.t2_name --Filter: join_t1.t1_id > UInt32(0) ----TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_13 -----Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name +----Projection: CAST(join_t2.t2_id AS 
Int64) + Int64(1), join_t2.t2_int, join_t2.t2_name ------TableScan: join_t2 projection=[t2_id, t2_name, t2_int] query ITI rowsort @@ -1869,15 +1869,15 @@ where join_t1.t1_id + 12 in (select join_t2.t2_id + 1 from join_t2) and join_t1.t1_id > 0 ---- logical_plan -LeftSemi Join: CAST(join_t1.t1_int AS Int64) = __correlated_sq_16.CAST(t2_int AS Int64) + Int64(1) ---LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_15.CAST(t2_id AS Int64) + Int64(1) +LeftSemi Join: CAST(join_t1.t1_int AS Int64) = __correlated_sq_16.join_t2.t2_int + Int64(1) +--LeftSemi Join: CAST(join_t1.t1_id AS Int64) + Int64(12) = __correlated_sq_15.join_t2.t2_id + Int64(1) ----Filter: join_t1.t1_id > UInt32(0) ------TableScan: join_t1 projection=[t1_id, t1_name, t1_int] ----SubqueryAlias: __correlated_sq_15 -------Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) AS CAST(t2_id AS Int64) + Int64(1) +------Projection: CAST(join_t2.t2_id AS Int64) + Int64(1) --------TableScan: join_t2 projection=[t2_id] --SubqueryAlias: __correlated_sq_16 -----Projection: CAST(join_t2.t2_int AS Int64) + Int64(1) AS CAST(t2_int AS Int64) + Int64(1) +----Projection: CAST(join_t2.t2_int AS Int64) + Int64(1) ------TableScan: join_t2 projection=[t2_int] query ITI @@ -2197,8 +2197,10 @@ logical_plan LeftAnti Join: Filter: CAST(join_t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_31.t2_id AS Int64) * Int64(2) --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_31 -----Aggregate: groupBy=[[join_t2.t2_id]], aggr=[[]] -------TableScan: join_t2 projection=[t2_id] +----Projection: join_t2.t2_id +------Aggregate: groupBy=[[join_t2.t2_int, join_t2.t2_id]], aggr=[[]] +--------Projection: join_t2.t2_int, join_t2.t2_id +----------TableScan: join_t2 projection=[t2_id, t2_int] statement ok set datafusion.optimizer.repartition_joins = false; @@ -2244,8 +2246,10 @@ logical_plan LeftAnti Join: Filter: CAST(join_t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_34.t2_id 
AS Int64) * Int64(2) --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_34 -----Aggregate: groupBy=[[join_t2.t2_id]], aggr=[[]] -------TableScan: join_t2 projection=[t2_id] +----Projection: join_t2.t2_id +------Aggregate: groupBy=[[join_t2.t2_id + join_t2.t2_int, join_t2.t2_int, join_t2.t2_id]], aggr=[[]] +--------Projection: join_t2.t2_id + join_t2.t2_int, join_t2.t2_int, join_t2.t2_id +----------TableScan: join_t2 projection=[t2_id, t2_int] statement ok set datafusion.optimizer.repartition_joins = false; @@ -2293,8 +2297,10 @@ logical_plan LeftAnti Join: Filter: CAST(join_t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_37.t2_id AS Int64) * Int64(2) --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_37 -----Aggregate: groupBy=[[join_t2.t2_id]], aggr=[[]] -------TableScan: join_t2 projection=[t2_id] +----Projection: join_t2.t2_id +------Aggregate: groupBy=[[Int64(1), join_t2.t2_int, join_t2.t2_id]], aggr=[[]] +--------Projection: Int64(1), join_t2.t2_int, join_t2.t2_id +----------TableScan: join_t2 projection=[t2_id, t2_int] query ITI SELECT * FROM join_t1 @@ -2322,8 +2328,10 @@ logical_plan LeftAnti Join: Filter: CAST(join_t1.t1_id AS Int64) + Int64(1) > CAST(__correlated_sq_39.t2_id AS Int64) * Int64(2) --TableScan: join_t1 projection=[t1_id, t1_name, t1_int] --SubqueryAlias: __correlated_sq_39 -----Aggregate: groupBy=[[join_t2.t2_id]], aggr=[[]] -------TableScan: join_t2 projection=[t2_id] +----Projection: join_t2.t2_id +------Aggregate: groupBy=[[Int64(1), join_t2.t2_int, join_t2.t2_id]], aggr=[[]] +--------Projection: Int64(1), join_t2.t2_int, join_t2.t2_id +----------TableScan: join_t2 projection=[t2_id, t2_int] query ITI SELECT * FROM join_t1 @@ -2936,4 +2944,3 @@ set datafusion.execution.target_partitions = 2; statement ok set datafusion.execution.batch_size = 4096; - From 2112d2bd8dfcb2e476f5aa99fabf3db4812efb5f Mon Sep 17 00:00:00 2001 From: "mingmwang@ebay.com" Date: Tue, 
13 Jun 2023 18:10:06 +0800 Subject: [PATCH 13/13] fix fmt --- datafusion/expr/src/logical_plan/plan.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 4d1f2382ded6f..4f21ad38c4d2f 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -///! Logical plan types +//! Logical plan types + use crate::expr::InSubquery; use crate::expr::{Exists, Placeholder}; use crate::expr_rewriter::create_col_from_scalar_expr;