From d53a163ab0c815d115d3484d42decce8211a9413 Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Tue, 3 Dec 2024 17:49:32 +0800 Subject: [PATCH] Optimize DISTINCT, ORDER BY clause when Aggregation without Group By. For query which has Aggregation but without Group by clause, the DISTINCT/DISTINCT ON/ORDER BY clause could be removed as there would be one row returned at most. And there is no necessary to do unique or sort. This can simply the plan, and process less Aggref nodes during planner. select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); QUERY PLAN -------------------------------------------------------------------- Unique Output: (count(a)), (sum(b)), (count(c)), (count(b)) Group Key: (count(c)), (count(b)) -> Sort Output: (count(a)), (sum(b)), (count(c)), (count(b)) Sort Key: (count(t_distinct_sort.c)), (count(t_distinct_sort.b)) -> Finalize Aggregate Output: count(a), sum(b), count(c), count(b) -> Gather Motion 3:1 (slice1; segments: 3) Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(c)), (PARTIAL count(b)) -> Partial Aggregate Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(c), PARTIAL count(b) -> Seq Scan on public.t_distinct_sort Output: a, b, c After this commit: select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); QUERY PLAN -------------------------------------------------------- Finalize Aggregate Output: count(a), sum(b) -> Gather Motion 3:1 (slice1; segments: 3) Output: (PARTIAL count(a)), (PARTIAL sum(b)) -> Partial Aggregate Output: PARTIAL count(a), PARTIAL sum(b) -> Seq Scan on public.t_distinct_sort Output: a, b, c Optimizer: Postgres query optimizer Authored-by: Zhang Mingli avamingli@gmail.com --- src/backend/optimizer/plan/planner.c | 2 + src/backend/optimizer/plan/transform.c | 86 +++++++ src/include/optimizer/transform.h | 4 + src/test/regress/expected/aggregates.out | 46 ++-- .../regress/expected/aggregates_optimizer.out | 26 +-- src/test/regress/expected/cbdb_parallel.out | 20 +- .../regress/expected/incremental_sort.out | 36 ++- .../expected/incremental_sort_optimizer.out | 36 ++- .../regress/expected/offload_entry_to_qe.out | 24 +- src/test/regress/expected/select_distinct.out | 211 +++++++++++++++++ .../expected/select_distinct_optimizer.out | 213 ++++++++++++++++++ src/test/regress/sql/select_distinct.sql | 38 ++++ .../expected/aggregates.out | 46 ++-- .../expected/incremental_sort.out | 20 +- 14 files changed, 658 insertions(+), 150 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b6dd832ff53..39252c4bffb 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1339,6 +1339,8 @@ subquery_planner(PlannerGlobal *glob, Query *parse, if (hasResultRTEs) remove_useless_result_rtes(root); + parse = remove_distinct_sort_clause(parse); + /* * Do the main planning. */ diff --git a/src/backend/optimizer/plan/transform.c b/src/backend/optimizer/plan/transform.c index 1a76e24123b..1a8776c6ab7 100644 --- a/src/backend/optimizer/plan/transform.c +++ b/src/backend/optimizer/plan/transform.c @@ -20,6 +20,7 @@ #include "nodes/makefuncs.h" #include "optimizer/clauses.h" #include "optimizer/optimizer.h" +#include "optimizer/tlist.h" #include "optimizer/transform.h" #include "utils/lsyscache.h" #include "catalog/pg_proc.h" @@ -39,6 +40,7 @@ static SubLink *make_sirvf_subselect(FuncExpr *fe); static Query *make_sirvf_subquery(FuncExpr *fe); static bool safe_to_replace_sirvf_tle(Query *query); static bool safe_to_replace_sirvf_rte(Query *query); +static bool tlist_has_srf(Query *query); /** * Normalize query before planning. @@ -520,3 +522,87 @@ replace_sirvf_rte(Query *query, RangeTblEntry *rte) return rte; } + +/* + * Does target list have SRFs? + */ +static +bool tlist_has_srf(Query *query) +{ + if (query->hasTargetSRFs) + { + return true; + } + + if (expression_returns_set( (Node *) query->targetList)) + { + return true; + } + + return false; +} + +/* + * DISTINCT/DISTINCT ON/ORDER BY optimization. + * Remove DISTINCT clause if possibile, ex: + * select DISTINCT count(a) from t; to + * select count(a) from t; + * There is one row returned at most, DISTINCT and/or ON is pointless then. + * The same with ORDER BY clause; + */ +Query *remove_distinct_sort_clause(Query *parse) +{ + if (parse->hasAggs && + parse->groupClause == NIL && + !contain_mutable_functions((Node *) parse) && + !tlist_has_srf(parse)) + { + List *useless_tlist = NIL; + List *tles; + List *sortops; + List *eqops; + ListCell *lc; + + if (parse->distinctClause != NIL) + { + get_sortgroupclauses_tles(parse->distinctClause, parse->targetList, + &tles, &sortops, &eqops); + foreach(lc, tles) + { + TargetEntry *tle = lfirst(lc); + if (tle->resjunk) + useless_tlist = lappend(useless_tlist, tle); + } + parse->distinctClause = NIL; + if (parse->hasDistinctOn) + parse->hasDistinctOn = false; + } + + if (parse->sortClause != NIL) + { + + get_sortgroupclauses_tles(parse->sortClause, parse->targetList, + &tles, &sortops, &eqops); + foreach(lc, tles) + { + TargetEntry *tle = lfirst(lc); + /* + * For SELECT DISTINCT, ORDER BY expressions must appear in select list, + * Some tles may be already in the list. + */ + if (tle->resjunk) + useless_tlist = list_append_unique(useless_tlist, tle); + } + parse->sortClause = NIL; + } + + /* + * There is no groupClause, sortClause and distinctClause now . + * The junk TargetEntrys with ressortgroupref index are safe to be removed. + */ + if (useless_tlist != NIL) + parse->targetList = list_difference(parse->targetList, useless_tlist); + } + + return parse; +} diff --git a/src/include/optimizer/transform.h b/src/include/optimizer/transform.h index 85d32f35e0f..d53c393769c 100644 --- a/src/include/optimizer/transform.h +++ b/src/include/optimizer/transform.h @@ -21,4 +21,8 @@ extern Query *normalize_query(Query *query); +extern Query *remove_distinct_sort_clause(Query *query); + +extern bool query_has_srf(Query *query); + #endif /* TRANSFORM_H */ diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 62548a4c69c..b932b05dfb0 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1066,17 +1066,15 @@ explain (costs off) select distinct max(unique2) from tenk1; QUERY PLAN --------------------------------------------------------------------------- - HashAggregate - Group Key: $0 + Result InitPlan 1 (returns $0) (slice1) -> Limit -> Gather Motion 3:1 (slice2; segments: 3) Merge Key: tenk1.unique2 -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(10 rows) +(8 rows) select distinct max(unique2) from tenk1; max @@ -1088,17 +1086,15 @@ explain (costs off) select max(unique2) from tenk1 order by 1; QUERY PLAN --------------------------------------------------------------------------- - Sort - Sort Key: ($0) + Result InitPlan 1 (returns $0) (slice1) -> Limit -> Gather Motion 3:1 (slice2; segments: 3) Merge Key: tenk1.unique2 -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(10 rows) +(8 rows) select max(unique2) from tenk1 order by 1; max @@ -1110,17 +1106,15 @@ explain (costs off) select max(unique2) from tenk1 order by max(unique2); QUERY PLAN --------------------------------------------------------------------------- - Sort - Sort Key: ($0) + Result InitPlan 1 (returns $0) (slice1) -> Limit -> Gather Motion 3:1 (slice2; segments: 3) Merge Key: tenk1.unique2 -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(10 rows) +(8 rows) select max(unique2) from tenk1 order by max(unique2); max @@ -1132,17 +1126,15 @@ explain (costs off) select max(unique2) from tenk1 order by max(unique2)+1; QUERY PLAN --------------------------------------------------------------------------- - Sort - Sort Key: (($0 + 1)) + Result InitPlan 1 (returns $0) (slice1) -> Limit -> Gather Motion 3:1 (slice2; segments: 3) Merge Key: tenk1.unique2 -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(10 rows) +(8 rows) select max(unique2) from tenk1 order by max(unique2)+1; max @@ -1260,20 +1252,16 @@ explain (costs off) select distinct min(f1), max(f1) from minmaxtest; QUERY PLAN -------------------------------------------------------------- - Unique - Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1)) - -> Sort - Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1)) - -> Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) - -> Partial Aggregate - -> Append - -> Seq Scan on minmaxtest minmaxtest_1 - -> Seq Scan on minmaxtest1 minmaxtest_2 - -> Seq Scan on minmaxtest2 minmaxtest_3 - -> Seq Scan on minmaxtest3 minmaxtest_4 + Finalize Aggregate + -> Gather Motion 3:1 (slice1; segments: 3) + -> Partial Aggregate + -> Append + -> Seq Scan on minmaxtest minmaxtest_1 + -> Seq Scan on minmaxtest1 minmaxtest_2 + -> Seq Scan on minmaxtest2 minmaxtest_3 + -> Seq Scan on minmaxtest3 minmaxtest_4 Optimizer: Postgres query optimizer -(13 rows) +(9 rows) select distinct min(f1), max(f1) from minmaxtest; min | max diff --git a/src/test/regress/expected/aggregates_optimizer.out b/src/test/regress/expected/aggregates_optimizer.out index 54c9fbe2e8f..959aef0c0b2 100644 --- a/src/test/regress/expected/aggregates_optimizer.out +++ b/src/test/regress/expected/aggregates_optimizer.out @@ -1267,22 +1267,18 @@ explain (costs off) select distinct min(f1), max(f1) from minmaxtest; INFO: GPORCA failed to produce a plan, falling back to planner DETAIL: Feature not supported: Inherited tables - QUERY PLAN --------------------------------------------------------------------------- - Unique - Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1)) - -> Sort - Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1)) - -> Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) - -> Partial Aggregate - -> Append - -> Seq Scan on minmaxtest minmaxtest_1 - -> Seq Scan on minmaxtest1 minmaxtest_2 - -> Seq Scan on minmaxtest2 minmaxtest_3 - -> Seq Scan on minmaxtest3 minmaxtest_4 + QUERY PLAN +-------------------------------------------------------------- + Finalize Aggregate + -> Gather Motion 3:1 (slice1; segments: 3) + -> Partial Aggregate + -> Append + -> Seq Scan on minmaxtest minmaxtest_1 + -> Seq Scan on minmaxtest1 minmaxtest_2 + -> Seq Scan on minmaxtest2 minmaxtest_3 + -> Seq Scan on minmaxtest3 minmaxtest_4 Optimizer: Postgres query optimizer -(13 rows) +(9 rows) select distinct min(f1), max(f1) from minmaxtest; INFO: GPORCA failed to produce a plan, falling back to planner diff --git a/src/test/regress/expected/cbdb_parallel.out b/src/test/regress/expected/cbdb_parallel.out index 8935ce25f62..648259d3798 100644 --- a/src/test/regress/expected/cbdb_parallel.out +++ b/src/test/regress/expected/cbdb_parallel.out @@ -2407,22 +2407,16 @@ create table t1(c1 int) distributed by (c1); insert into t1 values(11), (12); analyze t1; explain(costs off, locus) select distinct min(c1), max(c1) from t1; - QUERY PLAN ------------------------------------------------------------- - Unique + QUERY PLAN +------------------------------------------------ + Aggregate Locus: Entry - Group Key: (min(c1)), (max(c1)) - -> Sort + -> Gather Motion 3:1 (slice1; segments: 3) Locus: Entry - Sort Key: (min(c1)), (max(c1)) - -> Aggregate - Locus: Entry - -> Gather Motion 3:1 (slice1; segments: 3) - Locus: Entry - -> Seq Scan on t1 - Locus: Hashed + -> Seq Scan on t1 + Locus: Hashed Optimizer: Postgres query optimizer -(13 rows) +(7 rows) abort; begin; diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index 8780b357aad..9bdd3486d2b 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1705,26 +1705,24 @@ from tenk1 t1 join tenk1 t2 on t1.unique1 = t2.unique2 join tenk1 t3 on t2.unique1 = t3.unique1 order by count(*); - QUERY PLAN -------------------------------------------------------------------------------------------------- - Sort - Sort Key: (count(*)) - -> Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) - -> Partial Aggregate - -> Hash Join - Hash Cond: (t2.unique2 = t1.unique1) - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: t2.unique2 - -> Hash Join - Hash Cond: (t2.unique1 = t3.unique1) - -> Index Scan using tenk1_unique2 on tenk1 t2 - -> Hash - -> Index Only Scan using tenk1_unique1 on tenk1 t3 - -> Hash - -> Index Only Scan using tenk1_unique1 on tenk1 t1 + QUERY PLAN +------------------------------------------------------------------------------------------- + Finalize Aggregate + -> Gather Motion 3:1 (slice1; segments: 3) + -> Partial Aggregate + -> Hash Join + Hash Cond: (t2.unique2 = t1.unique1) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: t2.unique2 + -> Hash Join + Hash Cond: (t2.unique1 = t3.unique1) + -> Index Scan using tenk1_unique2 on tenk1 t2 + -> Hash + -> Index Only Scan using tenk1_unique1 on tenk1 t3 + -> Hash + -> Index Only Scan using tenk1_unique1 on tenk1 t1 Optimizer: Postgres query optimizer -(17 rows) +(15 rows) -- Parallel sort but with expression (correlated subquery) that -- is prohibited in parallel plans. diff --git a/src/test/regress/expected/incremental_sort_optimizer.out b/src/test/regress/expected/incremental_sort_optimizer.out index 0c82c1e2482..a2387b3bf67 100644 --- a/src/test/regress/expected/incremental_sort_optimizer.out +++ b/src/test/regress/expected/incremental_sort_optimizer.out @@ -1555,26 +1555,24 @@ from tenk1 t1 join tenk1 t2 on t1.unique1 = t2.unique2 join tenk1 t3 on t2.unique1 = t3.unique1 order by count(*); - QUERY PLAN -------------------------------------------------------------------------------------------------- - Sort - Sort Key: (count(*)) - -> Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) - -> Partial Aggregate - -> Hash Join - Hash Cond: (t2.unique2 = t1.unique1) - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: t2.unique2 - -> Hash Join - Hash Cond: (t2.unique1 = t3.unique1) - -> Index Scan using tenk1_unique2 on tenk1 t2 - -> Hash - -> Index Only Scan using tenk1_unique1 on tenk1 t3 - -> Hash - -> Index Only Scan using tenk1_unique1 on tenk1 t1 + QUERY PLAN +------------------------------------------------------------------------------------------- + Finalize Aggregate + -> Gather Motion 3:1 (slice1; segments: 3) + -> Partial Aggregate + -> Hash Join + Hash Cond: (t2.unique2 = t1.unique1) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: t2.unique2 + -> Hash Join + Hash Cond: (t2.unique1 = t3.unique1) + -> Index Scan using tenk1_unique2 on tenk1 t2 + -> Hash + -> Index Only Scan using tenk1_unique1 on tenk1 t3 + -> Hash + -> Index Only Scan using tenk1_unique1 on tenk1 t1 Optimizer: Postgres query optimizer -(18 rows) +(15 rows) -- Parallel sort but with expression (correlated subquery) that -- is prohibited in parallel plans. diff --git a/src/test/regress/expected/offload_entry_to_qe.out b/src/test/regress/expected/offload_entry_to_qe.out index 1c1d0a13a25..4b441e75572 100644 --- a/src/test/regress/expected/offload_entry_to_qe.out +++ b/src/test/regress/expected/offload_entry_to_qe.out @@ -90,26 +90,20 @@ explain (costs off, locus) select count(x) over () from tst group by x limit 1; (14 rows) explain (costs off, locus) select distinct min(x), max(x) from tst; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------ Gather Motion 1:1 (slice1; segments: 1) Locus: Entry - -> Unique + -> Finalize Aggregate Locus: SingleQE - Group Key: (min(x)), (max(x)) - -> Sort + -> Gather Motion 3:1 (slice2; segments: 3) Locus: SingleQE - Sort Key: (min(x)), (max(x)) - -> Finalize Aggregate - Locus: SingleQE - -> Gather Motion 3:1 (slice2; segments: 3) - Locus: SingleQE - -> Partial Aggregate - Locus: Hashed - -> Seq Scan on tst - Locus: Hashed + -> Partial Aggregate + Locus: Hashed + -> Seq Scan on tst + Locus: Hashed Optimizer: Postgres query optimizer -(17 rows) +(11 rows) reset enable_offload_entry_to_qe; -- compare results with GUC set to false select count(x) from tst where x > 1; diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index bcfa5237e29..f463e61ef0d 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -534,3 +534,214 @@ DROP TABLE capitals; DROP TABLE cities; set gp_statistics_pullup_from_child_partition to off; -- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on +create table t_distinct_sort(a int, b int, c int); +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +analyze t_distinct_sort; +explain(verbose, costs off) +select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(10 rows) + +select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a); + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(10 rows) + +select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select count(a), sum(b) from t_distinct_sort order by sum(a), count(c); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(10 rows) + +select count(a), sum(b) from t_distinct_sort order by sum(a), count(c); + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select distinct count(a), sum(b) from t_distinct_sort ; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(10 rows) + +select distinct count(a), sum(b) from t_distinct_sort ; + count | sum +-------+----- + 30 | 195 +(1 row) + +-- should keep distinct clause +explain(verbose, costs off) +select distinct on(count(random())) count(a), sum(b) from t_distinct_sort; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Unique + Output: (count(a)), (sum(b)), (count(random())) + Group Key: (count(random())) + -> Sort + Output: (count(a)), (sum(b)), (count(random())) + Sort Key: (count(random())) + -> Finalize Aggregate + Output: count(a), sum(b), count(random()) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(random())) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(random()) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(16 rows) + +select distinct on(count(random())) count(a), sum(b) from t_distinct_sort; + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select distinct(count(a)) from t_distinct_sort, (select distinct(count(*)), generate_series( +0, 2) from t_distinct_sort)as xx; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(t_distinct_sort.a) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(t_distinct_sort.a)) + -> Partial Aggregate + Output: PARTIAL count(t_distinct_sort.a) + -> Nested Loop + Output: t_distinct_sort.a + -> Broadcast Motion 1:3 (slice2; segments: 1) + -> Subquery Scan on xx + -> Unique + Output: (count(*)), (generate_series(0, 2)) + Group Key: (count(*)), (generate_series(0, 2)) + -> Sort + Output: (count(*)), (generate_series(0, 2)) + Sort Key: (count(*)), (generate_series(0, 2)) + -> ProjectSet + Output: (count(*)), generate_series(0, 2) + -> Finalize Aggregate + Output: count(*) + -> Gather Motion 3:1 (slice3; segments: 3) + Output: (PARTIAL count(*)) + -> Partial Aggregate + Output: PARTIAL count(*) + -> Seq Scan on public.t_distinct_sort t_distinct_sort_1 + Output: t_distinct_sort_1.a, t_distinct_sort_1.b, t_distinct_sort_1.c + -> Materialize + Output: t_distinct_sort.a + -> Seq Scan on public.t_distinct_sort + Output: t_distinct_sort.a + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(32 rows) + +select distinct(count(a)) from t_distinct_sort, (select distinct(count(*)), generate_series( +0, 2) from t_distinct_sort)as xx; + count +------- + 90 +(1 row) + +drop table t_distinct_sort; +explain(verbose, costs off) +select distinct(count(a)) from generate_series(0, 1) as a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Aggregate + Output: count(a) + -> Function Scan on pg_catalog.generate_series a + Output: a + Function Call: generate_series(0, 1) + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(7 rows) + +select distinct(count(a)) from generate_series(0, 1) as a; + count +------- + 2 +(1 row) + +explain(verbose, costs off) +select distinct(count(*)) from generate_series(0, 1) a join generate_series(0, 2) b on true; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Aggregate + Output: count(*) + -> Nested Loop + -> Function Scan on pg_catalog.generate_series a + Output: a.a + Function Call: generate_series(0, 1) + -> Function Scan on pg_catalog.generate_series b + Output: b.b + Function Call: generate_series(0, 2) + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'off' + Optimizer: Postgres query optimizer +(11 rows) + +select distinct(count(*)) from generate_series(0, 1) a join generate_series(0, 2) b on true; + count +------- + 6 +(1 row) + diff --git a/src/test/regress/expected/select_distinct_optimizer.out b/src/test/regress/expected/select_distinct_optimizer.out index 91b868818c9..702bb6c23fa 100644 --- a/src/test/regress/expected/select_distinct_optimizer.out +++ b/src/test/regress/expected/select_distinct_optimizer.out @@ -537,3 +537,216 @@ DROP TABLE capitals; DROP TABLE cities; set gp_statistics_pullup_from_child_partition to off; -- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on +create table t_distinct_sort(a int, b int, c int); +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +analyze t_distinct_sort; +explain(verbose, costs off) +select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Sort + Output: (count(a)), (sum(b)) + Sort Key: (sum(t_distinct_sort.b)), (count(t_distinct_sort.a)) + -> Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on' + Optimizer: Pivotal Optimizer (GPORCA) +(13 rows) + +select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a); + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on' + Optimizer: Postgres query optimizer +(10 rows) + +select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select count(a), sum(b) from t_distinct_sort order by sum(a), count(c); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Result + Output: (count(a)), (sum(b)) + -> Sort + Output: (count(a)), (sum(b)), (sum(a)), (count(c)) + Sort Key: (sum(t_distinct_sort.a)), (count(t_distinct_sort.c)) + -> Finalize Aggregate + Output: count(a), sum(b), sum(a), count(c) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL sum(a)), (PARTIAL count(c)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL sum(a), PARTIAL count(c) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on' + Optimizer: Pivotal Optimizer (GPORCA) +(15 rows) + +select count(a), sum(b) from t_distinct_sort order by sum(a), count(c); + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select distinct count(a), sum(b) from t_distinct_sort ; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(a), sum(b) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b) + -> Seq Scan on public.t_distinct_sort + Output: a, b + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on' + Optimizer: Pivotal Optimizer (GPORCA) +(10 rows) + +select distinct count(a), sum(b) from t_distinct_sort ; + count | sum +-------+----- + 30 | 195 +(1 row) + +-- should keep distinct clause +explain(verbose, costs off) +select distinct on(count(random())) count(a), sum(b) from t_distinct_sort; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Unique + Output: (count(a)), (sum(b)), (count(random())) + Group Key: (count(random())) + -> Sort + Output: (count(a)), (sum(b)), (count(random())) + Sort Key: (count(random())) + -> Finalize Aggregate + Output: count(a), sum(b), count(random()) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(random())) + -> Partial Aggregate + Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(random()) + -> Seq Scan on public.t_distinct_sort + Output: a, b, c + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off', optimizer = 'on' + Optimizer: Postgres query optimizer +(16 rows) + +select distinct on(count(random())) count(a), sum(b) from t_distinct_sort; + count | sum +-------+----- + 30 | 195 +(1 row) + +explain(verbose, costs off) +select distinct(count(a)) from t_distinct_sort, (select distinct(count(*)), generate_series( +0, 2) from t_distinct_sort)as xx; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(t_distinct_sort_1.a) + -> Gather Motion 3:1 (slice1; segments: 3) + Output: (PARTIAL count(t_distinct_sort_1.a)) + -> Partial Aggregate + Output: PARTIAL count(t_distinct_sort_1.a) + -> Nested Loop + Output: t_distinct_sort_1.a + Join Filter: true + -> Seq Scan on public.t_distinct_sort t_distinct_sort_1 + Output: t_distinct_sort_1.a + -> Materialize + -> Broadcast Motion 1:3 (slice2) + -> Result + -> ProjectSet + Output: generate_series(0, 2) + -> Finalize Aggregate + Output: count() + -> Gather Motion 3:1 (slice3; segments: 3) + Output: (PARTIAL count()) + -> Partial Aggregate + Output: PARTIAL count() + -> Seq Scan on public.t_distinct_sort + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off' + Optimizer: Pivotal Optimizer (GPORCA) +(25 rows) + +select distinct(count(a)) from t_distinct_sort, (select distinct(count(*)), generate_series( +0, 2) from t_distinct_sort)as xx; + count +------- + 90 +(1 row) + +drop table t_distinct_sort; +explain(verbose, costs off) +select distinct(count(a)) from generate_series(0, 1) as a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Aggregate + Output: count(generate_series) + -> Function Scan on pg_catalog.generate_series + Output: generate_series + Function Call: generate_series(0, 1) + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off' + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +select distinct(count(a)) from generate_series(0, 1) as a; + count +------- + 2 +(1 row) + +explain(verbose, costs off) +select distinct(count(*)) from generate_series(0, 1) a join generate_series(0, 2) b on true; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Aggregate + Output: count() + -> Nested Loop + Join Filter: true + -> Function Scan on pg_catalog.generate_series generate_series_1 + Output: generate_series_1.generate_series + Function Call: generate_series(0, 1) + -> Function Scan on pg_catalog.generate_series + Output: generate_series.generate_series + Function Call: generate_series(0, 2) + Settings: enable_hashagg = 'on', enable_sort = 'on', gp_statistics_pullup_from_child_partition = 'off' + Optimizer: Pivotal Optimizer (GPORCA) +(12 rows) + +select distinct(count(*)) from generate_series(0, 1) a join generate_series(0, 2) b on true; + count +------- + 6 +(1 row) + diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index 27b63e699a0..a9ec619ff7e 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -200,3 +200,41 @@ DROP TABLE capitals; DROP TABLE cities; set gp_statistics_pullup_from_child_partition to off; -- gpdb end: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on + +create table t_distinct_sort(a int, b int, c int); +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +insert into t_distinct_sort select i, i+1, i+2 from generate_series(1, 10)i; +analyze t_distinct_sort; + +explain(verbose, costs off) +select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a); +select distinct count(a), sum(b) from t_distinct_sort order by sum(b), count(a); +explain(verbose, costs off) +select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); +select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); +explain(verbose, costs off) +select count(a), sum(b) from t_distinct_sort order by sum(a), count(c); +select count(a), sum(b) from t_distinct_sort order by sum(a), count(c); +explain(verbose, costs off) +select distinct count(a), sum(b) from t_distinct_sort ; +select distinct count(a), sum(b) from t_distinct_sort ; + +-- should keep distinct clause +explain(verbose, costs off) +select distinct on(count(random())) count(a), sum(b) from t_distinct_sort; +select distinct on(count(random())) count(a), sum(b) from t_distinct_sort; + +explain(verbose, costs off) +select distinct(count(a)) from t_distinct_sort, (select distinct(count(*)), generate_series( +0, 2) from t_distinct_sort)as xx; +select distinct(count(a)) from t_distinct_sort, (select distinct(count(*)), generate_series( +0, 2) from t_distinct_sort)as xx; +drop table t_distinct_sort; + +explain(verbose, costs off) +select distinct(count(a)) from generate_series(0, 1) as a; +select distinct(count(a)) from generate_series(0, 1) as a; +explain(verbose, costs off) +select distinct(count(*)) from generate_series(0, 1) a join generate_series(0, 2) b on true; +select distinct(count(*)) from generate_series(0, 1) a join generate_series(0, 2) b on true; diff --git a/src/test/singlenode_regress/expected/aggregates.out b/src/test/singlenode_regress/expected/aggregates.out index 6cde5430002..2b151573b10 100644 --- a/src/test/singlenode_regress/expected/aggregates.out +++ b/src/test/singlenode_regress/expected/aggregates.out @@ -1054,15 +1054,13 @@ explain (costs off) select distinct max(unique2) from tenk1; QUERY PLAN --------------------------------------------------------------------- - HashAggregate - Group Key: $0 + Result InitPlan 1 (returns $0) -> Limit -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(8 rows) +(6 rows) select distinct max(unique2) from tenk1; max @@ -1074,15 +1072,13 @@ explain (costs off) select max(unique2) from tenk1 order by 1; QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: ($0) + Result InitPlan 1 (returns $0) -> Limit -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(8 rows) +(6 rows) select max(unique2) from tenk1 order by 1; max @@ -1094,15 +1090,13 @@ explain (costs off) select max(unique2) from tenk1 order by max(unique2); QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: ($0) + Result InitPlan 1 (returns $0) -> Limit -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(8 rows) +(6 rows) select max(unique2) from tenk1 order by max(unique2); max @@ -1114,15 +1108,13 @@ explain (costs off) select max(unique2) from tenk1 order by max(unique2)+1; QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: (($0 + 1)) + Result InitPlan 1 (returns $0) -> Limit -> Index Only Scan Backward using tenk1_unique2 on tenk1 Index Cond: (unique2 IS NOT NULL) - -> Result Optimizer: Postgres query optimizer -(8 rows) +(6 rows) select max(unique2) from tenk1 order by max(unique2)+1; max @@ -1231,20 +1223,16 @@ reset enable_seqscan; -- DISTINCT doesn't do anything useful here, but it shouldn't fail explain (costs off) select distinct min(f1), max(f1) from minmaxtest; - QUERY PLAN --------------------------------------------------------------- - Unique - Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1)) - -> Sort - Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1)) - -> Aggregate - -> Append - -> Seq Scan on minmaxtest minmaxtest_1 - -> Seq Scan on minmaxtest1 minmaxtest_2 - -> Seq Scan on minmaxtest2 minmaxtest_3 - -> Seq Scan on minmaxtest3 minmaxtest_4 + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Append + -> Seq Scan on minmaxtest minmaxtest_1 + -> Seq Scan on minmaxtest1 minmaxtest_2 + -> Seq Scan on minmaxtest2 minmaxtest_3 + -> Seq Scan on minmaxtest3 minmaxtest_4 Optimizer: Postgres query optimizer -(11 rows) +(7 rows) select distinct min(f1), max(f1) from minmaxtest; min | max diff --git a/src/test/singlenode_regress/expected/incremental_sort.out b/src/test/singlenode_regress/expected/incremental_sort.out index 40c7dd477cf..179aeabaa87 100644 --- a/src/test/singlenode_regress/expected/incremental_sort.out +++ b/src/test/singlenode_regress/expected/incremental_sort.out @@ -1610,20 +1610,18 @@ join tenk1 t3 on t2.unique1 = t3.unique1 order by count(*); QUERY PLAN ------------------------------------------------------------------------------- - Sort - Sort Key: (count(*)) - -> Aggregate + Aggregate + -> Hash Join + Hash Cond: (t2.unique1 = t3.unique1) -> Hash Join - Hash Cond: (t2.unique1 = t3.unique1) - -> Hash Join - Hash Cond: (t2.unique2 = t1.unique1) - -> Index Scan using tenk1_unique2 on tenk1 t2 - -> Hash - -> Index Only Scan using tenk1_unique1 on tenk1 t1 + Hash Cond: (t2.unique2 = t1.unique1) + -> Index Scan using tenk1_unique2 on tenk1 t2 -> Hash - -> Index Only Scan using tenk1_unique1 on tenk1 t3 + -> Index Only Scan using tenk1_unique1 on tenk1 t1 + -> Hash + -> Index Only Scan using tenk1_unique1 on tenk1 t3 Optimizer: Postgres query optimizer -(13 rows) +(10 rows) -- Parallel sort but with expression (correlated subquery) that -- is prohibited in parallel plans.