From 81f0ee275bb83882773306fc893a610c52e3c01a Mon Sep 17 00:00:00 2001
From: Leonid Borchuk <le.borchuk@gmail.com>
Date: Sun, 2 Feb 2025 00:29:03 +0300
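Subject: [PATCH] Do not call gporca for simple queries

Add the optimizer_relations_threshold GUC. When it is non-zero, a query
that has no aggregates, window functions, sublinks, CTEs, DISTINCT ON or
LATERAL range table entries and that references no more than that many
relations is planned by the Postgres planner instead of GPORCA. The
default of 0 keeps the existing behaviour of always trying GPORCA.
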
---
 src/backend/optimizer/plan/planner.c          | 45 +++++++++++++-
 src/backend/utils/misc/guc_gp.c               | 12 ++++
 src/include/utils/guc.h                       |  1 +
 src/include/utils/unsync_guc_name.h           |  1 +
 src/test/regress/expected/gporca.out          | 61 ++++++++++++++++++
 .../regress/expected/gporca_optimizer.out     | 62 +++++++++++++++++++
 src/test/regress/sql/gporca.sql               | 10 +++
 7 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 16efdb2ed80..04d9235a27d 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -183,6 +183,7 @@ static List *preprocess_groupclause(PlannerInfo *root, List *force);
 static List *extract_rollup_sets(List *groupingSets);
 static List *reorder_grouping_sets(List *groupingSets, List *sortclause);
 static void standard_qp_callback(PlannerInfo *root, void *extra);
+static bool enabled_for_optimizer(Query *parse);
 static double get_number_of_groups(PlannerInfo *root,
 								   double path_rows,
 								   grouping_sets_data *gd,
@@ -342,6 +343,47 @@ planner(Query *parse, const char *query_string, int cursorOptions,
 	return result;
 }
 
+/*
+ * Decide whether a query is complex enough to be handed to GPORCA.
+ *
+ * Returns true when GPORCA should be tried, false when the query should go
+ * straight to the Postgres planner.  With optimizer_relations_threshold = 0
+ * the answer is always true.  Otherwise it is true only if the top-level
+ * query has aggregates, window functions, sublinks, CTEs, DISTINCT ON or a
+ * LATERAL range table entry, or more than optimizer_relations_threshold
+ * plain relations in its range table.
+ *
+ * Only parse->rtable is scanned, so relations referenced inside subqueries
+ * in FROM are not counted.
+ */
+bool
+enabled_for_optimizer(Query *parse)
+{
+	int			num_relations = 0;
+	ListCell   *lc;
+
+	if (optimizer_relations_threshold == 0)
+		return true;
+
+	if (parse->hasAggs || parse->hasWindowFuncs || parse->hasSubLinks ||
+		parse->hasRecursive || parse->hasDistinctOn ||
+		parse->cteList || parse->hasModifyingCTE)
+		return true;
+
+	foreach(lc, parse->rtable)
+	{
+		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
+
+		/* Joins, RTE_RESULT and all other RTE kinds are not counted. */
+		if (rte->rtekind == RTE_RELATION)
+			num_relations++;
+		if (rte->lateral || num_relations > optimizer_relations_threshold)
+			return true;
+	}
+
+	return false;
+}
+
 PlannedStmt *
 standard_planner(Query *parse, const char *query_string, int cursorOptions,
 				 ParamListInfo boundParams)
@@ -373,11 +415,12 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 	 *
 	 * PARALLEL RETRIEVE CURSOR is not supported by ORCA yet.
 	 */
+	
 	if (optimizer &&
 		GP_ROLE_DISPATCH == Gp_role &&
 		IS_QUERY_DISPATCHER() &&
 		(cursorOptions & CURSOR_OPT_SKIP_FOREIGN_PARTITIONS) == 0 &&
-		(cursorOptions & CURSOR_OPT_PARALLEL_RETRIEVE) == 0)
+		(cursorOptions & CURSOR_OPT_PARALLEL_RETRIEVE) == 0 && enabled_for_optimizer(parse))
 	{
 
 #ifdef USE_ORCA
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index b50f55d1024..313f9ede259 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -380,6 +380,7 @@ bool		optimizer_enable_derive_stats_all_groups;
 int			optimizer_segments;
 int			optimizer_penalize_broadcast_threshold;
 double		optimizer_cost_threshold;
+int			optimizer_relations_threshold;
 double		optimizer_nestloop_factor;
 double		optimizer_sort_factor;
 double		optimizer_spilling_mem_threshold;
@@ -4363,6 +4364,17 @@ struct config_int ConfigureNamesInt_gp[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"optimizer_relations_threshold", PGC_USERSET, DEVELOPER_OPTIONS,
+			gettext_noop("Plan simple queries referencing at most this number of relations with the Postgres planner instead of gporca, 0 means always try to use gporca"),
+			NULL,
+			GUC_NOT_IN_SAMPLE
+		},
+		&optimizer_relations_threshold,
+		0, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"memory_profiler_dataset_size", PGC_USERSET, DEVELOPER_OPTIONS,
 			gettext_noop("Set the size in GB"),
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index a6d3e9e3c84..cf66c20a282 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -559,6 +559,7 @@ extern bool optimizer_enable_derive_stats_all_groups;
 extern int optimizer_segments;
 extern int optimizer_penalize_broadcast_threshold;
 extern double optimizer_cost_threshold;
+extern int optimizer_relations_threshold;
 extern double optimizer_nestloop_factor;
 extern double optimizer_sort_factor;
 extern double optimizer_spilling_mem_threshold;
diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h
index 6f8e02e6f0c..8d00b464ceb 100644
--- a/src/include/utils/unsync_guc_name.h
+++ b/src/include/utils/unsync_guc_name.h
@@ -379,6 +379,7 @@
 		"optimizer_control",
 		"optimizer_cost_model",
 		"optimizer_cost_threshold",
+		"optimizer_relations_threshold",
 		"optimizer_cte_inlining",
 		"optimizer_cte_inlining_bound",
 		"optimizer_damping_factor_filter",
diff --git a/src/test/regress/expected/gporca.out b/src/test/regress/expected/gporca.out
index 0d83a50c178..c7c6d5e0479 100644
--- a/src/test/regress/expected/gporca.out
+++ b/src/test/regress/expected/gporca.out
@@ -14933,3 +14933,64 @@ SELECT CAST(a AS TEXT[]) FROM array_coerceviaio;
 (1 row)
 
 ---------------------------------------------------------------------------------
+-- Test do not use ORCA when optimizer_relations_threshold is set
+create table ort(a int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+explain insert into ort values(1);
+                   QUERY PLAN
+-------------------------------------------------
+ Insert on ort  (cost=0.00..0.03 rows=0 width=0)
+   ->  Result  (cost=0.00..0.01 rows=1 width=4)
+ Optimizer: Postgres query optimizer
+(3 rows)
+
+set optimizer_relations_threshold = 1;
+explain insert into ort values(1);
+                   QUERY PLAN
+-------------------------------------------------
+ Insert on ort  (cost=0.00..0.03 rows=0 width=0)
+   ->  Result  (cost=0.00..0.01 rows=1 width=4)
+ Optimizer: Postgres query optimizer
+(3 rows)
+
+explain select * from ort a join ort b on a.a = b.a;
+                                       QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=756.25..284554.25 rows=9273690 width=8)
+   ->  Hash Join  (cost=756.25..160905.05 rows=3091230 width=8)
+         Hash Cond: (a.a = b.a)
+         ->  Seq Scan on ort a  (cost=0.00..355.00 rows=32100 width=4)
+         ->  Hash  (cost=355.00..355.00 rows=32100 width=4)
+               ->  Seq Scan on ort b  (cost=0.00..355.00 rows=32100 width=4)
+ Optimizer: Postgres query optimizer
+(7 rows)
+
+set optimizer_relations_threshold = 2;
+explain select count(a.a) from ort a join ort b on a.a = b.a;
+                                         QUERY PLAN
+--------------------------------------------------------------------------------------------
+ Finalize Aggregate  (cost=168633.18..168633.19 rows=1 width=8)
+   ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=168633.12..168633.18 rows=3 width=8)
+         ->  Partial Aggregate  (cost=168633.12..168633.14 rows=1 width=8)
+               ->  Hash Join  (cost=756.25..160905.05 rows=3091230 width=4)
+                     Hash Cond: (a.a = b.a)
+                     ->  Seq Scan on ort a  (cost=0.00..355.00 rows=32100 width=4)
+                     ->  Hash  (cost=355.00..355.00 rows=32100 width=4)
+                           ->  Seq Scan on ort b  (cost=0.00..355.00 rows=32100 width=4)
+ Optimizer: Postgres query optimizer
+(9 rows)
+
+explain select * from ort a join ort b on a.a = b.a;
+                                       QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=756.25..284554.25 rows=9273690 width=8)
+   ->  Hash Join  (cost=756.25..160905.05 rows=3091230 width=8)
+         Hash Cond: (a.a = b.a)
+         ->  Seq Scan on ort a  (cost=0.00..355.00 rows=32100 width=4)
+         ->  Hash  (cost=355.00..355.00 rows=32100 width=4)
+               ->  Seq Scan on ort b  (cost=0.00..355.00 rows=32100 width=4)
+ Optimizer: Postgres query optimizer
+(7 rows)
+
+drop table ort;
diff --git a/src/test/regress/expected/gporca_optimizer.out b/src/test/regress/expected/gporca_optimizer.out
index 6eb308c4164..9337befb698 100644
--- a/src/test/regress/expected/gporca_optimizer.out
+++ b/src/test/regress/expected/gporca_optimizer.out
@@ -15004,3 +15004,65 @@ SELECT CAST(a AS TEXT[]) FROM array_coerceviaio;
 (1 row)
 
 ---------------------------------------------------------------------------------
+-- Test do not use ORCA when optimizer_relations_threshold is set
+create table ort(a int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+explain insert into ort values(1);
+                         QUERY PLAN
+------------------------------------------------------------
+ Insert on ort  (cost=0.00..0.01 rows=1 width=4)
+   ->  Result  (cost=0.00..0.00 rows=1 width=8)
+         ->  Result  (cost=0.00..0.00 rows=1 width=4)
+               ->  Result  (cost=0.00..0.00 rows=1 width=1)
+ Optimizer: Pivotal Optimizer (GPORCA)
+(5 rows)
+
+set optimizer_relations_threshold = 1;
+explain insert into ort values(1);
+                   QUERY PLAN
+-------------------------------------------------
+ Insert on ort  (cost=0.00..0.03 rows=0 width=0)
+   ->  Result  (cost=0.00..0.01 rows=1 width=4)
+ Optimizer: Postgres query optimizer
+(3 rows)
+
+explain select * from ort a join ort b on a.a = b.a;
+                                  QUERY PLAN
+------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.00 rows=1 width=8)
+   ->  Hash Join  (cost=0.00..862.00 rows=1 width=8)
+         Hash Cond: (ort.a = ort_1.a)
+         ->  Seq Scan on ort  (cost=0.00..431.00 rows=1 width=4)
+         ->  Hash  (cost=431.00..431.00 rows=1 width=4)
+               ->  Seq Scan on ort ort_1  (cost=0.00..431.00 rows=1 width=4)
+ Optimizer: Pivotal Optimizer (GPORCA)
+(7 rows)
+
+set optimizer_relations_threshold = 2;
+explain select count(a.a) from ort a join ort b on a.a = b.a;
+                                     QUERY PLAN
+------------------------------------------------------------------------------------
+ Aggregate  (cost=0.00..862.00 rows=1 width=8)
+   ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..862.00 rows=1 width=4)
+         ->  Hash Join  (cost=0.00..862.00 rows=1 width=4)
+               Hash Cond: (ort.a = ort_1.a)
+               ->  Seq Scan on ort  (cost=0.00..431.00 rows=1 width=4)
+               ->  Hash  (cost=431.00..431.00 rows=1 width=4)
+                     ->  Seq Scan on ort ort_1  (cost=0.00..431.00 rows=1 width=4)
+ Optimizer: Pivotal Optimizer (GPORCA)
+(8 rows)
+
+explain select * from ort a join ort b on a.a = b.a;
+                                       QUERY PLAN
+-----------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=756.25..284554.25 rows=9273690 width=8)
+   ->  Hash Join  (cost=756.25..160905.05 rows=3091230 width=8)
+         Hash Cond: (a.a = b.a)
+         ->  Seq Scan on ort a  (cost=0.00..355.00 rows=32100 width=4)
+         ->  Hash  (cost=355.00..355.00 rows=32100 width=4)
+               ->  Seq Scan on ort b  (cost=0.00..355.00 rows=32100 width=4)
+ Optimizer: Postgres query optimizer
+(7 rows)
+
+drop table ort;
diff --git a/src/test/regress/sql/gporca.sql b/src/test/regress/sql/gporca.sql
index a659d5695c6..fb72970dd6f 100644
--- a/src/test/regress/sql/gporca.sql
+++ b/src/test/regress/sql/gporca.sql
@@ -3706,6 +3706,16 @@ INSERT INTO array_coerceviaio values(ARRAY[1, 2, 3]);
 EXPLAIN SELECT CAST(a AS TEXT[]) FROM array_coerceviaio;
 SELECT CAST(a AS TEXT[]) FROM array_coerceviaio;
 ---------------------------------------------------------------------------------
+-- Test do not use ORCA when optimizer_relations_threshold is set
+create table ort(a int);
+explain insert into ort values(1);
+set optimizer_relations_threshold = 1;
+explain insert into ort values(1);
+explain select * from ort a join ort b on a.a = b.a;
+set optimizer_relations_threshold = 2;
+explain select count(a.a) from ort a join ort b on a.a = b.a;
+explain select * from ort a join ort b on a.a = b.a;
+drop table ort;
 
 -- start_ignore
 DROP SCHEMA orca CASCADE;