From 65c1e2ac8ea715667a675214c5bef8d7ef39a585 Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Fri, 18 Oct 2024 23:43:13 +0800 Subject: [PATCH 1/4] Fast path to REFRESH materialized view. We already have the ability to track the data status of some materialized views, i.e. to know whether their data is up to date or not, so we can avoid doing the real REFRESH if the data of the view is up to date. The un-refreshed data should be logically the same as after a real REFRESH when no data has changed since the latest REFRESH command. In that case we may save a lot (reading data from the view query, computing, and writing into the view table), e.g. when a REFRESH of the view that takes a long time and a lot of resources is run periodically by a cron task or executed manually by users each time. New GUC: gp_enable_refresh_fast_path This feature is enabled by default, but users can turn it off if they intend to do a real REFRESH. Authored-by: Zhang Mingli avamingli@gmail.com --- src/backend/commands/matview.c | 21 ++++++++++++ src/backend/utils/misc/guc_gp.c | 14 ++++++++ src/include/utils/guc.h | 2 ++ src/include/utils/unsync_guc_name.h | 1 + src/test/regress/expected/matview.out | 34 +++++++++++++++++++ .../regress/expected/matview_optimizer.out | 33 ++++++++++++++++++ src/test/regress/sql/matview.sql | 19 +++++++++++ 7 files changed, 124 insertions(+) diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index fc85217448b..70a856ce10a 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -387,6 +387,27 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, matviewRel = table_open(matviewOid, NoLock); relowner = matviewRel->rd_rel->relowner; + /* + * Fastpath to REFRESH a view: + * avoid do the real REFRESH if the data of view + * is up to date. The data should be the logically same as after + * REFRESH when there is data changed since latest REFRESH. 
+ * In that case we may save a lot, ex: a cron task REFRESH view periodically + * or manually executed by users each time. + * + * Set this feature default to true, but let uesrs decide if they intend + * to do a real REFRESH. + */ + if (gp_enable_refresh_fast_path && + !RelationIsIVM(matviewRel) && + !stmt->skipData && + MatviewIsGeneralyUpToDate(matviewOid)) + { + ObjectAddressSet(address, RelationRelationId, matviewOid); + table_close(matviewRel, NoLock); + return address; + } + /* * Switch to the owner's userid, so that any functions are run as that * user. Also lock down security-restricted operations and arrange to diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index 958e1731c26..bdc7eaf700b 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -448,6 +448,9 @@ bool gp_log_endpoints = false; /* optional reject to parse ambigous 5-digits date in YYYMMDD format */ bool gp_allow_date_field_width_5digits = false; +/* Avoid doing a real REFRESH of a materialized view if possible. 
*/ +bool gp_enable_refresh_fast_path = true; + static const struct config_enum_entry gp_log_format_options[] = { {"text", 0}, {"csv", 1}, @@ -3126,6 +3129,17 @@ struct config_bool ConfigureNamesBool_gp[] = false, NULL, NULL, NULL }, + + { + {"gp_enable_refresh_fast_path", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Avoid do a real REFRESH materialized view if possibile."), + NULL + }, + &gp_enable_refresh_fast_path, + true, + NULL, NULL, NULL + }, + { {"gp_detect_data_correctness", PGC_USERSET, UNGROUPED, gettext_noop("Detect if the current partitioning of the table or data distribution is correct."), diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 1f0471e5584..3eac20b1013 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -623,6 +623,8 @@ extern bool gp_external_enable_filter_pushdown; /* Enable the Global Deadlock Detector */ extern bool gp_enable_global_deadlock_detector; +extern bool gp_enable_refresh_fast_path; + extern bool gp_enable_predicate_pushdown; extern int gp_predicate_pushdown_sample_rows; diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h index 667fc80b242..930ba5a8891 100644 --- a/src/include/utils/unsync_guc_name.h +++ b/src/include/utils/unsync_guc_name.h @@ -196,6 +196,7 @@ "gp_enable_predicate_pushdown", "gp_enable_preunique", "gp_enable_query_metrics", + "gp_enable_refresh_fast_path", "gp_enable_relsize_collection", "gp_enable_slow_writer_testmode", "gp_enable_sort_distinct", diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 51b99230907..55e16a03f31 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -746,3 +746,37 @@ NOTICE: relation "matview_ine_tab" already exists, skipping (0 rows) DROP MATERIALIZED VIEW matview_ine_tab; +-- test REFRESH fast path +create materialized view mv_fast as select * from mvtest_t; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using 
column(s) named 'id' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +set gp_enable_refresh_fast_path = off; +select relfilenode into temp mv_fast_relfilenode_0 from pg_class where oid = 'mv_fast'::regclass::oid; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'relfilenode' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +refresh materialized view mv_fast; +select relfilenode into temp mv_fast_relfilenode_1 from pg_class where oid = 'mv_fast'::regclass::oid; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'relfilenode' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +-- shoule be 0 +select count(*) from mv_fast_relfilenode_0 natural join mv_fast_relfilenode_1; + count +------- + 0 +(1 row) + +-- relfilenode should not be changed then. +set gp_enable_refresh_fast_path = on; +refresh materialized view mv_fast; +select relfilenode into temp mv_fast_relfilenode_2 from pg_class where oid = 'mv_fast'::regclass::oid; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'relfilenode' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+-- shoule be 1 +select count(*) from mv_fast_relfilenode_1 natural join mv_fast_relfilenode_2; + count +------- + 1 +(1 row) + +reset gp_enable_refresh_fast_path; +drop materialized view mv_fast; diff --git a/src/test/regress/expected/matview_optimizer.out b/src/test/regress/expected/matview_optimizer.out index 219695e75ae..c54b1943ecb 100644 --- a/src/test/regress/expected/matview_optimizer.out +++ b/src/test/regress/expected/matview_optimizer.out @@ -764,3 +764,36 @@ NOTICE: relation "matview_ine_tab" already exists, skipping (0 rows) DROP MATERIALIZED VIEW matview_ine_tab; +-- test REFRESH fast path +create materialized view mv_fast as select * from mvtest_t; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. +set gp_enable_refresh_fast_path = off; +select relfilenode into temp mv_fast_relfilenode_0 from pg_class where oid = 'mv_fast'::regclass::oid; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'relfilenode' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +refresh materialized view mv_fast; +select relfilenode into temp mv_fast_relfilenode_1 from pg_class where oid = 'mv_fast'::regclass::oid; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'relfilenode' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +-- shoule be 0 +select count(*) from mv_fast_relfilenode_0 natural join mv_fast_relfilenode_1; + count +------- + 0 +(1 row) + +-- relfilenode should not be changed then. 
+set gp_enable_refresh_fast_path = on; +refresh materialized view mv_fast; +select relfilenode into temp mv_fast_relfilenode_2 from pg_class where oid = 'mv_fast'::regclass::oid; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'relfilenode' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +-- shoule be 1 +select count(*) from mv_fast_relfilenode_1 natural join mv_fast_relfilenode_2; + count +------- + 1 +(1 row) + +reset gp_enable_refresh_fast_path; +drop materialized view mv_fast; diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql index 4477ffe8d9c..76739f281af 100644 --- a/src/test/regress/sql/matview.sql +++ b/src/test/regress/sql/matview.sql @@ -322,3 +322,22 @@ EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) CREATE MATERIALIZED VIEW IF NOT EXISTS matview_ine_tab AS SELECT 1 / 0 WITH NO DATA; -- ok DROP MATERIALIZED VIEW matview_ine_tab; + +-- test REFRESH fast path +create materialized view mv_fast as select * from mvtest_t; +set gp_enable_refresh_fast_path = off; +select relfilenode into temp mv_fast_relfilenode_0 from pg_class where oid = 'mv_fast'::regclass::oid; +refresh materialized view mv_fast; +select relfilenode into temp mv_fast_relfilenode_1 from pg_class where oid = 'mv_fast'::regclass::oid; +-- shoule be 0 +select count(*) from mv_fast_relfilenode_0 natural join mv_fast_relfilenode_1; + +-- relfilenode should not be changed then. 
+set gp_enable_refresh_fast_path = on; +refresh materialized view mv_fast; +select relfilenode into temp mv_fast_relfilenode_2 from pg_class where oid = 'mv_fast'::regclass::oid; +-- shoule be 1 +select count(*) from mv_fast_relfilenode_1 natural join mv_fast_relfilenode_2; + +reset gp_enable_refresh_fast_path; +drop materialized view mv_fast; From a7f93e7b82523aa97bddec4c9a3b9dc651aeffe1 Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Sat, 19 Oct 2024 20:52:57 +0800 Subject: [PATCH 2/4] Fix case which is intended to test error of REFRESH. Authored-by: Zhang Mingli avamingli@gmail.com --- src/test/regress/sql/pg_ext_aux.sql | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/test/regress/sql/pg_ext_aux.sql b/src/test/regress/sql/pg_ext_aux.sql index fb445b52363..1e3c9b25dd4 100644 --- a/src/test/regress/sql/pg_ext_aux.sql +++ b/src/test/regress/sql/pg_ext_aux.sql @@ -40,7 +40,13 @@ select pg_ext_aux.extaux_add1(7); -- fail: should not allowed to insert by user insert into pg_ext_aux.extaux_t values(1,'hello'); -- fail: should not allowed to refresh by user +-- start_ignore +set gp_enable_refresh_fast_path = off; +-- end_ignore refresh materialized view pg_ext_aux.extaux_mv; +-- start_ignore +reset gp_enable_refresh_fast_path; +-- end_ignore -- fail: should not allow to be dropped by user drop view pg_ext_aux.extaux_v; From 677d0b2fb6cf7a20f952c0632aaf910e3c900183 Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Mon, 21 Oct 2024 23:13:11 +0800 Subject: [PATCH 3/4] Move codes after some checks. 
Authored-by: Zhang Mingli avamingli@gmail.com --- src/backend/commands/matview.c | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 70a856ce10a..b98d340c934 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -387,27 +387,6 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, matviewRel = table_open(matviewOid, NoLock); relowner = matviewRel->rd_rel->relowner; - /* - * Fastpath to REFRESH a view: - * avoid do the real REFRESH if the data of view - * is up to date. The data should be the logically same as after - * REFRESH when there is data changed since latest REFRESH. - * In that case we may save a lot, ex: a cron task REFRESH view periodically - * or manually executed by users each time. - * - * Set this feature default to true, but let uesrs decide if they intend - * to do a real REFRESH. - */ - if (gp_enable_refresh_fast_path && - !RelationIsIVM(matviewRel) && - !stmt->skipData && - MatviewIsGeneralyUpToDate(matviewOid)) - { - ObjectAddressSet(address, RelationRelationId, matviewOid); - table_close(matviewRel, NoLock); - return address; - } - /* * Switch to the owner's userid, so that any functions are run as that * user. Also lock down security-restricted operations and arrange to @@ -439,6 +418,27 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, errmsg("%s and %s options cannot be used together", "CONCURRENTLY", "WITH NO DATA"))); + /* + * Fast path to REFRESH a view: + * avoid do the real REFRESH if the data of view + * is up to date. The data should be the logically same as after + * REFRESH when there is data changed since latest REFRESH. + * In that case we may save a lot, ex: a cron task REFRESH view periodically + * or manually executed by users each time. + * + * Set this feature default to true, but let uesrs decide if they intend + * to do a real REFRESH. 
+ */ + if (gp_enable_refresh_fast_path && + !RelationIsIVM(matviewRel) && + !stmt->skipData && + MatviewIsGeneralyUpToDate(matviewOid)) + { + ObjectAddressSet(address, RelationRelationId, matviewOid); + table_close(matviewRel, NoLock); + return address; + } + viewQuery = get_matview_query(matviewRel); /* For IMMV, we need to rewrite matview query */ From d39eed0753e1a9f6a7f5e308cb6db927cb0ee0f8 Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Tue, 22 Oct 2024 10:12:08 +0800 Subject: [PATCH 4/4] Rollback User GUCs and etc. Authored-by: Zhang Mingli avamingli@gmail.com --- src/backend/commands/matview.c | 49 +++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index b98d340c934..49cdce3b842 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -418,27 +418,6 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, errmsg("%s and %s options cannot be used together", "CONCURRENTLY", "WITH NO DATA"))); - /* - * Fast path to REFRESH a view: - * avoid do the real REFRESH if the data of view - * is up to date. The data should be the logically same as after - * REFRESH when there is data changed since latest REFRESH. - * In that case we may save a lot, ex: a cron task REFRESH view periodically - * or manually executed by users each time. - * - * Set this feature default to true, but let uesrs decide if they intend - * to do a real REFRESH. 
- */ - if (gp_enable_refresh_fast_path && - !RelationIsIVM(matviewRel) && - !stmt->skipData && - MatviewIsGeneralyUpToDate(matviewOid)) - { - ObjectAddressSet(address, RelationRelationId, matviewOid); - table_close(matviewRel, NoLock); - return address; - } - viewQuery = get_matview_query(matviewRel); /* For IMMV, we need to rewrite matview query */ @@ -489,6 +468,34 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, */ CheckTableNotInUse(matviewRel, "REFRESH MATERIALIZED VIEW"); + /* + * Fast path to REFRESH a view: + * avoid doing the real REFRESH if the data of the view + * is up to date. The data should be logically the same as after + * a REFRESH when no data has changed since the latest REFRESH. + * In that case we may save a lot, e.g. a cron task that REFRESHes the view + * periodically, or a REFRESH manually executed by users each time. + * + * This feature is enabled by default, but users can disable it if they + * intend to do a real REFRESH. + */ + if (gp_enable_refresh_fast_path && + !RelationIsIVM(matviewRel) && + !stmt->skipData && + MatviewIsGeneralyUpToDate(matviewOid)) + { + table_close(matviewRel, NoLock); + + /* Roll back any GUC changes */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + ObjectAddressSet(address, RelationRelationId, matviewOid); + return address; + } + /* * Tentatively mark the matview as populated or not (this will roll back * if we fail later).