From adaee4ee959c179a6315c43ce877b6e871df43e7 Mon Sep 17 00:00:00 2001 From: Haolin Wang Date: Fri, 14 Oct 2022 14:40:53 +0800 Subject: [PATCH 1/2] Fix incorrect index->reltuples after VACUUM Given the scenario of updating stats during VACUUM: 1) coordinator vacuums and updates stats of its own; 2) then coordinator dispatches vacuum to segments; 3) coordinator combines stats received from segments to overwrite the stats of its own. Because upstream introduced a feature which could skip full index scan during cleanup of B-tree indexes when possible (refer to: https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and stop updating statistics, resulting in QD being unable to collect all QEs' stats thus overwrote a partial accumulated value to index->reltuples. More interestingly, it usually happened starting from the 3rd time of consecutive VACUUMs after fresh inserts due to the above skipping index scan criteria. This patch is intended to prevent the above issue by two aspects: on QE, do not skip full index scan, report current statistics to QD as requested; on QD, restrict updating statistics only when collecting all QEs' data completely; if the reporting QE number doesn't match the total number of dispatched QEs, no stats update happens. 
--- src/backend/access/nbtree/nbtree.c | 69 ++++++++ .../cdb/dispatcher/cdbdispatchresult.c | 2 + src/backend/commands/vacuum.c | 121 ++++++++++--- src/backend/commands/vacuum_ao.c | 30 ++-- src/include/cdb/cdbdispatchresult.h | 1 + src/include/commands/vacuum.h | 13 ++ src/test/regress/expected/btree_index.out | 124 ++++++++++++-- .../expected/uao_compaction/index_stats.out | 162 +++++++++++++++++- .../expected/uaocs_compaction/index_stats.out | 162 +++++++++++++++++- src/test/regress/sql/btree_index.sql | 55 ++++-- .../sql/uao_compaction/index_stats.sql | 81 ++++++++- .../sql/uaocs_compaction/index_stats.sql | 81 ++++++++- 12 files changed, 831 insertions(+), 70 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 906d753d222..46bc034fd2f 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -877,6 +877,75 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) SpinLockRelease(&btscan->btps_mutex); } +/* + * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that + * btbulkdelete() wasn't called. + */ +static bool +_bt_vacuum_needs_cleanup(IndexVacuumInfo *info) +{ + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + bool result = false; + + /* Return true directly on QE for stats collection from QD. */ + if (gp_vacuum_needs_update_stats()) + return true; + + metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + if (metad->btm_version < BTREE_NOVAC_VERSION) + { + /* + * Do cleanup if metapage needs upgrade, because we don't have + * cleanup-related meta-information yet. + */ + result = true; + } + else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && + TransactionIdPrecedes(metad->btm_oldest_btpo_xact, + RecentGlobalXmin)) + { + /* + * If oldest btpo.xact in the deleted pages is older than + * RecentGlobalXmin, then at least one deleted page can be recycled. 
+ */ + result = true; + } + else + { + StdRdOptions *relopts; + float8 cleanup_scale_factor; + float8 prev_num_heap_tuples; + + /* + * If table receives enough insertions and no cleanup was performed, + * then index would appear have stale statistics. If scale factor is + * set, we avoid that by performing cleanup if the number of inserted + * tuples exceeds vacuum_cleanup_index_scale_factor fraction of + * original tuples count. + */ + relopts = (StdRdOptions *) info->index->rd_options; + cleanup_scale_factor = (relopts && + relopts->vacuum_cleanup_index_scale_factor >= 0) + ? relopts->vacuum_cleanup_index_scale_factor + : vacuum_cleanup_index_scale_factor; + prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + + if (cleanup_scale_factor <= 0 || + prev_num_heap_tuples <= 0 || + (info->num_heap_tuples - prev_num_heap_tuples) / + prev_num_heap_tuples >= cleanup_scale_factor) + result = true; + } + + _bt_relbuf(info->index, metabuf); + return result; +} + /* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells diff --git a/src/backend/cdb/dispatcher/cdbdispatchresult.c b/src/backend/cdb/dispatcher/cdbdispatchresult.c index 09d712a7304..37fead28981 100644 --- a/src/backend/cdb/dispatcher/cdbdispatchresult.c +++ b/src/backend/cdb/dispatcher/cdbdispatchresult.c @@ -821,6 +821,8 @@ cdbdisp_returnResults(CdbDispatchResults *primaryResults, CdbPgResults *cdb_pgre /* tell the caller how many sets we're returning. 
*/ cdb_pgresults->numResults = nresults; + /* set the number of dispatched QE */ + cdb_pgresults->numDispatches = primaryResults->resultCount; } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index d8cb3975af5..0e1f84cf118 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -78,7 +78,8 @@ typedef struct VacuumStatsContext { - List *updated_stats; + List *updated_stats; + int nsegs; } VacuumStatsContext; /* @@ -122,7 +123,7 @@ static void dispatchVacuum(VacuumParams *params, Oid relid, static List *vacuum_params_to_options_list(VacuumParams *params); static void vacuum_combine_stats(VacuumStatsContext *stats_context, CdbPgResults *cdb_pgresults); -static void vac_update_relstats_from_list(List *updated_stats); +static void vac_update_relstats_from_list(VacuumStatsContext *stats_context); /* * Primary entry point for manual VACUUM and ANALYZE commands @@ -2687,7 +2688,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, stats_context.updated_stats = NIL; dispatchVacuum(params, relid, &stats_context); - vac_update_relstats_from_list(stats_context.updated_stats); + vac_update_relstats_from_list(&stats_context); /* Also update pg_stat_last_operation */ if (IsAutoVacuumWorkerProcess()) @@ -3081,6 +3082,8 @@ vacuum_combine_stats(VacuumStatsContext *stats_context, CdbPgResults *cdb_pgresu if (cdb_pgresults == NULL || cdb_pgresults->numResults <= 0) return; + stats_context->nsegs = cdb_pgresults->numDispatches; + /* * Process the dispatch results from the primary. 
Note that the QE * processes also send back the new stats info, such as stats on @@ -3093,9 +3096,7 @@ vacuum_combine_stats(VacuumStatsContext *stats_context, CdbPgResults *cdb_pgresu * */ for(result_no = 0; result_no < cdb_pgresults->numResults; result_no++) - { - VPgClassStats *pgclass_stats = NULL; - ListCell *lc = NULL; + { struct pg_result *pgresult = cdb_pgresults->pg_results[result_no]; if (pgresult->extras == NULL || pgresult->extraType != PGExtraTypeVacuumStats) @@ -3107,30 +3108,38 @@ vacuum_combine_stats(VacuumStatsContext *stats_context, CdbPgResults *cdb_pgresu * Process the stats for pg_class. We simply compute the maximum * number of rel_tuples and rel_pages. */ - pgclass_stats = (VPgClassStats *) pgresult->extras; + VPgClassStatsCombo *pgclass_stats_combo = (VPgClassStatsCombo *) pgresult->extras; + ListCell *lc = NULL; + foreach (lc, stats_context->updated_stats) { - VPgClassStats *tmp_stats = (VPgClassStats *) lfirst(lc); + VPgClassStatsCombo *tmp_stats_combo = (VPgClassStatsCombo *) lfirst(lc); - if (tmp_stats->relid == pgclass_stats->relid) + if (tmp_stats_combo->relid == pgclass_stats_combo->relid) { - tmp_stats->rel_pages += pgclass_stats->rel_pages; - tmp_stats->rel_tuples += pgclass_stats->rel_tuples; - tmp_stats->relallvisible += pgclass_stats->relallvisible; + tmp_stats_combo->rel_pages += pgclass_stats_combo->rel_pages; + tmp_stats_combo->rel_tuples += pgclass_stats_combo->rel_tuples; + tmp_stats_combo->relallvisible += pgclass_stats_combo->relallvisible; + /* + * Accumulate the number of QEs, assuming sending only once + * per QE for each relid in the VACUUM scenario. 
+ */ + tmp_stats_combo->count++; break; } } - if (lc == NULL) + if (lc == NULL) /* get the first stats result of the current relid */ { Assert(pgresult->extraslen == sizeof(VPgClassStats)); old_context = MemoryContextSwitchTo(vac_context); - pgclass_stats = palloc(sizeof(VPgClassStats)); - memcpy(pgclass_stats, pgresult->extras, pgresult->extraslen); + pgclass_stats_combo = palloc(sizeof(VPgClassStatsCombo)); + memcpy(pgclass_stats_combo, pgresult->extras, pgresult->extraslen); + pgclass_stats_combo->count = 1; stats_context->updated_stats = - lappend(stats_context->updated_stats, pgclass_stats); + lappend(stats_context->updated_stats, pgclass_stats_combo); MemoryContextSwitchTo(old_context); } } @@ -3140,8 +3149,9 @@ vacuum_combine_stats(VacuumStatsContext *stats_context, CdbPgResults *cdb_pgresu * Update relpages/reltuples of all the relations in the list. */ static void -vac_update_relstats_from_list(List *updated_stats) +vac_update_relstats_from_list(VacuumStatsContext *stats_context) { + List *updated_stats = stats_context->updated_stats; ListCell *lc; /* @@ -3152,7 +3162,7 @@ vac_update_relstats_from_list(List *updated_stats) foreach (lc, updated_stats) { - VPgClassStats *stats = (VPgClassStats *) lfirst(lc); + VPgClassStatsCombo *stats = (VPgClassStatsCombo *) lfirst(lc); Relation rel; int16 ao_segfile_count = 0; @@ -3160,6 +3170,8 @@ vac_update_relstats_from_list(List *updated_stats) if (GpPolicyIsReplicated(rel->rd_cdbpolicy)) { + Assert(stats->count == rel->rd_cdbpolicy->numsegments); + stats->rel_pages = stats->rel_pages / rel->rd_cdbpolicy->numsegments; stats->rel_tuples = stats->rel_tuples / rel->rd_cdbpolicy->numsegments; stats->relallvisible = stats->relallvisible / rel->rd_cdbpolicy->numsegments; @@ -3201,17 +3213,55 @@ vac_update_relstats_from_list(List *updated_stats) } /* - * Pass 'false' for isvacuum, so that the stats are - * actually updated. 
+ * Update QD stats only when receiving all dispatched QEs' stats, to + * avoid being overwritten by a partial accumulated value (i.e., index->reltuples) + * in case when not receiving all QEs' stats. */ - vac_update_relstats(rel, - stats->rel_pages, stats->rel_tuples, - stats->relallvisible, - rel->rd_rel->relhasindex, - InvalidTransactionId, - InvalidMultiXactId, - false, - false /* isvacuum */); + if (stats_context->nsegs > 0 && stats->count == stats_context->nsegs) + { + /* + * Pass 'false' for isvacuum, so that the stats are + * actually updated. + */ + vac_update_relstats(rel, + stats->rel_pages, stats->rel_tuples, + stats->relallvisible, + rel->rd_rel->relhasindex, + InvalidTransactionId, + InvalidMultiXactId, + false, + false /* isvacuum */); + } + else + { + /* + * We do have chance to enter this branch in the case when in compact phase. + * For example, in compact phase, some QEs may need to drop dead segfiles, + * while others may not. Only the QEs which dropping dead segfiles could go to + * vacuum indexes path then update and send the statistics to QD, QD just + * collected part of QEs' stats hence should not be as the final result to + * overwrite QD's stats. + * + * One may think why not having the stats update only happens in the final + * phase (POST_CLEANUP_PHASE), yes that's an alternative to get a final stats + * accurately for QD. + * + * Given the AO/CO VACUUM is a multi-phases process which may have an interval + * between each phase. In real circumstance, concurrent VACUUM is mostly a heavy + * job and this interval could get longer than normal cases, hence it seems + * better to collect and update QD's stats timely. So current strategy is, QD always + * collect QE's stats across phases, once we collected the expected number (means + * same as dispatched QE number) of QE's stats, we update QD's stats subsequently, + * instead of updating at the final phase. 
+ * + * Set the logging level to LOG as skipping sending stats here is not considered as + * a real issue, displaying it in log may be helpful to hint. + */ + elog(LOG, "Vacuum update stats oid=%u pages=%d tuples=%f was skipped because " + "collected segment number %d didn't match the expected %d.", stats->relid, + stats->rel_pages, stats->rel_tuples, stats->count, stats_context->nsegs); + } + relation_close(rel, AccessShareLock); } } @@ -3277,3 +3327,18 @@ vacuumStatement_IsTemporary(Relation onerel) bTemp = isAnyTempNamespace(RelationGetNamespace(onerel)); return bTemp; } + +/* + * GPDB: Check whether needs to update or send stats from QE to QD. + * This is GPDB specific check in vacuum-index scenario for collecting + * QEs' stats (such as index->relpages and index->reltuples) on QD. + * GPDB needs accumulating all QEs' stats for updating corresponding + * statistics into QD's pg_class correctly. So if current instance is + * acting as QE, it should scan and send its current stats to QD instead + * of skipping them for cost saving. 
+ */ +bool +gp_vacuum_needs_update_stats(void) +{ + return (Gp_role == GP_ROLE_EXECUTE); +} diff --git a/src/backend/commands/vacuum_ao.c b/src/backend/commands/vacuum_ao.c index 17c4d7826f2..531d0eade4a 100644 --- a/src/backend/commands/vacuum_ao.c +++ b/src/backend/commands/vacuum_ao.c @@ -145,7 +145,7 @@ static void vacuum_appendonly_index(Relation indexRelation, - double rel_tuple_count, + Relation aoRelation, Bitmapset *dead_segs, int elevel, BufferAccessStrategy bstrategy); @@ -491,7 +491,10 @@ vacuum_appendonly_indexes(Relation aoRelation, int options, Bitmapset *dead_segs { for (i = 0; i < nindexes; i++) { - scan_index(Irel[i], aoRelation , elevel, bstrategy); + scan_index(Irel[i], + aoRelation, + elevel, + bstrategy); } } else @@ -499,7 +502,7 @@ vacuum_appendonly_indexes(Relation aoRelation, int options, Bitmapset *dead_segs for (i = 0; i < nindexes; i++) { vacuum_appendonly_index(Irel[i], - aoRelation->rd_rel->reltuples, + aoRelation, dead_segs, elevel, bstrategy); @@ -516,11 +519,10 @@ vacuum_appendonly_indexes(Relation aoRelation, int options, Bitmapset *dead_segs * * This is called after an append-only segment file compaction to move * all tuples from the compacted segment files. - * The segmentFileList is an */ static void vacuum_appendonly_index(Relation indexRelation, - double rel_tuple_count, + Relation aoRelation, Bitmapset *dead_segs, int elevel, BufferAccessStrategy bstrategy) @@ -534,8 +536,14 @@ vacuum_appendonly_index(Relation indexRelation, pg_rusage_init(&ru0); ivinfo.index = indexRelation; + ivinfo.analyze_only = false; ivinfo.message_level = elevel; - ivinfo.num_heap_tuples = rel_tuple_count; + /* + * We can only provide the AO rel's reltuples as an estimate + * (similar to heapam. See: lazy_vacuum_index()). 
+ */ + ivinfo.num_heap_tuples = aoRelation->rd_rel->reltuples; + ivinfo.estimated_count = true; ivinfo.strategy = bstrategy; /* Do bulk deletion */ @@ -680,9 +688,7 @@ vacuum_appendonly_fill_stats(Relation aorel, Snapshot snapshot, int elevel, * We use this when we have no deletions to do. */ void -scan_index(Relation indrel, - Relation aorel, - int elevel, BufferAccessStrategy vac_strategy) +scan_index(Relation indrel, Relation aorel, int elevel, BufferAccessStrategy vac_strategy) { IndexBulkDeleteResult *stats; IndexVacuumInfo ivinfo; @@ -692,9 +698,13 @@ scan_index(Relation indrel, ivinfo.index = indrel; ivinfo.analyze_only = false; - ivinfo.estimated_count = false; ivinfo.message_level = elevel; + /* + * We can only provide the AO rel's reltuples as an estimate + * (similar to heapam. See: lazy_vacuum_index()). + */ ivinfo.num_heap_tuples = aorel->rd_rel->reltuples; + ivinfo.estimated_count = true; ivinfo.strategy = vac_strategy; diff --git a/src/include/cdb/cdbdispatchresult.h b/src/include/cdb/cdbdispatchresult.h index f03bce05d12..efbce12bdd7 100644 --- a/src/include/cdb/cdbdispatchresult.h +++ b/src/include/cdb/cdbdispatchresult.h @@ -29,6 +29,7 @@ typedef struct CdbPgResults { struct pg_result **pg_results; int numResults; + int numDispatches; /* the number of dispatched QE, from CdbDispatchResults.resultCount */ }CdbPgResults; /* diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 3fba8197b95..67f41613ad2 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -258,6 +258,16 @@ typedef struct VPgClassStats BlockNumber relallvisible; } VPgClassStats; +typedef struct VPgClassStatsCombo +{ + Oid relid; + BlockNumber rel_pages; + double rel_tuples; + BlockNumber relallvisible; + + int count; /* expect to equal to the number of dispatched segments */ +} VPgClassStatsCombo; + /* * Parameters customizing behavior of VACUUM and ANALYZE. 
* @@ -386,6 +396,7 @@ extern void analyze_rel(Oid relid, RangeVar *relation, extern void lazy_vacuum_rel_heap(Relation onerel, VacuumParams *params, BufferAccessStrategy bstrategy); extern void scan_index(Relation indrel, Relation aorel, int elevel, BufferAccessStrategy bstrategy); + /* in commands/vacuum_ao.c */ extern void ao_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy); @@ -407,4 +418,6 @@ extern int acquire_inherited_sample_rows(Relation onerel, int elevel, extern Datum gp_acquire_sample_rows(PG_FUNCTION_ARGS); extern Oid gp_acquire_sample_rows_col_type(Oid typid); +extern bool gp_vacuum_needs_update_stats(void); + #endif /* VACUUM_H */ diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 1165fa5efa1..322a9187b32 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -335,18 +335,112 @@ VACUUM delete_test_table; -- The vacuum above should've turned the leaf page into a fast root. We just -- need to insert some rows to cause the fast root page to split. INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i; --- Test unsupported btree opclass parameters -create index on btree_tall_tbl (id int4_ops(foo=1)); -ERROR: operator class int4_ops has no options --- Test case of ALTER INDEX with abuse of column names for indexes. --- This grammar is not officially supported, but the parser allows it. 
-CREATE INDEX btree_tall_idx2 ON btree_tall_tbl (id); -ALTER INDEX btree_tall_idx2 ALTER COLUMN id SET (n_distinct=100); -ERROR: "btree_tall_idx2" is not a table, materialized view, or foreign table -DROP INDEX btree_tall_idx2; --- Partitioned index -CREATE TABLE btree_part (id int4) PARTITION BY RANGE (id); -CREATE INDEX btree_part_idx ON btree_part(id); -ALTER INDEX btree_part_idx ALTER COLUMN id SET (n_distinct=100); -ERROR: "btree_part_idx" is not a table, materialized view, or foreign table -DROP TABLE btree_part; +-- +-- GPDB: Test correctness of B-tree stats in consecutively VACUUM. +-- +CREATE TABLE btree_stats_tbl(col_int int, col_text text, col_numeric numeric, col_unq int) DISTRIBUTED BY (col_int); +CREATE INDEX btree_stats_idx ON btree_stats_tbl(col_int); +INSERT INTO btree_stats_tbl VALUES (1, 'aa', 1001, 101), (2, 'bb', 1002, 102); +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; + reltuples +----------- + 0 +(1 row) + +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | btree_stats_idx | 0 + 1 | btree_stats_idx | 0 + 2 | btree_stats_idx | 0 +(3 rows) + +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; + reltuples +----------- + 0 +(1 row) + +-- 1st VACUUM, expect reltuples = 2 +vacuum btree_stats_tbl; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | btree_stats_idx | 1 + 1 | btree_stats_idx | 1 + 2 | btree_stats_idx | 0 +(3 rows) + +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; + reltuples +----------- + 2 +(1 row) 
+ +-- 2nd VACUUM, expect reltuples = 2 +vacuum btree_stats_tbl; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | btree_stats_idx | 1 + 1 | btree_stats_idx | 1 + 2 | btree_stats_idx | 0 +(3 rows) + +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; + reltuples +----------- + 2 +(1 row) + +-- Prior to this fix, the case would be failed here. Given the +-- scenario of updating stats during VACUUM: +-- 1) coordinator vacuums and updates stats of its own; +-- 2) then coordinator dispatches vacuum to segments; +-- 3) coordinator combines stats received from segments to overwrite the stats of its own. +-- Because upstream introduced a feature which could skip full index scan uring cleanup +-- of B-tree indexes when possible (refer to: +-- https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), +-- there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and +-- stop updating statistics, result in QD being unable to collect all QEs' stats thus overwrote +-- a paritial accumulated value to index->reltuples. More interesting, it usually happened starting +-- from the 3rd time of consecutively VACUUM after fresh inserts due to above skipping index scan +-- criteria. 
+-- 3rd VACUUM, expect reltuples = 2 +vacuum btree_stats_tbl; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | btree_stats_idx | 1 + 1 | btree_stats_idx | 1 + 2 | btree_stats_idx | 0 +(3 rows) + +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; + reltuples +----------- + 2 +(1 row) + diff --git a/src/test/regress/expected/uao_compaction/index_stats.out b/src/test/regress/expected/uao_compaction/index_stats.out index 1c32cfd8caa..afdad152b1d 100644 --- a/src/test/regress/expected/uao_compaction/index_stats.out +++ b/src/test/regress/expected/uao_compaction/index_stats.out @@ -40,4 +40,164 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab_int_idx1'; mytab_int_idx1 | 2 (1 row) --- end_ignore \ No newline at end of file +-- Test to ensure that reltuples is updated for an index after lazy vacuum. +-- This is vital as most index AMs that depend on this tuple count (eg btree, bitmap etc) +-- which is passed up from the table AM during lazy vacuum. 
+-- create a fresh table for the test +CREATE TABLE mytab2( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true) DISTRIBUTED BY (col_int); +create index mytab2_int_idx1 on mytab2 using bitmap(col_int); +insert into mytab2 values(1,'aa',1001,101),(2,'bb',1002,102); +SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab2_int_idx1'; + relname | reltuples +-----------------+----------- + mytab2_int_idx1 | 0 +(1 row) + +-- first vacuum collect table stat on segments +vacuum mytab2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'mytab2_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | mytab2_int_idx1 | 0 + 1 | mytab2_int_idx1 | 0 + 2 | mytab2_int_idx1 | 0 +(3 rows) + +-- second vacuum update index stat with table stat +vacuum mytab2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'mytab2_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | mytab2_int_idx1 | 1 + 1 | mytab2_int_idx1 | 1 + 2 | mytab2_int_idx1 | 0 +(3 rows) + +SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab2_int_idx1'; + relname | reltuples +-----------------+----------- + mytab2_int_idx1 | 2 +(1 row) + +-- Test correctness of index->reltuples in consecutively VACUUM. 
+CREATE TABLE mytab3( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true) DISTRIBUTED BY (col_int); +create index mytab3_int_idx1 on mytab3(col_int); +insert into mytab3 values(1,'aa',1001,101),(2,'bb',1002,102); +select reltuples from pg_class where relname='mytab3'; + reltuples +----------- + 0 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | mytab3_int_idx1 | 0 + 1 | mytab3_int_idx1 | 0 + 2 | mytab3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='mytab3_int_idx1'; + reltuples +----------- + 0 +(1 row) + +-- 1st VACUUM, expect reltuples = 2 +vacuum mytab3; +select reltuples from pg_class where relname='mytab3'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | mytab3_int_idx1 | 1 + 1 | mytab3_int_idx1 | 1 + 2 | mytab3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='mytab3_int_idx1'; + reltuples +----------- + 2 +(1 row) + +-- 2nd VACUUM, expect reltuples = 2 +vacuum mytab3; +select reltuples from pg_class where relname='mytab3'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | mytab3_int_idx1 | 1 + 1 | mytab3_int_idx1 | 1 + 2 | mytab3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='mytab3_int_idx1'; + reltuples +----------- + 2 +(1 row) + +-- Prior to this fix, the case would be 
failed here. Given the +-- scenario of updating stats during VACUUM: +-- 1) coordinator vacuums and updates stats of its own; +-- 2) then coordinator dispatches vacuum to segments; +-- 3) coordinator combines stats received from segments to overwrite the stats of its own. +-- Because upstream introduced a feature which could skip full index scan uring cleanup +-- of B-tree indexes when possible (refer to: +-- https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), +-- there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and +-- stop updating statistics, result in QD being unable to collect all QEs' stats thus overwrote +-- a paritial accumulated value to index->reltuples. More interesting, it usually happened starting +-- from the 3rd time of consecutively VACUUM after fresh inserts due to above skipping index scan +-- criteria. +-- 3rd VACUUM, expect reltuples = 2 +vacuum mytab3; +select reltuples from pg_class where relname='mytab3'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------+----------- + 0 | mytab3_int_idx1 | 1 + 1 | mytab3_int_idx1 | 1 + 2 | mytab3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='mytab3_int_idx1'; + reltuples +----------- + 2 +(1 row) + +drop table mytab; +drop table mytab2; +drop table mytab3; diff --git a/src/test/regress/expected/uaocs_compaction/index_stats.out b/src/test/regress/expected/uaocs_compaction/index_stats.out index adc286c0811..7ada461449a 100644 --- a/src/test/regress/expected/uaocs_compaction/index_stats.out +++ b/src/test/regress/expected/uaocs_compaction/index_stats.out @@ -45,4 +45,164 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats_int_i uaocs_index_stats_int_idx1 | 2 (1 row) --- 
end_ignore \ No newline at end of file +-- Test to ensure that reltuples is updated for an index after lazy vacuum. +-- This is vital as most index AMs that depend on this tuple count (eg btree, bitmap etc) +-- which is passed up from the table AM during lazy vacuum. +-- create a fresh table for the test +CREATE TABLE uaocs_index_stats2( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true, orientation=column) DISTRIBUTED BY (col_int); +create index uaocs_index_stats2_int_idx1 on uaocs_index_stats2 using bitmap(col_int); +insert into uaocs_index_stats2 values(1,'aa',1001,101),(2,'bb',1002,102); +SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats2_int_idx1'; + relname | reltuples +-----------------------------+----------- + uaocs_index_stats2_int_idx1 | 0 +(1 row) + +-- first vacuum collect table stat on segments +vacuum uaocs_index_stats2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'uaocs_index_stats2_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------------------+----------- + 0 | uaocs_index_stats2_int_idx1 | 0 + 1 | uaocs_index_stats2_int_idx1 | 0 + 2 | uaocs_index_stats2_int_idx1 | 0 +(3 rows) + +-- second vacuum update index stat with table stat +vacuum uaocs_index_stats2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'uaocs_index_stats2_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------------------+----------- + 0 | uaocs_index_stats2_int_idx1 | 1 + 1 | uaocs_index_stats2_int_idx1 | 1 + 2 | uaocs_index_stats2_int_idx1 | 0 +(3 rows) + +SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats2_int_idx1'; + relname | reltuples +-----------------------------+----------- + uaocs_index_stats2_int_idx1 | 2 +(1 row) + +-- Test correctness 
of index->reltuples in consecutively VACUUM. +CREATE TABLE uaocs_index_stats3( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true, orientation=column) DISTRIBUTED BY (col_int); +create index uaocs_index_stats3_int_idx1 on uaocs_index_stats3(col_int); +insert into uaocs_index_stats3 values(1,'aa',1001,101),(2,'bb',1002,102); +select reltuples from pg_class where relname='uaocs_index_stats3'; + reltuples +----------- + 0 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------------------+----------- + 0 | uaocs_index_stats3_int_idx1 | 0 + 1 | uaocs_index_stats3_int_idx1 | 0 + 2 | uaocs_index_stats3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; + reltuples +----------- + 0 +(1 row) + +-- 1st VACUUM, expect reltuples = 2 +vacuum uaocs_index_stats3; +select reltuples from pg_class where relname='uaocs_index_stats'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------------------+----------- + 0 | uaocs_index_stats3_int_idx1 | 1 + 1 | uaocs_index_stats3_int_idx1 | 1 + 2 | uaocs_index_stats3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; + reltuples +----------- + 2 +(1 row) + +-- 2nd VACUUM, expect reltuples = 2 +vacuum uaocs_index_stats3; +select reltuples from pg_class where relname='uaocs_index_stats3'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 
'uaocs_index_stats3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------------------+----------- + 0 | uaocs_index_stats3_int_idx1 | 1 + 1 | uaocs_index_stats3_int_idx1 | 1 + 2 | uaocs_index_stats3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; + reltuples +----------- + 2 +(1 row) + +-- Prior to this fix, the case would fail here. Given the +-- scenario of updating stats during VACUUM: +-- 1) coordinator vacuums and updates stats of its own; +-- 2) then coordinator dispatches vacuum to segments; +-- 3) coordinator combines stats received from segments to overwrite the stats of its own. +-- Because upstream introduced a feature which could skip full index scan during cleanup +-- of B-tree indexes when possible (refer to: +-- https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), +-- there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and +-- stop updating statistics, resulting in QD being unable to collect all QEs' stats thus overwrote +-- a partial accumulated value to index->reltuples. More interesting, it usually happened starting +-- from the 3rd time of consecutively VACUUM after fresh inserts due to above skipping index scan +-- criteria.
+-- 3rd VACUUM, expect reltuples = 2 +vacuum uaocs_index_stats3; +select reltuples from pg_class where relname='uaocs_index_stats3'; + reltuples +----------- + 2 +(1 row) + +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; + gp_segment_id | relname | reltuples +---------------+-----------------------------+----------- + 0 | uaocs_index_stats3_int_idx1 | 1 + 1 | uaocs_index_stats3_int_idx1 | 1 + 2 | uaocs_index_stats3_int_idx1 | 0 +(3 rows) + +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; + reltuples +----------- + 2 +(1 row) + +drop table uaocs_index_stats; +drop table uaocs_index_stats2; +drop table uaocs_index_stats3; diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 64e158fafc5..a771eb62b8b 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -175,16 +175,45 @@ VACUUM delete_test_table; -- need to insert some rows to cause the fast root page to split. INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i; --- Test unsupported btree opclass parameters -create index on btree_tall_tbl (id int4_ops(foo=1)); - --- Test case of ALTER INDEX with abuse of column names for indexes. --- This grammar is not officially supported, but the parser allows it. -CREATE INDEX btree_tall_idx2 ON btree_tall_tbl (id); -ALTER INDEX btree_tall_idx2 ALTER COLUMN id SET (n_distinct=100); -DROP INDEX btree_tall_idx2; --- Partitioned index -CREATE TABLE btree_part (id int4) PARTITION BY RANGE (id); -CREATE INDEX btree_part_idx ON btree_part(id); -ALTER INDEX btree_part_idx ALTER COLUMN id SET (n_distinct=100); -DROP TABLE btree_part; +-- +-- GPDB: Test correctness of B-tree stats in consecutively VACUUM. 
+-- +CREATE TABLE btree_stats_tbl(col_int int, col_text text, col_numeric numeric, col_unq int) DISTRIBUTED BY (col_int); +CREATE INDEX btree_stats_idx ON btree_stats_tbl(col_int); +INSERT INTO btree_stats_tbl VALUES (1, 'aa', 1001, 101), (2, 'bb', 1002, 102); +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; +-- 1st VACUUM, expect reltuples = 2 +vacuum btree_stats_tbl; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; +-- 2nd VACUUM, expect reltuples = 2 +vacuum btree_stats_tbl; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; + +-- Prior to this fix, the case would be failed here. Given the +-- scenario of updating stats during VACUUM: +-- 1) coordinator vacuums and updates stats of its own; +-- 2) then coordinator dispatches vacuum to segments; +-- 3) coordinator combines stats received from segments to overwrite the stats of its own. 
+-- Because upstream introduced a feature which could skip full index scan during cleanup +-- of B-tree indexes when possible (refer to: +-- https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), +-- there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and +-- stop updating statistics, resulting in QD being unable to collect all QEs' stats thus overwrote +-- a partial accumulated value to index->reltuples. More interesting, it usually happened starting +-- from the 3rd time of consecutively VACUUM after fresh inserts due to above skipping index scan +-- criteria. +-- 3rd VACUUM, expect reltuples = 2 +vacuum btree_stats_tbl; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'btree_stats_idx'; +SELECT reltuples FROM pg_class WHERE relname='btree_stats_idx'; diff --git a/src/test/regress/sql/uao_compaction/index_stats.sql b/src/test/regress/sql/uao_compaction/index_stats.sql index 05ad79953ef..f88e000e707 100644 --- a/src/test/regress/sql/uao_compaction/index_stats.sql +++ b/src/test/regress/sql/uao_compaction/index_stats.sql @@ -23,4 +23,83 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab'; -- for reltuples to coordinate with the new behavior. -- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab_int_idx1'; --- end_ignore + +-- Test to ensure that reltuples is updated for an index after lazy vacuum. +-- This is vital as most index AMs that depend on this tuple count (eg btree, bitmap etc) +-- which is passed up from the table AM during lazy vacuum.
+-- create a fresh table for the test +CREATE TABLE mytab2( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true) DISTRIBUTED BY (col_int); + +create index mytab2_int_idx1 on mytab2 using bitmap(col_int); + +insert into mytab2 values(1,'aa',1001,101),(2,'bb',1002,102); + +SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab2_int_idx1'; + +-- first vacuum collect table stat on segments +vacuum mytab2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'mytab2_int_idx1'; +-- second vacuum update index stat with table stat +vacuum mytab2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'mytab2_int_idx1'; +SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab2_int_idx1'; + +-- Test correctness of index->reltuples in consecutively VACUUM. +CREATE TABLE mytab3( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true) DISTRIBUTED BY (col_int); + +create index mytab3_int_idx1 on mytab3(col_int); + +insert into mytab3 values(1,'aa',1001,101),(2,'bb',1002,102); + +select reltuples from pg_class where relname='mytab3'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; +select reltuples from pg_class where relname='mytab3_int_idx1'; +-- 1st VACUUM, expect reltuples = 2 +vacuum mytab3; +select reltuples from pg_class where relname='mytab3'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; +select reltuples from pg_class where relname='mytab3_int_idx1'; +-- 2nd VACUUM, expect reltuples = 2 +vacuum mytab3; +select reltuples from pg_class where relname='mytab3'; +-- inspect the state of the stats on 
segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; +select reltuples from pg_class where relname='mytab3_int_idx1'; + +-- Prior to this fix, the case would fail here. Given the +-- scenario of updating stats during VACUUM: +-- 1) coordinator vacuums and updates stats of its own; +-- 2) then coordinator dispatches vacuum to segments; +-- 3) coordinator combines stats received from segments to overwrite the stats of its own. +-- Because upstream introduced a feature which could skip full index scan during cleanup +-- of B-tree indexes when possible (refer to: +-- https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), +-- there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and +-- stop updating statistics, resulting in QD being unable to collect all QEs' stats thus overwrote +-- a partial accumulated value to index->reltuples. More interesting, it usually happened starting +-- from the 3rd time of consecutively VACUUM after fresh inserts due to above skipping index scan +-- criteria. +-- 3rd VACUUM, expect reltuples = 2 +vacuum mytab3; +select reltuples from pg_class where relname='mytab3'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'mytab3_int_idx1'; +select reltuples from pg_class where relname='mytab3_int_idx1'; + +drop table mytab; +drop table mytab2; +drop table mytab3; diff --git a/src/test/regress/sql/uaocs_compaction/index_stats.sql b/src/test/regress/sql/uaocs_compaction/index_stats.sql index c93ee37f686..d87b41a8b45 100644 --- a/src/test/regress/sql/uaocs_compaction/index_stats.sql +++ b/src/test/regress/sql/uaocs_compaction/index_stats.sql @@ -23,4 +23,83 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats'; -- for reltuples to coordinate with the new behavior.
-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats_int_idx1'; --- end_ignore + +-- Test to ensure that reltuples is updated for an index after lazy vacuum. +-- This is vital as most index AMs that depend on this tuple count (eg btree, bitmap etc) +-- which is passed up from the table AM during lazy vacuum. +-- create a fresh table for the test +CREATE TABLE uaocs_index_stats2( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true, orientation=column) DISTRIBUTED BY (col_int); + +create index uaocs_index_stats2_int_idx1 on uaocs_index_stats2 using bitmap(col_int); + +insert into uaocs_index_stats2 values(1,'aa',1001,101),(2,'bb',1002,102); + +SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats2_int_idx1'; + +-- first vacuum collect table stat on segments +vacuum uaocs_index_stats2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'uaocs_index_stats2_int_idx1'; +-- second vacuum update index stat with table stat +vacuum uaocs_index_stats2; +-- inspect the state of the stats on segments +SELECT gp_segment_id, relname, reltuples FROM gp_dist_random('pg_class') WHERE relname = 'uaocs_index_stats2_int_idx1'; +SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats2_int_idx1'; + +-- Test correctness of index->reltuples in consecutively VACUUM. 
+CREATE TABLE uaocs_index_stats3( + col_int int, + col_text text, + col_numeric numeric, + col_unq int + ) with(appendonly=true, orientation=column) DISTRIBUTED BY (col_int); + +create index uaocs_index_stats3_int_idx1 on uaocs_index_stats3(col_int); + +insert into uaocs_index_stats3 values(1,'aa',1001,101),(2,'bb',1002,102); + +select reltuples from pg_class where relname='uaocs_index_stats3'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; +-- 1st VACUUM, expect reltuples = 2 +vacuum uaocs_index_stats3; +select reltuples from pg_class where relname='uaocs_index_stats'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; +-- 2nd VACUUM, expect reltuples = 2 +vacuum uaocs_index_stats3; +select reltuples from pg_class where relname='uaocs_index_stats3'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; + +-- Prior to this fix, the case would be failed here. Given the +-- scenario of updating stats during VACUUM: +-- 1) coordinator vacuums and updates stats of its own; +-- 2) then coordinator dispatches vacuum to segments; +-- 3) coordinator combines stats received from segments to overwrite the stats of its own. 
+-- Because upstream introduced a feature which could skip full index scan during cleanup +-- of B-tree indexes when possible (refer to: +-- https://github.com/postgres/postgres/commit/857f9c36cda520030381bd8c2af20adf0ce0e1d4), +-- there was a case in QD-QEs distributed deployment that some QEs could skip full index scan and +-- stop updating statistics, resulting in QD being unable to collect all QEs' stats thus overwrote +-- a partial accumulated value to index->reltuples. More interesting, it usually happened starting +-- from the 3rd time of consecutively VACUUM after fresh inserts due to above skipping index scan +-- criteria. +-- 3rd VACUUM, expect reltuples = 2 +vacuum uaocs_index_stats3; +select reltuples from pg_class where relname='uaocs_index_stats3'; +-- inspect the state of the stats on segments +select gp_segment_id, relname, reltuples from gp_dist_random('pg_class') where relname = 'uaocs_index_stats3_int_idx1'; +select reltuples from pg_class where relname='uaocs_index_stats3_int_idx1'; + +drop table uaocs_index_stats; +drop table uaocs_index_stats2; +drop table uaocs_index_stats3; From 28060dd5a2ca10129d59fcd81f596ad6e17a1fcc Mon Sep 17 00:00:00 2001 From: liushengsong Date: Fri, 22 Sep 2023 11:12:20 +0800 Subject: [PATCH 2/2] Fix conflict in _bt_vacuum_needs_cleanup and tests There is conflict in _bt_vacuum_needs_cleanup between CloudberryDB and GPDB. We update the _bt_vacuum_needs_cleanup in CloudberryDB. We enforce update stats when Gp_role is GP_ROLE_EXECUTE, and set estimated_count to false. which in the scan_index We call vac_update_relstats to send back stats to QE. Fix several tests due to update stats after vacuum. We keep these tests result same with GPDB.
--- src/backend/access/nbtree/nbtpage.c | 5 ++ src/backend/access/nbtree/nbtree.c | 70 ------------------- src/test/regress/expected/btree_index.out | 2 +- src/test/regress/expected/join.out | 4 +- src/test/regress/expected/memoize.out | 64 +++++++++-------- .../regress/expected/memoize_optimizer.out | 36 +++++----- src/test/regress/expected/misc_functions.out | 15 ++-- src/test/regress/expected/select_parallel.out | 6 +- 8 files changed, 68 insertions(+), 134 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index ebec8fa5b89..901573fa433 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -36,6 +36,7 @@ #include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/snapmgr.h" +#include "commands/vacuum.h" static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, @@ -186,6 +187,10 @@ _bt_vacuum_needs_cleanup(Relation rel) uint32 btm_version; BlockNumber prev_num_delpages; + /* Return true directly on QE for stats collection from QD. */ + if (gp_vacuum_needs_update_stats()) + return true; + /* * Copy details from metapage to local variables quickly. * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 46bc034fd2f..5e3a2b7aa47 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -877,75 +877,6 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) SpinLockRelease(&btscan->btps_mutex); } -/* - * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that - * btbulkdelete() wasn't called. - */ -static bool -_bt_vacuum_needs_cleanup(IndexVacuumInfo *info) -{ - Buffer metabuf; - Page metapg; - BTMetaPageData *metad; - bool result = false; - - /* Return true directly on QE for stats collection from QD. 
*/ - if (gp_vacuum_needs_update_stats()) - return true; - - metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); - metapg = BufferGetPage(metabuf); - metad = BTPageGetMeta(metapg); - - if (metad->btm_version < BTREE_NOVAC_VERSION) - { - /* - * Do cleanup if metapage needs upgrade, because we don't have - * cleanup-related meta-information yet. - */ - result = true; - } - else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && - TransactionIdPrecedes(metad->btm_oldest_btpo_xact, - RecentGlobalXmin)) - { - /* - * If oldest btpo.xact in the deleted pages is older than - * RecentGlobalXmin, then at least one deleted page can be recycled. - */ - result = true; - } - else - { - StdRdOptions *relopts; - float8 cleanup_scale_factor; - float8 prev_num_heap_tuples; - - /* - * If table receives enough insertions and no cleanup was performed, - * then index would appear have stale statistics. If scale factor is - * set, we avoid that by performing cleanup if the number of inserted - * tuples exceeds vacuum_cleanup_index_scale_factor fraction of - * original tuples count. - */ - relopts = (StdRdOptions *) info->index->rd_options; - cleanup_scale_factor = (relopts && - relopts->vacuum_cleanup_index_scale_factor >= 0) - ? relopts->vacuum_cleanup_index_scale_factor - : vacuum_cleanup_index_scale_factor; - prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; - - if (cleanup_scale_factor <= 0 || - prev_num_heap_tuples <= 0 || - (info->num_heap_tuples - prev_num_heap_tuples) / - prev_num_heap_tuples >= cleanup_scale_factor) - result = true; - } - - _bt_relbuf(info->index, metabuf); - return result; -} - /* * Bulk deletion of all index entries pointing to a set of heap tuples. 
* The set of target tuples is specified via a callback routine that tells @@ -1025,7 +956,6 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); btvacuumscan(info, stats, NULL, NULL, 0); - stats->estimated_count = true; } /* diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 322a9187b32..39fea6969cf 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -344,7 +344,7 @@ INSERT INTO btree_stats_tbl VALUES (1, 'aa', 1001, 101), (2, 'bb', 1002, 102); SELECT reltuples FROM pg_class WHERE relname='btree_stats_tbl'; reltuples ----------- - 0 + -1 (1 row) -- inspect the state of the stats on segments diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index d0f1c2c4ac7..0f1e6ae13f9 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -2469,9 +2469,7 @@ select count(*) from -> Gather Motion 3:1 (slice2; segments: 3) Merge Key: y.unique2 -> Subquery Scan on y - -> Sort - Sort Key: y_1.unique2 - -> Seq Scan on tenk1 y_1 + -> Index Scan using tenk1_unique2 on tenk1 y_1 Optimizer: Postgres query optimizer (18 rows) diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out index 7453cc83507..58c2e436571 100644 --- a/src/test/regress/expected/memoize.out +++ b/src/test/regress/expected/memoize.out @@ -39,23 +39,23 @@ SELECT explain_memoize(' SELECT COUNT(*),AVG(t1.unique1) FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.twenty WHERE t2.unique1 < 1000;', false); - explain_memoize --------------------------------------------------------------------------------------------------------- + explain_memoize +------------------------------------------------------------------------------------------------------- Finalize Aggregate (actual rows=1 loops=N) -> Gather Motion 3:1 
(slice1; segments: 3) (actual rows=3 loops=N) -> Partial Aggregate (actual rows=1 loops=N) - -> Merge Join (actual rows=400 loops=N) - Merge Cond: (t1.unique1 = t2.twenty) - -> Index Only Scan using tenk1_unique1 on tenk1 t1 (actual rows=9 loops=N) - Heap Fetches: N - -> Sort (actual rows=400 loops=N) - Sort Key: t2.twenty - Sort Method: quicksort Memory: NkB - -> Redistribute Motion 3:3 (slice2; segments: 3) (actual rows=400 loops=N) - Hash Key: t2.twenty - -> Seq Scan on tenk1 t2 (actual rows=340 loops=N) - Filter: (unique1 < 1000) - Rows Removed by Filter: 2906 + -> Nested Loop (actual rows=400 loops=N) + -> Redistribute Motion 3:3 (slice2; segments: 3) (actual rows=400 loops=N) + Hash Key: t2.twenty + -> Seq Scan on tenk1 t2 (actual rows=340 loops=N) + Filter: (unique1 < 1000) + Rows Removed by Filter: 2906 + -> Memoize (actual rows=1 loops=N) + Cache Key: t2.twenty + Cache Mode: logical + -> Index Only Scan using tenk1_unique1 on tenk1 t1 (actual rows=1 loops=N) + Index Cond: (unique1 = t2.twenty) + Heap Fetches: N Optimizer: Postgres query optimizer (16 rows) @@ -73,23 +73,23 @@ SELECT explain_memoize(' SELECT COUNT(*),AVG(t2.unique1) FROM tenk1 t1, LATERAL (SELECT t2.unique1 FROM tenk1 t2 WHERE t1.twenty = t2.unique1) t2 WHERE t1.unique1 < 1000;', false); - explain_memoize --------------------------------------------------------------------------------------------------------- + explain_memoize +------------------------------------------------------------------------------------------------------- Finalize Aggregate (actual rows=1 loops=N) -> Gather Motion 3:1 (slice1; segments: 3) (actual rows=3 loops=N) -> Partial Aggregate (actual rows=1 loops=N) - -> Merge Join (actual rows=400 loops=N) - Merge Cond: (t2.unique1 = t1.twenty) - -> Index Only Scan using tenk1_unique1 on tenk1 t2 (actual rows=9 loops=N) - Heap Fetches: N - -> Sort (actual rows=400 loops=N) - Sort Key: t1.twenty - Sort Method: quicksort Memory: NkB - -> Redistribute Motion 3:3 (slice2; 
segments: 3) (actual rows=400 loops=N) - Hash Key: t1.twenty - -> Seq Scan on tenk1 t1 (actual rows=340 loops=N) - Filter: (unique1 < 1000) - Rows Removed by Filter: 2906 + -> Nested Loop (actual rows=400 loops=N) + -> Redistribute Motion 3:3 (slice2; segments: 3) (actual rows=400 loops=N) + Hash Key: t1.twenty + -> Seq Scan on tenk1 t1 (actual rows=340 loops=N) + Filter: (unique1 < 1000) + Rows Removed by Filter: 2906 + -> Memoize (actual rows=1 loops=N) + Cache Key: t1.twenty + Cache Mode: logical + -> Index Only Scan using tenk1_unique1 on tenk1 t2 (actual rows=1 loops=N) + Index Cond: (unique1 = t1.twenty) + Heap Fetches: N Optimizer: Postgres query optimizer (16 rows) @@ -284,10 +284,12 @@ WHERE t1.unique1 < 1000; -> Hash -> Redistribute Motion 3:3 (slice2; segments: 3) Hash Key: t1.twenty - -> Seq Scan on tenk1 t1 - Filter: (unique1 < 1000) + -> Bitmap Heap Scan on tenk1 t1 + Recheck Cond: (unique1 < 1000) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 1000) Optimizer: Postgres query optimizer -(12 rows) +(14 rows) -- And ensure the parallel plan gives us the correct results. 
SELECT COUNT(*),AVG(t2.unique1) FROM tenk1 t1, diff --git a/src/test/regress/expected/memoize_optimizer.out b/src/test/regress/expected/memoize_optimizer.out index e65e98d36ca..f34752dab7e 100644 --- a/src/test/regress/expected/memoize_optimizer.out +++ b/src/test/regress/expected/memoize_optimizer.out @@ -69,23 +69,23 @@ SELECT explain_memoize(' SELECT COUNT(*),AVG(t2.unique1) FROM tenk1 t1, LATERAL (SELECT t2.unique1 FROM tenk1 t2 WHERE t1.twenty = t2.unique1) t2 WHERE t1.unique1 < 1000;', false); - explain_memoize --------------------------------------------------------------------------------------------------------- + explain_memoize +------------------------------------------------------------------------------------------------------- Finalize Aggregate (actual rows=1 loops=N) -> Gather Motion 3:1 (slice1; segments: 3) (actual rows=3 loops=N) -> Partial Aggregate (actual rows=1 loops=N) - -> Merge Join (actual rows=400 loops=N) - Merge Cond: (t2.unique1 = t1.twenty) - -> Index Only Scan using tenk1_unique1 on tenk1 t2 (actual rows=9 loops=N) - Heap Fetches: N - -> Sort (actual rows=400 loops=N) - Sort Key: t1.twenty - Sort Method: quicksort Memory: NkB - -> Redistribute Motion 3:3 (slice2; segments: 3) (actual rows=400 loops=N) - Hash Key: t1.twenty - -> Seq Scan on tenk1 t1 (actual rows=340 loops=N) - Filter: (unique1 < 1000) - Rows Removed by Filter: 2906 + -> Nested Loop (actual rows=400 loops=N) + -> Redistribute Motion 3:3 (slice2; segments: 3) (actual rows=400 loops=N) + Hash Key: t1.twenty + -> Seq Scan on tenk1 t1 (actual rows=340 loops=N) + Filter: (unique1 < 1000) + Rows Removed by Filter: 2906 + -> Memoize (actual rows=1 loops=N) + Cache Key: t1.twenty + Cache Mode: logical + -> Index Only Scan using tenk1_unique1 on tenk1 t2 (actual rows=1 loops=N) + Index Cond: (unique1 = t1.twenty) + Heap Fetches: N Optimizer: Postgres query optimizer (16 rows) @@ -276,10 +276,12 @@ WHERE t1.unique1 < 1000; -> Hash -> Redistribute Motion 3:3 (slice2; segments: 
3) Hash Key: t1.twenty - -> Seq Scan on tenk1 t1 - Filter: (unique1 < 1000) + -> Bitmap Heap Scan on tenk1 t1 + Recheck Cond: (unique1 < 1000) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 1000) Optimizer: Postgres query optimizer -(12 rows) +(14 rows) -- And ensure the parallel plan gives us the correct results. SELECT COUNT(*),AVG(t2.unique1) FROM tenk1 t1, diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index 08e4ccff440..73696351a42 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -274,14 +274,13 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,1000) g ON a.unique1 = g; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g; - QUERY PLAN ----------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) - -> Hash Join - Hash Cond: (a.unique1 = g.g) - -> Seq Scan on tenk1 a - -> Hash - -> Function Scan on my_gen_series g + -> Nested Loop + -> Function Scan on my_gen_series g + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = g.g) Optimizer: Postgres query optimizer -(7 rows) +(6 rows) diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index 951c6a5438f..180498dae04 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -651,11 +651,9 @@ explain (costs off) -> Gather Motion 3:1 (slice1; segments: 3) -> Partial Aggregate -> Merge Join - Merge Cond: (tenk2.unique1 = tenk1.unique1) + Merge Cond: (tenk1.unique1 = tenk2.unique1) + -> Index Only Scan using tenk1_unique1 on tenk1 -> Index Only Scan using tenk2_unique1 on tenk2 - -> Sort - Sort Key: tenk1.unique1 - -> Seq Scan on tenk1 Optimizer: Postgres query optimizer (10 rows)