From 7f2a746f6e8924ba40e38f3e5f3a879611c8bf00 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Thu, 25 Apr 2024 22:44:12 +0800 Subject: [PATCH 1/6] Iceberg: Retrieve row count from iceberg SnapshotSummary in case of iceberg.hive.keep.stats=false --- .../src/test/queries/positive/iceberg_stats.q | 22 +++ .../test/results/positive/iceberg_stats.q.out | 159 ++++++++++++++++++ .../hive/ql/optimizer/StatsOptimizer.java | 6 + 3 files changed, 187 insertions(+) diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q index de88018f32e0..6fc965e17456 100644 --- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q @@ -28,5 +28,27 @@ select count(*) from ice01; insert overwrite table ice01 select * from ice01; explain select count(*) from ice01; +-- false means that count(*) query won't use row count stored in HMS +set iceberg.hive.keep.stats=false; + +create external table ice03 (id int, key int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2'); + +insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1); +-- Iceberg table can utilize fetch task to directly retrieve the row count from iceberg SnapshotSummary +explain select count(*) from ice03; +select count(*) from ice03; + +-- delete some values +delete from ice03 where id in (2,4); + +explain select count(*) from ice03; +select count(*) from ice03; + +-- iow +insert overwrite table ice03 select * from ice03; +explain select count(*) from ice03; + drop table ice01; drop table ice02; +drop table ice03; diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out index 33c60b54608d..4e5b70945016 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out @@ -192,6 +192,155 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@ice03 +POSTHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ice03 +PREHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@ice03 +POSTHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@ice03 +PREHOOK: query: explain select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 +PREHOOK: query: delete from ice03 where id in (2,4) +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: default@ice03 +POSTHOOK: query: delete from ice03 where id in (2,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: default@ice03 +PREHOOK: query: explain select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: ice03 + Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.6666666 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +3 +PREHOOK: query: insert overwrite table ice03 select * from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: default@ice03 +POSTHOOK: query: insert overwrite table ice03 select * from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: default@ice03 +PREHOOK: query: explain select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + PREHOOK: query: drop table ice01 PREHOOK: type: DROPTABLE PREHOOK: Input: default@ice01 @@ -212,3 +361,13 @@ POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@ice02 POSTHOOK: Output: database:default POSTHOOK: Output: default@ice02 +PREHOOK: query: drop 
table ice03 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@ice03 +PREHOOK: Output: database:default +PREHOOK: Output: default@ice03 +POSTHOOK: query: drop table ice03 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ice03 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 38d67660c639..ecfe9f8af1ff 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.stats.Partish; import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax; @@ -943,6 +944,11 @@ private Long getRowCnt( } } else { // unpartitioned table if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { + if (MetaStoreUtils.isNonNativeTable(tbl.getTTable()) + && tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { + return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl)) + .get(StatsSetupConst.ROW_COUNT)); + } return null; } rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT)); From 1cc6c63f4f369348608a2f4c15ee77e04fbb4c77 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Fri, 26 Apr 2024 09:47:34 +0800 Subject: [PATCH 2/6] code minor optimization --- .../java/org/apache/hadoop/hive/ql/stats/StatsUtils.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 239f57b69b3e..81fde429cb31 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -2036,10 +2036,7 @@ public static Range combineRange(Range range1, Range range2) { public static boolean checkCanProvideStats(Table table) { if (MetaStoreUtils.isExternalTable(table.getTTable())) { - if (MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics()) { - return true; - } - return false; + return MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics(); } return true; } @@ -2049,7 +2046,7 @@ public static boolean checkCanProvideStats(Table table) { * Can run additional checks compared to the version in StatsSetupConst. */ public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map params) { - return checkCanProvideStats(table) == true ? StatsSetupConst.areBasicStatsUptoDate(params) : false; + return checkCanProvideStats(table) && StatsSetupConst.areBasicStatsUptoDate(params); } /** @@ -2057,7 +2054,7 @@ public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map params, String colName) { - return checkCanProvideStats(table) == true ? 
StatsSetupConst.areColumnStatsUptoDate(params, colName) : false; + return checkCanProvideStats(table) && StatsSetupConst.areColumnStatsUptoDate(params, colName); } /** From 6349d23e5f40aa396de9a71ca83a0b28ae3182f5 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Thu, 10 Oct 2024 22:39:24 -0400 Subject: [PATCH 3/6] Get stats based on specific snapshot --- .../mr/hive/HiveIcebergStorageHandler.java | 41 +++++++++++++-- .../positive/write_iceberg_branch.q.out | 50 +++++++++---------- .../hadoop/hive/ql/metadata/HiveUtils.java | 8 +-- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index 50280a8a0abe..ed487d96d301 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -25,6 +25,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.nio.ByteBuffer; +import java.time.ZoneId; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -55,6 +56,8 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.SnapshotContext; import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.common.type.TimestampTZ; +import org.apache.hadoop.hive.common.type.TimestampTZUtil; import org.apache.hadoop.hive.conf.Constants; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -158,6 +161,7 @@ import org.apache.iceberg.SchemaParser; import org.apache.iceberg.SerializableTable; import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortField; @@ -465,10 +469,11 @@ public Map getBasicStatistics(Partish partish) { org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. Table table = getTable(hmsTable); + Snapshot snapshot = getSpecificSnapshot(partish.getTable(), table); Map stats = Maps.newHashMap(); if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { - if (table.currentSnapshot() != null) { - Map summary = table.currentSnapshot().summary(); + if (snapshot != null) { + Map summary = snapshot.summary(); if (summary != null) { if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { @@ -613,8 +618,9 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); - if (table.currentSnapshot() != null) { - Map summary = table.currentSnapshot().summary(); + Snapshot snapshot = getSpecificSnapshot(hmsTable, table); + if (snapshot != null) { + Map summary = snapshot.summary(); if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { @@ -2197,4 +2203,31 @@ private static List schema(List exprs) { private static List orderBy(VirtualColumn... 
exprs) { return schema(Arrays.asList(exprs)); } + private Snapshot getSpecificSnapshot(org.apache.hadoop.hive.ql.metadata.Table hmsTable, Table table) { + String refName = HiveUtils.getTableSnapshotRef(hmsTable.getSnapshotRef()); + Snapshot snapshot; + if (refName != null) { + snapshot = table.snapshot(refName); + } else if (hmsTable.getAsOfTimestamp() != null) { + ZoneId timeZone = SessionState.get() == null ? new HiveConf().getLocalTimeZone() : + SessionState.get().getConf().getLocalTimeZone(); + TimestampTZ time = TimestampTZUtil.parse(hmsTable.getAsOfTimestamp(), timeZone); + long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table, time.toEpochMilli()); + snapshot = table.snapshot(snapshotId); + } else if (hmsTable.getAsOfVersion() != null) { + try { + snapshot = table.snapshot(Long.parseLong(hmsTable.getAsOfVersion())); + } catch (NumberFormatException e) { + SnapshotRef ref = table.refs().get(hmsTable.getAsOfVersion()); + if (ref == null) { + throw new RuntimeException("Cannot find matching snapshot ID or reference name for version " + + hmsTable.getAsOfVersion()); + } + snapshot = table.snapshot(ref.snapshotId()); + } + } else { + snapshot = table.currentSnapshot(); + } + return snapshot; + } } diff --git a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out index bc662c426a44..cbf1e0562d19 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out @@ -234,17 +234,17 @@ STAGE PLANS: alias: ice01 filterExpr: (a = 22) (type: boolean) Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (a = 22) (type: boolean) - Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), 22 (type: int), b (type: string), c (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -316,7 +316,7 @@ STAGE PLANS: alias: ice01 filterExpr: (c = 66) (type: boolean) Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (c = 66) (type: boolean) Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE @@ -451,20 +451,20 @@ STAGE PLANS: alias: ice01 filterExpr: a is not null (type: boolean) Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: 
COMPLETE + Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: a is not null (type: boolean) - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), a (type: int), b (type: string), c (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col5 (type: int) null sort order: z sort order: + Map-reduce partition columns: _col5 (type: int) - Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col6 (type: string), _col7 (type: int) Execution mode: vectorized Reducer 2 @@ -476,11 +476,11 @@ STAGE PLANS: 0 _col0 (type: int) 1 _col5 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col1 (type: string), _col0 (type: int), _col5 (type: string), _col7 (type: string), _col2 (type: int), _col6 (type: bigint), _col4 (type: bigint), _col3 (type: int), _col10 (type: int), _col9 (type: string), _col8 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((_col10 = _col1) and (_col10 > 100)) (type: boolean) Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE @@ -498,14 +498,14 @@ STAGE PLANS: name: default.ice01 Filter Operator predicate: ((_col10 = _col1) and (_col10 <= 100)) (type: boolean) - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint), _col3 (type: string), _col10 (type: int), _col9 (type: string), _col8 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -513,14 +513,14 @@ STAGE PLANS: name: default.ice01 Filter Operator predicate: ((_col10 = 
_col1) and (_col10 <= 100)) (type: boolean) - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col10 (type: int), 'Merged' (type: string), (_col8 + 10) (type: int) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -543,24 +543,24 @@ STAGE PLANS: name: default.ice01 Filter Operator predicate: (_col10 = _col1) (type: boolean) - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col2 (type: string), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: int) outputColumnNames: _col2, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() keys: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint) minReductionHashAggr: 0.4 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint) null sort order: zzzz sort order: ++++ Map-reduce partition columns: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint) - Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col4 (type: bigint) Reducer 3 Execution mode: vectorized @@ -570,7 +570,7 @@ STAGE PLANS: keys: KEY._col0 (type: int), KEY._col1 (type: bigint), KEY._col2 (type: string), KEY._col3 (type: bigint) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (_col4 > 1L) (type: boolean) Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE @@ -795,14 +795,14 @@ STAGE PLANS: TableScan alias: ice01 Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: a (type: int), b (type: string), c (type: int) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 
285 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java index 5343a1bb3bb4..c66285a0b424 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java @@ -518,9 +518,11 @@ public static Path getDumpPath(Path root, String dbName, String tableName) { } public static String getTableSnapshotRef(String refName) { - Matcher ref = SNAPSHOT_REF.matcher(refName); - if (ref.matches()) { - return ref.group(1); + if (refName != null && !refName.isEmpty()) { + Matcher ref = SNAPSHOT_REF.matcher(refName); + if (ref.matches()) { + return ref.group(1); + } } return null; } From 69c806afc780aadeffc5a9b83c5fc40f2132e5fb Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Fri, 11 Oct 2024 01:58:32 -0400 Subject: [PATCH 4/6] Fix review comments --- .../mr/hive/HiveIcebergStorageHandler.java | 35 ++----------------- .../iceberg/mr/hive/IcebergTableUtil.java | 35 +++++++++++++++++++ .../hadoop/hive/ql/metadata/HiveUtils.java | 10 +++--- 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index ed487d96d301..abb4369e7b4f 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -25,7 +25,6 @@ import java.net.URI; import java.net.URISyntaxException; import java.nio.ByteBuffer; -import java.time.ZoneId; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -56,8 +55,6 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.SnapshotContext; import org.apache.hadoop.hive.common.type.Timestamp; -import org.apache.hadoop.hive.common.type.TimestampTZ; -import org.apache.hadoop.hive.common.type.TimestampTZUtil; import org.apache.hadoop.hive.conf.Constants; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -161,7 +158,6 @@ import org.apache.iceberg.SchemaParser; import org.apache.iceberg.SerializableTable; import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortField; @@ -469,9 +465,9 @@ public Map getBasicStatistics(Partish partish) { org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. 
Table table = getTable(hmsTable); - Snapshot snapshot = getSpecificSnapshot(partish.getTable(), table); Map stats = Maps.newHashMap(); if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { + Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null) { @@ -618,7 +614,7 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); - Snapshot snapshot = getSpecificSnapshot(hmsTable, table); + Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && @@ -2203,31 +2199,4 @@ private static List schema(List exprs) { private static List orderBy(VirtualColumn... exprs) { return schema(Arrays.asList(exprs)); } - private Snapshot getSpecificSnapshot(org.apache.hadoop.hive.ql.metadata.Table hmsTable, Table table) { - String refName = HiveUtils.getTableSnapshotRef(hmsTable.getSnapshotRef()); - Snapshot snapshot; - if (refName != null) { - snapshot = table.snapshot(refName); - } else if (hmsTable.getAsOfTimestamp() != null) { - ZoneId timeZone = SessionState.get() == null ? new HiveConf().getLocalTimeZone() : - SessionState.get().getConf().getLocalTimeZone(); - TimestampTZ time = TimestampTZUtil.parse(hmsTable.getAsOfTimestamp(), timeZone); - long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table, time.toEpochMilli()); - snapshot = table.snapshot(snapshotId); - } else if (hmsTable.getAsOfVersion() != null) { - try { - snapshot = table.snapshot(Long.parseLong(hmsTable.getAsOfVersion())); - } catch (NumberFormatException e) { - SnapshotRef ref = table.refs().get(hmsTable.getAsOfVersion()); - if (ref == null) { - throw new RuntimeException("Cannot find matching snapshot ID or reference name for version " + - hmsTable.getAsOfVersion()); - } - snapshot = table.snapshot(ref.snapshotId()); - } - } else { - snapshot = table.currentSnapshot(); - } - return snapshot; - } } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java index 88dd006b7721..537d57dc1c72 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java @@ -20,6 +20,7 @@ package org.apache.iceberg.mr.hive; import java.io.IOException; +import java.time.ZoneId; import java.util.Collections; import java.util.List; import java.util.Map; @@ -31,6 +32,9 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.TimestampTZ; +import org.apache.hadoop.hive.common.type.TimestampTZUtil; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; @@ -42,6 +46,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.TransformSpec; import org.apache.hadoop.hive.ql.plan.PlanUtils; +import 
org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionStateUtil; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; @@ -59,6 +64,7 @@ import org.apache.iceberg.RowLevelOperationMode; import org.apache.iceberg.ScanTask; import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; @@ -79,6 +85,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.SnapshotUtil; import org.apache.iceberg.util.StructProjection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -559,4 +566,32 @@ public static List getPartitionNames(Table icebergTable, Map Date: Fri, 11 Oct 2024 02:58:55 -0400 Subject: [PATCH 5/6] Do not use HMS stats when statsSource is Iceberg #5400 --- .../mr/hive/HiveIcebergStorageHandler.java | 60 +++++++++---------- .../hive/ql/metadata/HiveStorageHandler.java | 5 ++ .../hive/ql/optimizer/StatsOptimizer.java | 6 +- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index abb4369e7b4f..d753f9ead1f0 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -119,7 +119,6 @@ import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionStateUtil; -import org.apache.hadoop.hive.ql.stats.Partish; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.DefaultFetchFormatter; import org.apache.hadoop.hive.serde2.Deserializer; @@ -461,45 +460,46 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr } @Override - public Map getBasicStatistics(Partish partish) { - org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); + public Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { // For write queries where rows got modified, don't fetch from cache as values could have changed. 
- Table table = getTable(hmsTable); Map stats = Maps.newHashMap(); - if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { - Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); - if (snapshot != null) { - Map summary = snapshot.summary(); - if (summary != null) { + if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) { + return hmsTable.getParameters(); + } + Table table = getTable(hmsTable); - if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { - stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - } + Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); + if (snapshot != null) { + Map summary = snapshot.summary(); + if (summary != null) { - if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) { - long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)); - if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && - summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { + if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { + stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + } - long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)); - long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP)); + if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) { + long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)); + if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && + summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { - long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes); - totalRecords = actualRecords > 0 ? actualRecords : totalRecords; - // actualRecords maybe -ve in edge cases - } - stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords)); - } + long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)); + long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP)); - if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) { - stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); + long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes); + totalRecords = actualRecords > 0 ? 
actualRecords : totalRecords; + // actualRecords maybe -ve in edge cases } + stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords)); + } + + if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) { + stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); } - } else { - stats.put(StatsSetupConst.NUM_FILES, "0"); - stats.put(StatsSetupConst.ROW_COUNT, "0"); - stats.put(StatsSetupConst.TOTAL_SIZE, "0"); } + } else { + stats.put(StatsSetupConst.NUM_FILES, "0"); + stats.put(StatsSetupConst.ROW_COUNT, "0"); + stats.put(StatsSetupConst.TOTAL_SIZE, "0"); } return stats; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java index 2b05837a8848..857820ed1ada 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java @@ -248,7 +248,12 @@ default Map getOperatorDescProperties(OperatorDesc operatorDesc, * @param partish a partish wrapper class * @return map of basic statistics, can be null */ + @Deprecated default Map getBasicStatistics(Partish partish) { + return getBasicStatistics(partish.getTable()); + } + + default Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table table) { return null; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index ecfe9f8af1ff..93108b841ddc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -946,12 +946,14 @@ private Long getRowCnt( if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { if (MetaStoreUtils.isNonNativeTable(tbl.getTTable()) && tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { - return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl)) + return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(tbl) .get(StatsSetupConst.ROW_COUNT)); } return null; } - rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT)); + Map basicStats = MetaStoreUtils.isNonNativeTable(tbl.getTTable()) ? 
+ tbl.getStorageHandler().getBasicStatistics(tbl) : tbl.getParameters(); + rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT)); } return rowCnt; } From deb46ff07f92cf12eddbcb8b91e11f7b8afac0e5 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Fri, 25 Oct 2024 02:51:58 -0400 Subject: [PATCH 6/6] Resolve comments --- .../iceberg/mr/hive/HiveIcebergStorageHandler.java | 8 +++++--- .../apache/iceberg/mr/hive/IcebergTableUtil.java | 2 +- .../hadoop/hive/ql/metadata/HiveStorageHandler.java | 5 ----- .../apache/hadoop/hive/ql/metadata/HiveUtils.java | 7 ++----- .../hadoop/hive/ql/optimizer/StatsOptimizer.java | 13 ++++++------- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index d753f9ead1f0..9bbc6d3566dd 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -119,6 +119,7 @@ import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionStateUtil; +import org.apache.hadoop.hive.ql.stats.Partish; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.DefaultFetchFormatter; import org.apache.hadoop.hive.serde2.Deserializer; @@ -460,7 +461,8 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr } @Override - public Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { + public Map getBasicStatistics(Partish partish) { + org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. 
Map stats = Maps.newHashMap(); if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) { @@ -468,7 +470,7 @@ public Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata } Table table = getTable(hmsTable); - Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); + Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null) { @@ -614,7 +616,7 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); - Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); + Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java index 537d57dc1c72..c096246bc53a 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java @@ -567,7 +567,7 @@ public static List getPartitionNames(Table icebergTable, Map getOperatorDescProperties(OperatorDesc operatorDesc, * @param partish a partish wrapper class * @return map of basic statistics, can be null */ - @Deprecated default Map getBasicStatistics(Partish partish) { - return getBasicStatistics(partish.getTable()); - } - - default Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table table) { return null; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java index 9784e944ca0b..e4076d113678 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java @@ -518,11 +518,8 @@ public static Path getDumpPath(Path root, String dbName, String tableName) { } public static String getTableSnapshotRef(String refName) { - if (org.apache.commons.lang3.StringUtils.isEmpty(refName)) { - return null; - } - Matcher ref = SNAPSHOT_REF.matcher(refName); - return ref.matches()? ref.group(1) : null; + Matcher ref = SNAPSHOT_REF.matcher(String.valueOf(refName)); + return ref.matches() ? 
ref.group(1) : null; } public static Boolean isTableTag(String refName) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 93108b841ddc..1de37c421051 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -943,16 +943,15 @@ private Long getRowCnt( rowCnt += partRowCnt; } } else { // unpartitioned table - if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { - if (MetaStoreUtils.isNonNativeTable(tbl.getTTable()) - && tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { - return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(tbl) - .get(StatsSetupConst.ROW_COUNT)); + Map basicStats = tbl.getParameters(); + if (MetaStoreUtils.isNonNativeTable(tbl.getTTable())) { + if (!tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { + return null; } + basicStats = tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl)); + } else if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { return null; } - Map basicStats = MetaStoreUtils.isNonNativeTable(tbl.getTTable()) ? - tbl.getStorageHandler().getBasicStatistics(tbl) : tbl.getParameters(); rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT)); } return rowCnt;
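
For reference (not part of the patches themselves): the row count the optimizer
now serves from the Iceberg SnapshotSummary reduces to the arithmetic sketched
below. The helper name is hypothetical; the summary keys are the
SnapshotSummary constants the patches already use, and the adjustment mirrors
HiveIcebergStorageHandler.getBasicStatistics() as it stands after PATCH 6/6:

    import java.util.Map;
    import org.apache.iceberg.SnapshotSummary;

    // Sketch only. Assumes all three keys are present in the summary
    // (the handler guards each access with containsKey()).
    static long rowCountFromSummary(Map<String, String> summary) {
      long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
      long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
      long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));

      // A position delete removes exactly one row, so it can be subtracted
      // directly; an equality delete may match any number of rows, so no
      // adjustment is attempted once equality deletes exist.
      long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes);

      // actualRecords may be negative in edge cases; fall back to the raw total.
      return actualRecords > 0 ? actualRecords : totalRecords;
    }

Snapshot selection follows the precedence implemented in
IcebergTableUtil.getTableSnapshot() (the PATCH 6/6 name for what PATCH 3/6
introduced as getSpecificSnapshot): an explicit branch/tag ref wins, then a
FOR SYSTEM_TIME/SYSTEM_VERSION time-travel clause, then the current snapshot.
A condensed sketch, with the timestamp branch omitted and resolveSnapshot a
made-up name; the Table calls are the same Iceberg APIs the patch uses:

    import org.apache.iceberg.Snapshot;
    import org.apache.iceberg.SnapshotRef;
    import org.apache.iceberg.Table;

    static Snapshot resolveSnapshot(Table table, String refName, String asOfVersion) {
      if (refName != null) {
        return table.snapshot(refName);                       // branch or tag
      }
      if (asOfVersion != null) {
        try {
          return table.snapshot(Long.parseLong(asOfVersion)); // snapshot id
        } catch (NumberFormatException e) {
          SnapshotRef ref = table.refs().get(asOfVersion);    // named reference
          if (ref == null) {
            throw new IllegalArgumentException(
                "Cannot find matching snapshot ID or reference name for version " + asOfVersion);
          }
          return table.snapshot(ref.snapshotId());
        }
      }
      return table.currentSnapshot();                         // no time travel
    }

This is also why the write_iceberg_branch.q.out statistics change in PATCH 3/6:
a read against a branch now reports the branch snapshot's row counts instead of
the current main-branch stats.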