From 7f2a746f6e8924ba40e38f3e5f3a879611c8bf00 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Thu, 25 Apr 2024 22:44:12 +0800 Subject: [PATCH 1/6] Iceberg: Retrieve row count from iceberg SnapshotSummary in case of iceberg.hive.keep.stats=false --- .../src/test/queries/positive/iceberg_stats.q | 22 +++ .../test/results/positive/iceberg_stats.q.out | 159 ++++++++++++++++++ .../hive/ql/optimizer/StatsOptimizer.java | 6 + 3 files changed, 187 insertions(+) diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q index de88018f32e0..6fc965e17456 100644 --- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q @@ -28,5 +28,27 @@ select count(*) from ice01; insert overwrite table ice01 select * from ice01; explain select count(*) from ice01; +-- false means that count(*) query won't use row count stored in HMS +set iceberg.hive.keep.stats=false; + +create external table ice03 (id int, key int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2'); + +insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1); +-- Iceberg table can utilize fetch task to directly retrieve the row count from iceberg SnapshotSummary +explain select count(*) from ice03; +select count(*) from ice03; + +-- delete some values +delete from ice03 where id in (2,4); + +explain select count(*) from ice03; +select count(*) from ice03; + +-- iow +insert overwrite table ice03 select * from ice03; +explain select count(*) from ice03; + drop table ice01; drop table ice02; +drop table ice03; diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out index 33c60b54608d..4e5b70945016 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out @@ -192,6 +192,155 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@ice03 +POSTHOOK: query: create external table ice03 (id int, key int) Stored by Iceberg stored as ORC + TBLPROPERTIES('format-version'='2') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ice03 +PREHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@ice03 +POSTHOOK: query: insert into ice03 values (1,1),(2,1),(3,1),(4,1),(5,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@ice03 +PREHOOK: query: explain select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 +PREHOOK: query: delete from ice03 where id in (2,4) +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: default@ice03 +POSTHOOK: query: delete from ice03 where id in (2,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: default@ice03 +PREHOOK: query: explain select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: ice03 + Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.6666666 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +3 +PREHOOK: query: insert overwrite table ice03 select * from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: default@ice03 +POSTHOOK: query: insert overwrite table ice03 select * from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: default@ice03 +PREHOOK: query: explain select count(*) from ice03 +PREHOOK: type: QUERY +PREHOOK: Input: default@ice03 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain select count(*) from ice03 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + PREHOOK: query: drop table ice01 PREHOOK: type: DROPTABLE PREHOOK: Input: default@ice01 @@ -212,3 +361,13 @@ POSTHOOK: type: DROPTABLE POSTHOOK: Input: default@ice02 POSTHOOK: Output: database:default POSTHOOK: Output: default@ice02 +PREHOOK: query: drop 
table ice03 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@ice03 +PREHOOK: Output: database:default +PREHOOK: Output: default@ice03 +POSTHOOK: query: drop table ice03 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@ice03 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@ice03 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 38d67660c639..ecfe9f8af1ff 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -61,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.stats.Partish; import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax; @@ -943,6 +944,11 @@ private Long getRowCnt( } } else { // unpartitioned table if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { + if (MetaStoreUtils.isNonNativeTable(tbl.getTTable()) + && tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { + return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl)) + .get(StatsSetupConst.ROW_COUNT)); + } return null; } rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT)); From 1cc6c63f4f369348608a2f4c15ee77e04fbb4c77 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Fri, 26 Apr 2024 09:47:34 +0800 Subject: [PATCH 2/6] code minor optimization --- .../java/org/apache/hadoop/hive/ql/stats/StatsUtils.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 239f57b69b3e..81fde429cb31 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -2036,10 +2036,7 @@ public static Range combineRange(Range range1, Range range2) { public static boolean checkCanProvideStats(Table table) { if (MetaStoreUtils.isExternalTable(table.getTTable())) { - if (MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics()) { - return true; - } - return false; + return MetaStoreUtils.isNonNativeTable(table.getTTable()) && table.getStorageHandler().canProvideBasicStatistics(); } return true; } @@ -2049,7 +2046,7 @@ public static boolean checkCanProvideStats(Table table) { * Can run additional checks compared to the version in StatsSetupConst. */ public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map params) { - return checkCanProvideStats(table) == true ? StatsSetupConst.areBasicStatsUptoDate(params) : false; + return checkCanProvideStats(table) && StatsSetupConst.areBasicStatsUptoDate(params); } /** @@ -2057,7 +2054,7 @@ public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map params, String colName) { - return checkCanProvideStats(table) == true ? 
StatsSetupConst.areColumnStatsUptoDate(params, colName) : false; + return checkCanProvideStats(table) && StatsSetupConst.areColumnStatsUptoDate(params, colName); } /** From 6349d23e5f40aa396de9a71ca83a0b28ae3182f5 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Thu, 10 Oct 2024 22:39:24 -0400 Subject: [PATCH 3/6] Get stats based on specific snapshot --- .../mr/hive/HiveIcebergStorageHandler.java | 41 +++++++++++++-- .../positive/write_iceberg_branch.q.out | 50 +++++++++---------- .../hadoop/hive/ql/metadata/HiveUtils.java | 8 +-- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index 50280a8a0abe..ed487d96d301 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -25,6 +25,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.nio.ByteBuffer; +import java.time.ZoneId; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -55,6 +56,8 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.SnapshotContext; import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.common.type.TimestampTZ; +import org.apache.hadoop.hive.common.type.TimestampTZUtil; import org.apache.hadoop.hive.conf.Constants; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -158,6 +161,7 @@ import org.apache.iceberg.SchemaParser; import org.apache.iceberg.SerializableTable; import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortField; @@ -465,10 +469,11 @@ public Map getBasicStatistics(Partish partish) { org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. Table table = getTable(hmsTable); + Snapshot snapshot = getSpecificSnapshot(partish.getTable(), table); Map stats = Maps.newHashMap(); if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { - if (table.currentSnapshot() != null) { - Map summary = table.currentSnapshot().summary(); + if (snapshot != null) { + Map summary = snapshot.summary(); if (summary != null) { if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { @@ -613,8 +618,9 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); - if (table.currentSnapshot() != null) { - Map summary = table.currentSnapshot().summary(); + Snapshot snapshot = getSpecificSnapshot(hmsTable, table); + if (snapshot != null) { + Map summary = snapshot.summary(); if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { @@ -2197,4 +2203,31 @@ private static List schema(List exprs) { private static List orderBy(VirtualColumn... 
exprs) { return schema(Arrays.asList(exprs)); } + private Snapshot getSpecificSnapshot(org.apache.hadoop.hive.ql.metadata.Table hmsTable, Table table) { + String refName = HiveUtils.getTableSnapshotRef(hmsTable.getSnapshotRef()); + Snapshot snapshot; + if (refName != null) { + snapshot = table.snapshot(refName); + } else if (hmsTable.getAsOfTimestamp() != null) { + ZoneId timeZone = SessionState.get() == null ? new HiveConf().getLocalTimeZone() : + SessionState.get().getConf().getLocalTimeZone(); + TimestampTZ time = TimestampTZUtil.parse(hmsTable.getAsOfTimestamp(), timeZone); + long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table, time.toEpochMilli()); + snapshot = table.snapshot(snapshotId); + } else if (hmsTable.getAsOfVersion() != null) { + try { + snapshot = table.snapshot(Long.parseLong(hmsTable.getAsOfVersion())); + } catch (NumberFormatException e) { + SnapshotRef ref = table.refs().get(hmsTable.getAsOfVersion()); + if (ref == null) { + throw new RuntimeException("Cannot find matching snapshot ID or reference name for version " + + hmsTable.getAsOfVersion()); + } + snapshot = table.snapshot(ref.snapshotId()); + } + } else { + snapshot = table.currentSnapshot(); + } + return snapshot; + } } diff --git a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out index bc662c426a44..cbf1e0562d19 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/write_iceberg_branch.q.out @@ -234,17 +234,17 @@ STAGE PLANS: alias: ice01 filterExpr: (a = 22) (type: boolean) Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (a = 22) (type: boolean) - Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), 22 (type: int), b (type: string), c (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -316,7 +316,7 @@ STAGE PLANS: alias: ice01 filterExpr: (c = 66) (type: boolean) Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (c = 66) (type: boolean) Statistics: Num rows: 2 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE @@ -451,20 +451,20 @@ STAGE PLANS: alias: ice01 filterExpr: a is not null (type: boolean) Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: 
COMPLETE + Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: a is not null (type: boolean) - Statistics: Num rows: 3 Data size: 291 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string), a (type: int), b (type: string), c (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col5 (type: int) null sort order: z sort order: + Map-reduce partition columns: _col5 (type: int) - Statistics: Num rows: 3 Data size: 1455 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 1940 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col6 (type: string), _col7 (type: int) Execution mode: vectorized Reducer 2 @@ -476,11 +476,11 @@ STAGE PLANS: 0 _col0 (type: int) 1 _col5 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col1 (type: string), _col0 (type: int), _col5 (type: string), _col7 (type: string), _col2 (type: int), _col6 (type: bigint), _col4 (type: bigint), _col3 (type: int), _col10 (type: int), _col9 (type: string), _col8 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 3 Data size: 1743 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 2324 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((_col10 = _col1) and (_col10 > 100)) (type: boolean) Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE @@ -498,14 +498,14 @@ STAGE PLANS: name: default.ice01 Filter Operator predicate: ((_col10 = _col1) and (_col10 <= 100)) (type: boolean) - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint), _col3 (type: string), _col10 (type: int), _col9 (type: string), _col8 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 485 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 970 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -513,14 +513,14 @@ STAGE PLANS: name: default.ice01 Filter Operator predicate: ((_col10 = 
_col1) and (_col10 <= 100)) (type: boolean) - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col10 (type: int), 'Merged' (type: string), (_col8 + 10) (type: int) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 98 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 196 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -543,24 +543,24 @@ STAGE PLANS: name: default.ice01 Filter Operator predicate: (_col10 = _col1) (type: boolean) - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col2 (type: string), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: int) outputColumnNames: _col2, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 581 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1162 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: count() keys: _col7 (type: int), _col6 (type: bigint), _col2 (type: string), _col5 (type: bigint) minReductionHashAggr: 0.4 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint) null sort order: zzzz sort order: ++++ Map-reduce partition columns: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint) - Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col4 (type: bigint) Reducer 3 Execution mode: vectorized @@ -570,7 +570,7 @@ STAGE PLANS: keys: KEY._col0 (type: int), KEY._col1 (type: bigint), KEY._col2 (type: string), KEY._col3 (type: bigint) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 424 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (_col4 > 1L) (type: boolean) Statistics: Num rows: 1 Data size: 212 Basic stats: COMPLETE Column stats: COMPLETE @@ -795,14 +795,14 @@ STAGE PLANS: TableScan alias: ice01 Snapshot ref: branch_test1 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: a (type: int), b (type: string), c (type: int) outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 
285 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java index 5343a1bb3bb4..c66285a0b424 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java @@ -518,9 +518,11 @@ public static Path getDumpPath(Path root, String dbName, String tableName) { } public static String getTableSnapshotRef(String refName) { - Matcher ref = SNAPSHOT_REF.matcher(refName); - if (ref.matches()) { - return ref.group(1); + if (refName != null && !refName.isEmpty()) { + Matcher ref = SNAPSHOT_REF.matcher(refName); + if (ref.matches()) { + return ref.group(1); + } } return null; } From 69c806afc780aadeffc5a9b83c5fc40f2132e5fb Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Fri, 11 Oct 2024 01:58:32 -0400 Subject: [PATCH 4/6] Fix review comments --- .../mr/hive/HiveIcebergStorageHandler.java | 35 ++----------------- .../iceberg/mr/hive/IcebergTableUtil.java | 35 +++++++++++++++++++ .../hadoop/hive/ql/metadata/HiveUtils.java | 10 +++--- 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index ed487d96d301..abb4369e7b4f 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -25,7 +25,6 @@ import java.net.URI; import java.net.URISyntaxException; import java.nio.ByteBuffer; -import java.time.ZoneId; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -56,8 +55,6 @@ import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.common.type.SnapshotContext; import org.apache.hadoop.hive.common.type.Timestamp; -import org.apache.hadoop.hive.common.type.TimestampTZ; -import org.apache.hadoop.hive.common.type.TimestampTZUtil; import org.apache.hadoop.hive.conf.Constants; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; @@ -161,7 +158,6 @@ import org.apache.iceberg.SchemaParser; import org.apache.iceberg.SerializableTable; import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortField; @@ -469,9 +465,9 @@ public Map getBasicStatistics(Partish partish) { org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. 
Table table = getTable(hmsTable); - Snapshot snapshot = getSpecificSnapshot(partish.getTable(), table); Map stats = Maps.newHashMap(); if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { + Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null) { @@ -618,7 +614,7 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); - Snapshot snapshot = getSpecificSnapshot(hmsTable, table); + Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && @@ -2203,31 +2199,4 @@ private static List schema(List exprs) { private static List orderBy(VirtualColumn... exprs) { return schema(Arrays.asList(exprs)); } - private Snapshot getSpecificSnapshot(org.apache.hadoop.hive.ql.metadata.Table hmsTable, Table table) { - String refName = HiveUtils.getTableSnapshotRef(hmsTable.getSnapshotRef()); - Snapshot snapshot; - if (refName != null) { - snapshot = table.snapshot(refName); - } else if (hmsTable.getAsOfTimestamp() != null) { - ZoneId timeZone = SessionState.get() == null ? new HiveConf().getLocalTimeZone() : - SessionState.get().getConf().getLocalTimeZone(); - TimestampTZ time = TimestampTZUtil.parse(hmsTable.getAsOfTimestamp(), timeZone); - long snapshotId = SnapshotUtil.snapshotIdAsOfTime(table, time.toEpochMilli()); - snapshot = table.snapshot(snapshotId); - } else if (hmsTable.getAsOfVersion() != null) { - try { - snapshot = table.snapshot(Long.parseLong(hmsTable.getAsOfVersion())); - } catch (NumberFormatException e) { - SnapshotRef ref = table.refs().get(hmsTable.getAsOfVersion()); - if (ref == null) { - throw new RuntimeException("Cannot find matching snapshot ID or reference name for version " + - hmsTable.getAsOfVersion()); - } - snapshot = table.snapshot(ref.snapshotId()); - } - } else { - snapshot = table.currentSnapshot(); - } - return snapshot; - } } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java index 88dd006b7721..537d57dc1c72 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java @@ -20,6 +20,7 @@ package org.apache.iceberg.mr.hive; import java.io.IOException; +import java.time.ZoneId; import java.util.Collections; import java.util.List; import java.util.Map; @@ -31,6 +32,9 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.TimestampTZ; +import org.apache.hadoop.hive.common.type.TimestampTZUtil; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; @@ -42,6 +46,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.TransformSpec; import org.apache.hadoop.hive.ql.plan.PlanUtils; +import 
org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionStateUtil; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; @@ -59,6 +64,7 @@ import org.apache.iceberg.RowLevelOperationMode; import org.apache.iceberg.ScanTask; import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; @@ -79,6 +85,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.SnapshotUtil; import org.apache.iceberg.util.StructProjection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -559,4 +566,32 @@ public static List getPartitionNames(Table icebergTable, Map Date: Fri, 11 Oct 2024 02:58:55 -0400 Subject: [PATCH 5/6] Do not use HMS stats when statsSource is Iceberg #5400 --- .../mr/hive/HiveIcebergStorageHandler.java | 60 +++++++++---------- .../hive/ql/metadata/HiveStorageHandler.java | 5 ++ .../hive/ql/optimizer/StatsOptimizer.java | 6 +- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index abb4369e7b4f..d753f9ead1f0 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -119,7 +119,6 @@ import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionStateUtil; -import org.apache.hadoop.hive.ql.stats.Partish; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.DefaultFetchFormatter; import org.apache.hadoop.hive.serde2.Deserializer; @@ -461,45 +460,46 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr } @Override - public Map getBasicStatistics(Partish partish) { - org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); + public Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { // For write queries where rows got modified, don't fetch from cache as values could have changed. 
- Table table = getTable(hmsTable); Map stats = Maps.newHashMap(); - if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { - Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); - if (snapshot != null) { - Map summary = snapshot.summary(); - if (summary != null) { + if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) { + return hmsTable.getParameters(); + } + Table table = getTable(hmsTable); - if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { - stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); - } + Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); + if (snapshot != null) { + Map summary = snapshot.summary(); + if (summary != null) { - if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) { - long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)); - if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && - summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { + if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { + stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); + } - long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)); - long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP)); + if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) { + long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)); + if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && + summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { - long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes); - totalRecords = actualRecords > 0 ? actualRecords : totalRecords; - // actualRecords maybe -ve in edge cases - } - stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords)); - } + long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)); + long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP)); - if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) { - stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); + long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes); + totalRecords = actualRecords > 0 ? 
actualRecords : totalRecords; + // actualRecords maybe -ve in edge cases } + stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords)); + } + + if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) { + stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); } - } else { - stats.put(StatsSetupConst.NUM_FILES, "0"); - stats.put(StatsSetupConst.ROW_COUNT, "0"); - stats.put(StatsSetupConst.TOTAL_SIZE, "0"); } + } else { + stats.put(StatsSetupConst.NUM_FILES, "0"); + stats.put(StatsSetupConst.ROW_COUNT, "0"); + stats.put(StatsSetupConst.TOTAL_SIZE, "0"); } return stats; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java index 2b05837a8848..857820ed1ada 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java @@ -248,7 +248,12 @@ default Map getOperatorDescProperties(OperatorDesc operatorDesc, * @param partish a partish wrapper class * @return map of basic statistics, can be null */ + @Deprecated default Map getBasicStatistics(Partish partish) { + return getBasicStatistics(partish.getTable()); + } + + default Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table table) { return null; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index ecfe9f8af1ff..93108b841ddc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -946,12 +946,14 @@ private Long getRowCnt( if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { if (MetaStoreUtils.isNonNativeTable(tbl.getTTable()) && tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { - return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl)) + return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(tbl) .get(StatsSetupConst.ROW_COUNT)); } return null; } - rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT)); + Map basicStats = MetaStoreUtils.isNonNativeTable(tbl.getTTable()) ? 
+ tbl.getStorageHandler().getBasicStatistics(tbl) : tbl.getParameters(); + rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT)); } return rowCnt; } From deb46ff07f92cf12eddbcb8b91e11f7b8afac0e5 Mon Sep 17 00:00:00 2001 From: zhangbutao Date: Fri, 25 Oct 2024 02:51:58 -0400 Subject: [PATCH 6/6] Resolve comments --- .../iceberg/mr/hive/HiveIcebergStorageHandler.java | 8 +++++--- .../apache/iceberg/mr/hive/IcebergTableUtil.java | 2 +- .../hadoop/hive/ql/metadata/HiveStorageHandler.java | 5 ----- .../apache/hadoop/hive/ql/metadata/HiveUtils.java | 7 ++----- .../hadoop/hive/ql/optimizer/StatsOptimizer.java | 13 ++++++------- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java index d753f9ead1f0..9bbc6d3566dd 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java @@ -119,6 +119,7 @@ import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionStateUtil; +import org.apache.hadoop.hive.ql.stats.Partish; import org.apache.hadoop.hive.serde2.AbstractSerDe; import org.apache.hadoop.hive.serde2.DefaultFetchFormatter; import org.apache.hadoop.hive.serde2.Deserializer; @@ -460,7 +461,8 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr } @Override - public Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { + public Map getBasicStatistics(Partish partish) { + org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); // For write queries where rows got modified, don't fetch from cache as values could have changed. 
Map stats = Maps.newHashMap(); if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) { @@ -468,7 +470,7 @@ public Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata } Table table = getTable(hmsTable); - Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); + Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null) { @@ -614,7 +616,7 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { Table table = getTable(hmsTable); - Snapshot snapshot = IcebergTableUtil.getSpecificSnapshot(hmsTable, table); + Snapshot snapshot = IcebergTableUtil.getTableSnapshot(hmsTable, table); if (snapshot != null) { Map summary = snapshot.summary(); if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java index 537d57dc1c72..c096246bc53a 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java @@ -567,7 +567,7 @@ public static List getPartitionNames(Table icebergTable, Map getOperatorDescProperties(OperatorDesc operatorDesc, * @param partish a partish wrapper class * @return map of basic statistics, can be null */ - @Deprecated default Map getBasicStatistics(Partish partish) { - return getBasicStatistics(partish.getTable()); - } - - default Map getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table table) { return null; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java index 9784e944ca0b..e4076d113678 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveUtils.java @@ -518,11 +518,8 @@ public static Path getDumpPath(Path root, String dbName, String tableName) { } public static String getTableSnapshotRef(String refName) { - if (org.apache.commons.lang3.StringUtils.isEmpty(refName)) { - return null; - } - Matcher ref = SNAPSHOT_REF.matcher(refName); - return ref.matches()? ref.group(1) : null; + Matcher ref = SNAPSHOT_REF.matcher(String.valueOf(refName)); + return ref.matches() ? 
ref.group(1) : null; } public static Boolean isTableTag(String refName) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 93108b841ddc..1de37c421051 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -943,16 +943,15 @@ private Long getRowCnt( rowCnt += partRowCnt; } } else { // unpartitioned table - if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { - if (MetaStoreUtils.isNonNativeTable(tbl.getTTable()) - && tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { - return Long.valueOf(tbl.getStorageHandler().getBasicStatistics(tbl) - .get(StatsSetupConst.ROW_COUNT)); + Map basicStats = tbl.getParameters(); + if (MetaStoreUtils.isNonNativeTable(tbl.getTTable())) { + if (!tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) { + return null; } + basicStats = tbl.getStorageHandler().getBasicStatistics(Partish.buildFor(tbl)); + } else if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { return null; } - Map basicStats = MetaStoreUtils.isNonNativeTable(tbl.getTTable()) ? - tbl.getStorageHandler().getBasicStatistics(tbl) : tbl.getParameters(); rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT)); } return rowCnt;
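
For reference (not part of the patches themselves): the row count the optimizer
now serves from the Iceberg SnapshotSummary reduces to the arithmetic sketched
below. The helper name is hypothetical; the summary keys are the
SnapshotSummary constants the patches already use, and the adjustment mirrors
HiveIcebergStorageHandler.getBasicStatistics() as it stands after PATCH 6/6:

    import java.util.Map;
    import org.apache.iceberg.SnapshotSummary;

    // Sketch only. Assumes all three keys are present in the summary
    // (the handler guards each access with containsKey()).
    static long rowCountFromSummary(Map<String, String> summary) {
      long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
      long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
      long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));

      // A position delete removes exactly one row, so it can be subtracted
      // directly; an equality delete may match any number of rows, so no
      // adjustment is attempted once equality deletes exist.
      long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes);

      // actualRecords may be negative in edge cases; fall back to the raw total.
      return actualRecords > 0 ? actualRecords : totalRecords;
    }

Snapshot selection follows the precedence implemented in
IcebergTableUtil.getTableSnapshot() (the PATCH 6/6 name for what PATCH 3/6
introduced as getSpecificSnapshot): an explicit branch/tag ref wins, then a
FOR SYSTEM_TIME/SYSTEM_VERSION time-travel clause, then the current snapshot.
A condensed sketch, with the timestamp branch omitted and resolveSnapshot a
made-up name; the Table calls are the same Iceberg APIs the patch uses:

    import org.apache.iceberg.Snapshot;
    import org.apache.iceberg.SnapshotRef;
    import org.apache.iceberg.Table;

    static Snapshot resolveSnapshot(Table table, String refName, String asOfVersion) {
      if (refName != null) {
        return table.snapshot(refName);                       // branch or tag
      }
      if (asOfVersion != null) {
        try {
          return table.snapshot(Long.parseLong(asOfVersion)); // snapshot id
        } catch (NumberFormatException e) {
          SnapshotRef ref = table.refs().get(asOfVersion);    // named reference
          if (ref == null) {
            throw new IllegalArgumentException(
                "Cannot find matching snapshot ID or reference name for version " + asOfVersion);
          }
          return table.snapshot(ref.snapshotId());
        }
      }
      return table.currentSnapshot();                         // no time travel
    }

This is also why the write_iceberg_branch.q.out statistics change in PATCH 3/6:
a read against a branch now reports the branch snapshot's row counts instead of
the current main-branch stats.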