From 85836d9fe98361450af872736a638e2bb4e83917 Mon Sep 17 00:00:00 2001 From: wuwenchi Date: Mon, 19 Feb 2024 11:56:08 +0800 Subject: [PATCH 1/2] add useHiveSyncPartition --- .../catalog/external/HMSExternalTable.java | 10 ++++++++ .../hudi/HudiCachedPartitionProcessor.java | 25 +++++++++++-------- .../planner/external/hudi/HudiScanNode.java | 8 ++++-- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index bbadac6ecbf7b1..961dc6240d61bc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -89,6 +89,8 @@ public class HMSExternalTable extends ExternalTable { private static final String NUM_ROWS = "numRows"; + private static final String USE_HIVE_SYNC_PARTITION = "use_hive_sync_partition"; + static { SUPPORTED_HIVE_FILE_FORMATS = Sets.newHashSet(); SUPPORTED_HIVE_FILE_FORMATS.add("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"); @@ -194,6 +196,14 @@ private boolean supportedHoodieTable() { return inputFormatName != null && SUPPORTED_HUDI_FILE_FORMATS.contains(inputFormatName); } + /** + * Some data lakes (such as Hudi) will synchronize their partition information to HMS, + * then we can quickly obtain the partition information of the table from HMS. + */ + public boolean useHiveSyncPartition() { + return Boolean.parseBoolean(catalog.getProperties().getOrDefault(USE_HIVE_SYNC_PARTITION, "false")); + } + public boolean isHoodieCowTable() { if (remoteTable.getSd() == null) { return false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiCachedPartitionProcessor.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiCachedPartitionProcessor.java index f18c133f393ba5..065dd04a39e49d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiCachedPartitionProcessor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiCachedPartitionProcessor.java @@ -85,7 +85,7 @@ public void cleanTablePartitions(String dbName, String tblName) { } public TablePartitionValues getSnapshotPartitionValues(HMSExternalTable table, - HoodieTableMetaClient tableMetaClient, String timestamp) { + HoodieTableMetaClient tableMetaClient, String timestamp, boolean useHiveSyncPartition) { Preconditions.checkState(catalogId == table.getCatalog().getId()); Option partitionColumns = tableMetaClient.getTableConfig().getPartitionFields(); if (!partitionColumns.isPresent()) { @@ -98,7 +98,7 @@ public TablePartitionValues getSnapshotPartitionValues(HMSExternalTable table, } long lastTimestamp = Long.parseLong(lastInstant.get().getTimestamp()); if (Long.parseLong(timestamp) == lastTimestamp) { - return getPartitionValues(table, tableMetaClient); + return getPartitionValues(table, tableMetaClient, useHiveSyncPartition); } List partitionNameAndValues = getPartitionNamesBeforeOrEquals(timeline, timestamp); List partitionNames = Arrays.asList(partitionColumns.get()); @@ -109,7 +109,8 @@ public TablePartitionValues getSnapshotPartitionValues(HMSExternalTable table, return partitionValues; } - public TablePartitionValues getPartitionValues(HMSExternalTable table, HoodieTableMetaClient tableMetaClient) + public TablePartitionValues getPartitionValues(HMSExternalTable table, HoodieTableMetaClient tableMetaClient, + boolean useHiveSyncPartition) throws CacheException { Preconditions.checkState(catalogId == table.getCatalog().getId()); Option partitionColumns = tableMetaClient.getTableConfig().getPartitionFields(); @@ -143,13 +144,17 @@ public TablePartitionValues getPartitionValues(HMSExternalTable table, HoodieTab } HMSExternalCatalog catalog = (HMSExternalCatalog) table.getCatalog(); List partitionNames; - // When a Hudi table is synchronized to HMS, the partition information is also synchronized, - // so even if the metastore is not enabled in the Hudi table - // (for example, if the Metastore is false for a Hudi table created with Flink), - // we can still obtain the partition information through the HMS API. - partitionNames = catalog.getClient().listPartitionNames(table.getDbName(), table.getName()); - if (partitionNames.size() == 0) { - LOG.warn("Failed to get partitions from hms api, switch it from hudi api."); + if (useHiveSyncPartition) { + // When a Hudi table is synchronized to HMS, the partition information is also synchronized, + // so even if the metastore is not enabled in the Hudi table + // (for example, if the Metastore is false for a Hudi table created with Flink), + // we can still obtain the partition information through the HMS API. + partitionNames = catalog.getClient().listPartitionNames(table.getDbName(), table.getName()); + if (partitionNames.size() == 0) { + LOG.warn("Failed to get partitions from hms api, switch it from hudi api."); + partitionNames = getAllPartitionNames(tableMetaClient); + } + } else { partitionNames = getAllPartitionNames(tableMetaClient); } List partitionColumnsList = Arrays.asList(partitionColumns.get()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiScanNode.java index 9d601e71daa951..803dadae03d5d4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/hudi/HudiScanNode.java @@ -82,6 +82,8 @@ public class HudiScanNode extends HiveScanNode { private final AtomicLong noLogsSplitNum = new AtomicLong(0); + private final boolean useHiveSyncPartition; + /** * External file scan node for Query Hudi table * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv @@ -97,6 +99,7 @@ public HudiScanNode(PlanNodeId id, TupleDescriptor desc, boolean needCheckColumn } else { LOG.debug("Hudi table {} is a mor table, and will use JNI to read data in BE", hmsTable.getName()); } + useHiveSyncPartition = hmsTable.useHiveSyncPartition(); } @Override @@ -166,9 +169,10 @@ private List getPrunedPartitions( .getExtMetaCacheMgr().getHudiPartitionProcess(hmsTable.getCatalog()); TablePartitionValues partitionValues; if (snapshotTimestamp.isPresent()) { - partitionValues = processor.getSnapshotPartitionValues(hmsTable, metaClient, snapshotTimestamp.get()); + partitionValues = processor.getSnapshotPartitionValues( + hmsTable, metaClient, snapshotTimestamp.get(), useHiveSyncPartition); } else { - partitionValues = processor.getPartitionValues(hmsTable, metaClient); + partitionValues = processor.getPartitionValues(hmsTable, metaClient, useHiveSyncPartition); } if (partitionValues != null) { // 2. prune partitions by expr From 566084b3477de30852f38b0420511806ca412cd3 Mon Sep 17 00:00:00 2001 From: wuwenchi Date: Mon, 19 Feb 2024 15:51:03 +0800 Subject: [PATCH 2/2] doc --- docs/en/docs/lakehouse/multi-catalog/hudi.md | 6 ++++++ docs/zh-CN/docs/lakehouse/multi-catalog/hudi.md | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/docs/en/docs/lakehouse/multi-catalog/hudi.md b/docs/en/docs/lakehouse/multi-catalog/hudi.md index 52892db2df2174..a52c2370ced161 100644 --- a/docs/en/docs/lakehouse/multi-catalog/hudi.md +++ b/docs/en/docs/lakehouse/multi-catalog/hudi.md @@ -55,6 +55,12 @@ CREATE CATALOG hudi PROPERTIES ( ); ``` +Optional configuration parameters: + +|name|description|default| +|---|---|---| +|use_hive_sync_partition|Use hms synchronized partition data|false| + ## Column Type Mapping Same as that in Hive Catalogs. See the relevant section in [Hive](./hive.md). diff --git a/docs/zh-CN/docs/lakehouse/multi-catalog/hudi.md b/docs/zh-CN/docs/lakehouse/multi-catalog/hudi.md index b619283cacf5bc..38bb26d3bc79ed 100644 --- a/docs/zh-CN/docs/lakehouse/multi-catalog/hudi.md +++ b/docs/zh-CN/docs/lakehouse/multi-catalog/hudi.md @@ -55,6 +55,12 @@ CREATE CATALOG hudi PROPERTIES ( ); ``` +可选配置参数: + +|参数名|说明|默认值| +|---|---|---| +|use_hive_sync_partition|使用hms已同步的分区数据|false| + ## 列类型映射 和 Hive Catalog 一致,可参阅 [Hive Catalog](./hive.md) 中 **列类型映射** 一节。