From 8169134718399b4c78b6dcba0e96a583ff72636d Mon Sep 17 00:00:00 2001 From: nicolas paris Date: Mon, 22 Aug 2022 16:54:42 +0200 Subject: [PATCH 1/5] Allow omit metadata fields for hive sync --- .../apache/hudi/common/table/TableSchemaResolver.java | 9 +++++++++ .../java/org/apache/hudi/hive/HiveSyncConfigHolder.java | 5 +++++ .../src/main/java/org/apache/hudi/hive/HiveSyncTool.java | 4 +++- .../org/apache/hudi/sync/common/HoodieSyncClient.java | 8 ++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index 657ac57c6375c..87ac71dcc35eb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -171,6 +171,15 @@ public MessageType getTableParquetSchema() throws Exception { return convertAvroSchemaToParquet(getTableAvroSchema(true)); } + /** + * Gets users data schema for a hoodie table in Parquet format. + * + * @return Parquet schema for the table + */ + public MessageType getTableParquetSchemaWithoutMetadataFields() throws Exception { + return convertAvroSchemaToParquet(getTableAvroSchema(false)); + } + /** * Gets users data schema for a hoodie table in Avro format. * diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java index 3877782c92026..61a4b0759d4b8 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java @@ -103,6 +103,11 @@ public class HiveSyncConfigHolder { .key("hoodie.datasource.hive_sync.create_managed_table") .defaultValue(false) .withDocumentation("Whether to sync the table as managed table."); + + public static final ConfigProperty HIVE_OMIT_METADATA_FIELDS = ConfigProperty + .key("hoodie.datasource.hive_sync.omit_metadata_fields") + .defaultValue(false) + .withDocumentation("Whether to omit the hoodie metadata fields in the target table."); public static final ConfigProperty HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty .key("hoodie.datasource.hive_sync.batch_num") .defaultValue(1000) diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index d0a40bbc181c5..d02144fd2c3d4 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -48,6 +48,7 @@ import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_OMIT_METADATA_FIELDS; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE; @@ -201,7 +202,8 @@ protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean tableExists = syncClient.tableExists(tableName); // Get the parquet schema for this table looking at the latest commit - MessageType schema = syncClient.getStorageSchema(); + MessageType schema = config.getBoolean(HIVE_OMIT_METADATA_FIELDS) ? syncClient.getStorageSchemaWithoutMetadataFields() : syncClient.getStorageSchema(); + // Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table, // so we disable the syncAsSparkDataSourceTable here to avoid read such kind table diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java index af06f5908ce39..6e380f8395aaa 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java @@ -104,6 +104,14 @@ public MessageType getStorageSchema() { } } + public MessageType getStorageSchemaWithoutMetadataFields() { + try { + return new TableSchemaResolver(metaClient).getTableParquetSchemaWithoutMetadataFields(); + } catch (Exception e) { + throw new HoodieSyncException("Failed to read schema from storage.", e); + } + } + public List getWrittenPartitionsSince(Option lastCommitTimeSynced) { if (!lastCommitTimeSynced.isPresent()) { LOG.info("Last commit time synced is not known, listing all partitions in " From b06e1de934baed1c3cf3be2e51a1827c07cbae15 Mon Sep 17 00:00:00 2001 From: nicolas paris Date: Mon, 22 Aug 2022 17:10:34 +0200 Subject: [PATCH 2/5] Add since version --- .../java/org/apache/hudi/hive/HiveSyncConfigHolder.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java index 61a4b0759d4b8..a11672c59788c 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java @@ -103,11 +103,11 @@ public class HiveSyncConfigHolder { .key("hoodie.datasource.hive_sync.create_managed_table") .defaultValue(false) .withDocumentation("Whether to sync the table as managed table."); - public static final ConfigProperty HIVE_OMIT_METADATA_FIELDS = ConfigProperty - .key("hoodie.datasource.hive_sync.omit_metadata_fields") - .defaultValue(false) - .withDocumentation("Whether to omit the hoodie metadata fields in the target table."); + .key("hoodie.datasource.hive_sync.omit_metadata_fields") + .defaultValue(false) + .sinceVersion("0.12.1") + .withDocumentation("Whether to omit the hoodie metadata fields in the target table."); public static final ConfigProperty HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty .key("hoodie.datasource.hive_sync.batch_num") .defaultValue(1000) From 0ffee71e82753d79b6dcfc595a1b25146883fd14 Mon Sep 17 00:00:00 2001 From: nicolas paris Date: Thu, 22 Sep 2022 14:52:09 +0200 Subject: [PATCH 3/5] Add feature for 0.13.0 --- .../main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java index a11672c59788c..bc62a6ba8afcf 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java @@ -106,7 +106,7 @@ public class HiveSyncConfigHolder { public static final ConfigProperty HIVE_OMIT_METADATA_FIELDS = ConfigProperty .key("hoodie.datasource.hive_sync.omit_metadata_fields") .defaultValue(false) - .sinceVersion("0.12.1") + .sinceVersion("0.13.0") .withDocumentation("Whether to omit the hoodie metadata fields in the target table."); public static final ConfigProperty HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty .key("hoodie.datasource.hive_sync.batch_num") From f06264ba800c37b82240d0d0ddab82e182b7da6f Mon Sep 17 00:00:00 2001 From: Raymond Xu <2701446+xushiyan@users.noreply.github.com> Date: Wed, 19 Oct 2022 20:30:58 +0800 Subject: [PATCH 4/5] parameterize api and fix config name --- .../apache/hudi/common/table/TableSchemaResolver.java | 4 ++-- .../java/org/apache/hudi/hive/HiveSyncConfigHolder.java | 2 +- .../src/main/java/org/apache/hudi/hive/HiveSyncTool.java | 4 ++-- .../hudi/sync/common/HoodieMetaSyncOperations.java | 9 +++++++++ .../org/apache/hudi/sync/common/HoodieSyncClient.java | 5 +++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java index 87ac71dcc35eb..d4379a258354c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java @@ -176,8 +176,8 @@ public MessageType getTableParquetSchema() throws Exception { * * @return Parquet schema for the table */ - public MessageType getTableParquetSchemaWithoutMetadataFields() throws Exception { - return convertAvroSchemaToParquet(getTableAvroSchema(false)); + public MessageType getTableParquetSchema(boolean includeMetadataField) throws Exception { + return convertAvroSchemaToParquet(getTableAvroSchema(includeMetadataField)); } /** diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java index bc62a6ba8afcf..02d6c0f2174ce 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java @@ -103,7 +103,7 @@ public class HiveSyncConfigHolder { .key("hoodie.datasource.hive_sync.create_managed_table") .defaultValue(false) .withDocumentation("Whether to sync the table as managed table."); - public static final ConfigProperty HIVE_OMIT_METADATA_FIELDS = ConfigProperty + public static final ConfigProperty HIVE_SYNC_OMIT_METADATA_FIELDS = ConfigProperty .key("hoodie.datasource.hive_sync.omit_metadata_fields") .defaultValue(false) .sinceVersion("0.13.0") diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index d02144fd2c3d4..88e7d743e1d31 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -48,7 +48,7 @@ import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS; -import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_OMIT_METADATA_FIELDS; +import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_OMIT_METADATA_FIELDS; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE; import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE; @@ -202,7 +202,7 @@ protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat, boolean tableExists = syncClient.tableExists(tableName); // Get the parquet schema for this table looking at the latest commit - MessageType schema = config.getBoolean(HIVE_OMIT_METADATA_FIELDS) ? syncClient.getStorageSchemaWithoutMetadataFields() : syncClient.getStorageSchema(); + MessageType schema = syncClient.getStorageSchema(!config.getBoolean(HIVE_SYNC_OMIT_METADATA_FIELDS)); // Currently HoodieBootstrapRelation does support reading bootstrap MOR rt table, diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java index 49edbffd454a2..933c05e20d97d 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java @@ -124,6 +124,15 @@ default MessageType getStorageSchema() { return null; } + /** + * Get the schema from the Hudi table on storage. + * + * @param includeMetadataField true if to include metadata fields in the schema + */ + default MessageType getStorageSchema(boolean includeMetadataField) { + return null; + } + /** * Update schema for the table in the metastore. */ diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java index 6e380f8395aaa..56ff82f5e45a5 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java @@ -104,9 +104,10 @@ public MessageType getStorageSchema() { } } - public MessageType getStorageSchemaWithoutMetadataFields() { + @Override + public MessageType getStorageSchema(boolean includeMetadataField) { try { - return new TableSchemaResolver(metaClient).getTableParquetSchemaWithoutMetadataFields(); + return new TableSchemaResolver(metaClient).getTableParquetSchema(includeMetadataField); } catch (Exception e) { throw new HoodieSyncException("Failed to read schema from storage.", e); } From 071adff869adf398f19ef0a1e77fcbc9b0baa7d3 Mon Sep 17 00:00:00 2001 From: Raymond Xu <2701446+xushiyan@users.noreply.github.com> Date: Wed, 19 Oct 2022 20:38:46 +0800 Subject: [PATCH 5/5] fix config plumbing --- .../src/main/java/org/apache/hudi/hive/HiveSyncConfig.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java index 0d490c4ab14a1..d9d733119aa5c 100644 --- a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java @@ -58,6 +58,7 @@ public class HiveSyncConfig extends HoodieSyncConfig { public static final ConfigProperty HIVE_SYNC_AS_DATA_SOURCE_TABLE = HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE; public static final ConfigProperty HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD; public static final ConfigProperty HIVE_CREATE_MANAGED_TABLE = HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE; + public static final ConfigProperty HIVE_SYNC_OMIT_METADATA_FIELDS = HiveSyncConfigHolder.HIVE_SYNC_OMIT_METADATA_FIELDS; public static final ConfigProperty HIVE_BATCH_SYNC_PARTITION_NUM = HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM; public static final ConfigProperty HIVE_SYNC_MODE = HiveSyncConfigHolder.HIVE_SYNC_MODE; public static final ConfigProperty HIVE_SYNC_BUCKET_SYNC = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC; @@ -130,6 +131,8 @@ public static class HiveSyncConfigParams { public Boolean supportTimestamp; @Parameter(names = {"--managed-table"}, description = "Create a managed table") public Boolean createManagedTable; + @Parameter(names = {"--omit-metafields"}, description = "Omit metafields in schema") + public Boolean omitMetaFields; @Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive") public Integer batchSyncNum; @Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.") @@ -167,6 +170,7 @@ public TypedProperties toProps() { props.setPropertyIfNonNull(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), syncAsSparkDataSourceTable); props.setPropertyIfNonNull(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), sparkSchemaLengthThreshold); props.setPropertyIfNonNull(HIVE_CREATE_MANAGED_TABLE.key(), createManagedTable); + props.setPropertyIfNonNull(HIVE_SYNC_OMIT_METADATA_FIELDS.key(), omitMetaFields); props.setPropertyIfNonNull(HIVE_BATCH_SYNC_PARTITION_NUM.key(), batchSyncNum); props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC.key(), bucketSync); props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), bucketSpec);