From c139aacaf9fde30425c3be2eb8756f816a1d03d8 Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Fri, 20 May 2022 12:57:16 +0530 Subject: [PATCH 1/2] [HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle --- .../utilities/deltastreamer/DeltaSync.java | 2 - .../hudi-utilities-slim-bundle/README.md | 89 +++++++++++- packaging/hudi-utilities-slim-bundle/pom.xml | 135 +++--------------- pom.xml | 6 + 4 files changed, 108 insertions(+), 124 deletions(-) diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index a4a7e10abc004..0ae72f94b82e0 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -605,8 +605,6 @@ private Pair, JavaRDD> writeToSink(JavaRDD 0; - long hiveSyncTimeMs = 0; - long metaSyncTimeMs = 0; if (!hasErrors || cfg.commitOnErrors) { HashMap checkpointCommitMetadata = new HashMap<>(); if (checkpointStr != null) { diff --git a/packaging/hudi-utilities-slim-bundle/README.md b/packaging/hudi-utilities-slim-bundle/README.md index 58353c403d325..60ee739153fdd 100644 --- a/packaging/hudi-utilities-slim-bundle/README.md +++ b/packaging/hudi-utilities-slim-bundle/README.md @@ -17,6 +17,89 @@ # Usage of hudi-utilities-slim-bundle -Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules. -This new bundle is intended to be used with Hudi Spark bundle together, if using hudi-utilities-bundle solely -introduces problems for a specific Spark version. \ No newline at end of file +Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules. 
This new bundle is intended to be used together with the Hudi Spark bundle when using +hudi-utilities-bundle alone causes problems for a specific Spark version. + +## Example with Spark 2.4.7 + +* Build Hudi: `mvn clean install -DskipTests` +* Run deltastreamer + +``` +bin/spark-submit \ + --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.driver.maxResultSize=1g \ + --conf spark.ui.port=6679 \ + --packages org.apache.spark:spark-avro_2.11:2.4.7 \ + --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar \ + --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar` \ + --props `ls /path/to/hudi/dfs-source.properties` \ + --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --source-ordering-field tpep_dropoff_datetime \ + --table-type COPY_ON_WRITE \ + --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark24/ \ + --target-table ny_hudi_tbl \ + --op UPSERT \ + --continuous \ + --source-limit 5000000 \ + --min-sync-interval-seconds 60 +``` + +## Example with Spark 3.1.2 + +* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12` +* Run deltastreamer + +``` +bin/spark-submit \ + --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.driver.maxResultSize=1g \ + --conf spark.ui.port=6679 \ + --packages org.apache.spark:spark-avro_2.12:3.1.2 \ + --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \ + --class 
org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \ + --props `ls /path/to/hudi/dfs-source.properties` \ + --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --source-ordering-field tpep_dropoff_datetime \ + --table-type COPY_ON_WRITE \ + --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark31/ \ + --target-table ny_hudi_tbl \ + --op UPSERT \ + --continuous \ + --source-limit 5000000 \ + --min-sync-interval-seconds 60 +``` + +## Example with Spark 3.2.0 + +* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12` +* Run deltastreamer + +``` +bin/spark-submit \ + --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.sql.catalogImplementation=hive \ + --conf spark.driver.maxResultSize=1g \ + --conf spark.ui.port=6679 \ + --packages org.apache.spark:spark-avro_2.12:3.2.0 \ + --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \ + --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \ + --props `ls /path/to/hudi/dfs-source.properties` \ + --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \ + --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \ + --source-ordering-field tpep_dropoff_datetime \ + --table-type COPY_ON_WRITE \ + --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark32/ \ + --target-table ny_hudi_tbl \ + --op UPSERT \ + --continuous \ + --source-limit 5000000 \ + --min-sync-interval-seconds 60 +``` diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml 
b/packaging/hudi-utilities-slim-bundle/pom.xml index 60f0af9d64f07..cd1813a2861cd 100644 --- a/packaging/hudi-utilities-slim-bundle/pom.xml +++ b/packaging/hudi-utilities-slim-bundle/pom.xml @@ -77,7 +77,7 @@ - true + true META-INF/LICENSE @@ -92,9 +92,7 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-client-common - org.apache.hudi:hudi-spark-client org.apache.hudi:hudi-utilities_${scala.binary.version} - org.apache.hudi:hudi-hive-sync org.apache.hudi:hudi-sync-common org.apache.hudi:hudi-hadoop-mr org.apache.hudi:hudi-timeline-service @@ -136,13 +134,6 @@ org.apache.kafka:kafka_${scala.binary.version} com.101tec:zkclient org.apache.kafka:kafka-clients - - org.apache.hive:hive-common - org.apache.hive:hive-service - org.apache.hive:hive-service-rpc - org.apache.hive:hive-metastore - org.apache.hive:hive-jdbc - org.apache.hbase:hbase-client org.apache.hbase:hbase-common org.apache.hbase:hbase-hadoop-compat @@ -339,121 +330,27 @@ org.apache.hudi - hudi-common - ${project.version} - - - org.apache.hudi - hudi-client-common - ${project.version} - - - org.apache.hudi - hudi-spark-client - ${project.version} - - - org.apache.hudi - hudi-hive-sync + hudi-utilities_${scala.binary.version} ${project.version} - javax.servlet - servlet-api + org.apache.hudi + hudi-spark-common_${scala.binary.version} + + + org.apache.hudi + hudi-spark_${scala.binary.version} + + + org.apache.hudi + ${hudi.spark.module}_${scala.binary.version} + + + org.apache.hudi + ${hudi.spark.common.module} - - org.apache.hudi - hudi-spark-common_${scala.binary.version} - ${project.version} - provided - - - org.apache.hudi - hudi-spark_${scala.binary.version} - ${project.version} - provided - - - org.apache.hudi - ${hudi.spark.module}_${scala.binary.version} - ${project.version} - provided - - - org.apache.hudi - ${hudi.spark.common.module} - ${project.version} - provided - - - org.apache.hudi - hudi-utilities_${scala.binary.version} - ${project.version} - - - - - ${hive.groupid} - hive-service - 
${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-service-rpc - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-jdbc - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-metastore - ${hive.version} - ${utilities.bundle.hive.scope} - - - - ${hive.groupid} - hive-common - ${hive.version} - ${utilities.bundle.hive.scope} - - - - org.apache.htrace - htrace-core - ${htrace.version} - compile - - - - - org.apache.curator - curator-framework - ${zk-curator.version} - - - - org.apache.curator - curator-client - ${zk-curator.version} - - - - org.apache.curator - curator-recipes - ${zk-curator.version} - diff --git a/pom.xml b/pom.xml index d898d34d35e43..1e67bdd331564 100644 --- a/pom.xml +++ b/pom.xml @@ -99,6 +99,7 @@ 2.8.1 5.3.4 2.17 + 3.0.1-b12 1.10.1 5.7.0-M1 5.7.0-M1 @@ -556,6 +557,11 @@ jersey-container-servlet-core ${glassfish.version} + + org.glassfish + javax.el + ${glassfish.el.version} + From 6cbe9ef01b44d6897578ebe6203d0670d2629e19 Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Wed, 25 May 2022 22:26:22 +0530 Subject: [PATCH 2/2] Remove hudi-sync-common and address other feedback --- packaging/hudi-utilities-slim-bundle/pom.xml | 18 +++++------------- pom.xml | 1 + 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml b/packaging/hudi-utilities-slim-bundle/pom.xml index cd1813a2861cd..993e2ad7fd912 100644 --- a/packaging/hudi-utilities-slim-bundle/pom.xml +++ b/packaging/hudi-utilities-slim-bundle/pom.xml @@ -93,7 +93,6 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-client-common org.apache.hudi:hudi-utilities_${scala.binary.version} - org.apache.hudi:hudi-sync-common org.apache.hudi:hudi-hadoop-mr org.apache.hudi:hudi-timeline-service org.apache.hudi:hudi-aws @@ -169,10 +168,6 @@ com.beust.jcommander. org.apache.hudi.com.beust.jcommander. - - org.apache.hive.jdbc. 
- ${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc. - org.apache.commons.io. org.apache.hudi.org.apache.commons.io. @@ -196,10 +191,6 @@ org.apache.hadoop.hive.metastore. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore. - - org.apache.hive.common. - ${utilities.bundle.hive.shade.prefix}org.apache.hive.common. - org.apache.hadoop.hive.common. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common. @@ -208,10 +199,6 @@ org.apache.hadoop.hive.conf. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf. - - org.apache.hive.service. - ${utilities.bundle.hive.shade.prefix}org.apache.hive.service. - org.apache.hadoop.hive.service. ${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service. @@ -328,6 +315,11 @@ + + org.apache.hudi + hudi-common + ${project.version} + org.apache.hudi hudi-utilities_${scala.binary.version} diff --git a/pom.xml b/pom.xml index 1e67bdd331564..1188ec620aa39 100644 --- a/pom.xml +++ b/pom.xml @@ -561,6 +561,7 @@ org.glassfish javax.el ${glassfish.el.version} + provided