From 4e96bf34b549d30e1dffc3f75be8ba9af5aea451 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 10 Dec 2024 18:45:01 -0500 Subject: [PATCH 1/2] Switch to use unshaded hive-exec for io expansion service * This enables the shadow jar pick up dependencies of newer versions --- .../IO_Iceberg_Integration_Tests.json | 2 +- ...eam_PostCommit_Python_Xlang_IO_Direct.json | 2 +- sdks/java/io/expansion-service/build.gradle | 3 +- sdks/java/io/iceberg/hive/build.gradle | 34 ++++++++-- sdks/java/io/iceberg/hive/exec/build.gradle | 65 ------------------- settings.gradle.kts | 2 - 6 files changed, 33 insertions(+), 75 deletions(-) delete mode 100644 sdks/java/io/iceberg/hive/exec/build.gradle diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 3f63c0c9975f..bbdc3a3910ef 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 3 } diff --git a/.github/trigger_files/beam_PostCommit_Python_Xlang_IO_Direct.json b/.github/trigger_files/beam_PostCommit_Python_Xlang_IO_Direct.json index b26833333238..e3d6056a5de9 100644 --- a/.github/trigger_files/beam_PostCommit_Python_Xlang_IO_Direct.json +++ b/.github/trigger_files/beam_PostCommit_Python_Xlang_IO_Direct.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 1 } diff --git a/sdks/java/io/expansion-service/build.gradle b/sdks/java/io/expansion-service/build.gradle index 421719b8f986..eec2d0b8671b 100644 --- a/sdks/java/io/expansion-service/build.gradle +++ b/sdks/java/io/expansion-service/build.gradle @@ -60,10 +60,11 @@ dependencies { runtimeOnly library.java.bigdataoss_gcs_connector // Needed for HiveCatalog runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2") - runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") + runtimeOnly project(path: ":sdks:java:io:iceberg:hive") runtimeOnly library.java.kafka_clients runtimeOnly library.java.slf4j_jdk14 + testImplementation(library.java.junit) } task runExpansionService (type: JavaExec) { diff --git a/sdks/java/io/iceberg/hive/build.gradle b/sdks/java/io/iceberg/hive/build.gradle index bfa6c75251c4..ea4ed630d2ed 100644 --- a/sdks/java/io/iceberg/hive/build.gradle +++ b/sdks/java/io/iceberg/hive/build.gradle @@ -21,19 +21,40 @@ plugins { id 'org.apache.beam.module' } applyJavaNature( automaticModuleName: 'org.apache.beam.sdk.io.iceberg.hive', exportJavadoc: false, - shadowClosure: {}, + validateShadowJar: false, // fails with "Could not receive a message from the daemon.", likely a shadow plugin bug + publish: false, // it's an intermediate jar for io-expansion-service ) description = "Apache Beam :: SDKs :: Java :: IO :: Iceberg :: Hive" ext.summary = "Runtime dependencies needed for Hive catalog integration." def hive_version = "3.1.3" +def hbase_version = "2.6.1-hadoop3" +def hadoop_version = "3.4.1" def iceberg_version = "1.4.2" +def avatica_version = "1.25.0" dependencies { // dependencies needed to run with iceberg's hive catalog - runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:$iceberg_version") - runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") + // these dependencies are going to be included in io-expansion-service + implementation ("org.apache.iceberg:iceberg-hive-metastore:$iceberg_version") + permitUnusedDeclared ("org.apache.iceberg:iceberg-hive-metastore:$iceberg_version") + // analyzeClassesDependencies fails with "Cannot accept visitor on URL", likely the plugin does not recognize "core" classifier + // use "core" classifier to depend on un-shaded jar + runtimeOnly ("org.apache.hive:hive-exec:$hive_version:core") { + // old hadoop-yarn-server-resourcemanager contains critical log4j vulneribility + exclude group: "org.apache.hadoop", module: "hadoop-yarn-server-resourcemanager" + // old hadoop-yarn-server-resourcemanager contains critical log4j and hadoop vulneribility + exclude group: "org.apache.hbase", module: "hbase-client" + // old calcite leaks old protobuf-java + exclude group: "org.apache.calcite.avatica", module: "avatica" + } + runtimeOnly ("org.apache.hadoop:hadoop-yarn-server-resourcemanager:$hadoop_version") + runtimeOnly ("org.apache.hbase:hbase-client:$hbase_version") + runtimeOnly ("org.apache.calcite.avatica:avatica-core:$avatica_version") + implementation ("org.apache.hive:hive-metastore:$hive_version") + runtimeOnly ("org.apache.iceberg:iceberg-parquet:$iceberg_version") + permitUnusedDeclared ("org.apache.hive:hive-metastore:$hive_version") // ----- below dependencies are for testing and will not appear in the shaded jar ----- // Beam IcebergIO dependencies @@ -52,8 +73,6 @@ dependencies { testImplementation library.java.junit // needed to set up test Hive Metastore and run tests - testImplementation ("org.apache.iceberg:iceberg-hive-metastore:$iceberg_version") - testImplementation project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") testRuntimeOnly ("org.apache.hive.hcatalog:hive-hcatalog-core:$hive_version") { exclude group: "org.apache.hive", module: "hive-exec" exclude group: "org.apache.parquet", module: "parquet-hadoop-bundle" @@ -62,6 +81,11 @@ dependencies { testImplementation "org.apache.parquet:parquet-column:1.12.0" } +configurations.all { + // the fatjar "parquet-hadoop-bundle" conflicts with "parquet-hadoop" used by org.apache.iceberg:iceberg-parquet + exclude group: "org.apache.parquet", module: "parquet-hadoop-bundle" +} + task integrationTest(type: Test) { group = "Verification" def gcpTempLocation = project.findProperty('gcpTempLocation') ?: 'gs://temp-storage-for-end-to-end-tests/iceberg-hive-it' diff --git a/sdks/java/io/iceberg/hive/exec/build.gradle b/sdks/java/io/iceberg/hive/exec/build.gradle deleted file mode 100644 index f266ab2ef4db..000000000000 --- a/sdks/java/io/iceberg/hive/exec/build.gradle +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -plugins { - id 'org.apache.beam.module' - id 'java' - id 'com.github.johnrengelman.shadow' -} - -dependencies { - implementation("org.apache.hive:hive-exec:3.1.3") - permitUnusedDeclared("org.apache.hive:hive-exec:3.1.3") -} - -configurations { - shadow -} - -artifacts { - shadow(archives(shadowJar) { - builtBy shadowJar - }) -} - -shadowJar { - zip64 true - - def problematicPackages = [ - 'com.google.protobuf', - 'com.google.common', - 'shaded.parquet', - 'org.apache.parquet', - 'org.joda' - ] - - problematicPackages.forEach { - relocate it, getJavaRelocatedPath("iceberg.hive.${it}") - } - - version "3.1.3" - mergeServiceFiles() - - exclude 'LICENSE' - exclude( - 'org/xml/**', - 'javax/**', - 'com/sun/**' - ) -} -description = "Apache Beam :: SDKs :: Java :: IO :: Iceberg :: Hive :: Exec" -ext.summary = "A copy of the hive-exec dependency with some popular libraries relocated." diff --git a/settings.gradle.kts b/settings.gradle.kts index d90bb3fb5b82..a8bee45a05ac 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -357,5 +357,3 @@ include("sdks:java:extensions:combiners") findProject(":sdks:java:extensions:combiners")?.name = "combiners" include("sdks:java:io:iceberg:hive") findProject(":sdks:java:io:iceberg:hive")?.name = "hive" -include("sdks:java:io:iceberg:hive:exec") -findProject(":sdks:java:io:iceberg:hive:exec")?.name = "exec" From 0f3704fdc0447ca1a0846c2c9ef6769aa3237650 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Thu, 12 Dec 2024 12:35:03 -0500 Subject: [PATCH 2/2] cleanup leftovers --- sdks/java/io/expansion-service/build.gradle | 1 - sdks/java/io/iceberg/hive/build.gradle | 1 - 2 files changed, 2 deletions(-) diff --git a/sdks/java/io/expansion-service/build.gradle b/sdks/java/io/expansion-service/build.gradle index eec2d0b8671b..a27a66b1f3dc 100644 --- a/sdks/java/io/expansion-service/build.gradle +++ b/sdks/java/io/expansion-service/build.gradle @@ -64,7 +64,6 @@ dependencies { runtimeOnly library.java.kafka_clients runtimeOnly library.java.slf4j_jdk14 - testImplementation(library.java.junit) } task runExpansionService (type: JavaExec) { diff --git a/sdks/java/io/iceberg/hive/build.gradle b/sdks/java/io/iceberg/hive/build.gradle index ea4ed630d2ed..2d0d2bcc5cde 100644 --- a/sdks/java/io/iceberg/hive/build.gradle +++ b/sdks/java/io/iceberg/hive/build.gradle @@ -21,7 +21,6 @@ plugins { id 'org.apache.beam.module' } applyJavaNature( automaticModuleName: 'org.apache.beam.sdk.io.iceberg.hive', exportJavadoc: false, - validateShadowJar: false, // fails with "Could not receive a message from the daemon.", likely a shadow plugin bug publish: false, // it's an intermediate jar for io-expansion-service )