From 673837a6f05c582a74cb5fd2aa98d178d3af2b7c Mon Sep 17 00:00:00 2001
From: jlf <1251489546@qq.com>
Date: Tue, 8 Apr 2025 15:13:17 +0800
Subject: [PATCH 1/4] [CH][draft] iceberg UT
---
backends-clickhouse/pom.xml | 111 ++++
.../execution/iceberg/TestFlinkUpsert.java | 538 ++++++++++++++++++
.../ClickHouseIcebergHiveTableSupport.scala | 101 ++++
.../Storages/SubstraitSource/FileReader.cpp | 4 +-
4 files changed, 752 insertions(+), 2 deletions(-)
create mode 100644 backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
create mode 100644 backends-clickhouse/src-iceberg/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
diff --git a/backends-clickhouse/pom.xml b/backends-clickhouse/pom.xml
index 8a20a0a2cb06..a8f4a25b7b41 100644
--- a/backends-clickhouse/pom.xml
+++ b/backends-clickhouse/pom.xml
@@ -48,6 +48,9 @@
false
+
+ 1.16.2
+
org.apache.gluten
@@ -64,6 +67,103 @@
org.scala-lang.modules
scala-collection-compat_${scala.binary.version}
+
+ org.apache.iceberg
+ iceberg-api
+ ${iceberg.version}
+ test-jar
+ test
+
+
+ org.apache.iceberg
+ iceberg-flink-runtime-1.16
+ ${iceberg.version}
+ provided
+
+
+ org.apache.iceberg
+ iceberg-flink-1.16
+ ${iceberg.version}
+ test-jar
+ test
+
+
+ org.apache.iceberg
+ iceberg-hive-metastore
+ ${iceberg.version}
+ test-jar
+ test
+
+
+
+ org.junit.jupiter
+ junit-jupiter
+ test
+ 5.10.1
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+ test
+ 5.10.1
+
+
+ org.apache.flink
+ flink-streaming-java
+ ${flink.version}
+ provided
+
+
+ org.apache.flink
+ flink-table-api-bridge-base
+ ${flink.version}
+ provided
+
+
+ org.apache.flink
+ flink-table-api-java
+ ${flink.version}
+ provided
+
+
+ org.apache.flink
+ flink-table-api-java-bridge
+ ${flink.version}
+ provided
+
+
+ org.apache.flink
+ flink-test-utils
+ ${flink.version}
+ provided
+
+
+
+ org.apache.flink
+ flink-table-planner_2.12
+ ${flink.version}
+ provided
+
+
+ org.scala-lang
+ scala-library
+
+
+
+
+ org.apache.iceberg
+ iceberg-spark-${sparkbundle.version}_${scala.binary.version}
+ ${iceberg.version}
+ test-jar
+ test
+
+
+ org.apache.iceberg
+ iceberg-data
+ ${iceberg.version}
+ test-jar
+ test
+
@@ -551,6 +651,17 @@
true
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+
+ **/*Test.java
+ **/Test*.java
+
+ false
+
+
diff --git a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java b/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
new file mode 100644
index 000000000000..ec7503bb1a2a
--- /dev/null
+++ b/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
@@ -0,0 +1,538 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.execution.iceberg;
+
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.EnvironmentSettings;
+import org.apache.flink.table.api.TableEnvironment;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Parameter;
+import org.apache.iceberg.Parameters;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.catalog.Namespace;
+import org.apache.iceberg.flink.CatalogTestBase;
+import org.apache.iceberg.flink.MiniClusterResource;
+import org.apache.iceberg.flink.TestHelpers;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.TestTemplate;
+
+public class TestFlinkUpsert extends CatalogTestBase {
+
+ @Parameter(index = 2)
+ private FileFormat format;
+
+ @Parameter(index = 3)
+ private boolean isStreamingJob;
+
+ private final Map tableUpsertProps = Maps.newHashMap();
+ private TableEnvironment tEnv;
+ private SparkSession spark;
+ private ClickHouseIcebergHiveTableSupport hiveTableSupport;
+
+ @Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}")
+ public static List
+
+ true
+
+
org.apache.gluten
@@ -659,7 +663,7 @@
**/*Test.java
**/Test*.java
- false
+ ${surefire.skipTests}
diff --git a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java b/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
index ec7503bb1a2a..429feab086f3 100644
--- a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
+++ b/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
@@ -61,8 +61,8 @@ public class TestFlinkUpsert extends CatalogTestBase {
@Parameters(name = "catalogName={0}, baseNamespace={1}, format={2}, isStreaming={3}")
public static List parameters() {
List parameters = Lists.newArrayList();
- // ignore orc, it is not supported by ch backend
- for (FileFormat format : new FileFormat[] {FileFormat.PARQUET, FileFormat.AVRO}) {
+ // ignore ORC and AVRO, ch backend only supports PARQUET
+ for (FileFormat format : new FileFormat[] {FileFormat.PARQUET}) {
for (Boolean isStreaming : new Boolean[] {true, false}) {
// Only test with one catalog as this is a file operation concern.
// FlinkCatalogTestBase requires the catalog name start with testhadoop if using hadoop
diff --git a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java b/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
new file mode 100644
index 000000000000..c13c2fea70c2
--- /dev/null
+++ b/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gluten.execution.iceberg;
+
+import java.util.Map;
+import org.apache.iceberg.CatalogUtil;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.catalog.Namespace;
+import org.apache.iceberg.exceptions.AlreadyExistsException;
+import org.apache.iceberg.hive.HiveCatalog;
+import org.apache.iceberg.hive.TestHiveMetastore;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.spark.SparkCatalogConfig;
+import org.apache.iceberg.spark.source.TestPositionDeletesTable;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.junit.BeforeClass;
+import org.junit.runners.Parameterized;
+
+public class TestPositionDeletesTableGluten extends TestPositionDeletesTable {
+ private static final Map CATALOG_PROPS =
+ ImmutableMap.of("type", "hive", "default-namespace", "default", "cache-enabled", "false");
+ private static ClickHouseIcebergHiveTableSupport hiveTableSupport;
+
+ @BeforeClass
+ public static void startMetastoreAndSpark() {
+ metastore = new TestHiveMetastore();
+ metastore.start();
+ hiveConf = metastore.hiveConf();
+ hiveTableSupport = new ClickHouseIcebergHiveTableSupport();
+ hiveTableSupport.initSparkConf(
+ hiveConf.get("hive.metastore.uris"), SparkCatalogConfig.HIVE.catalogName(), null);
+ hiveTableSupport.initializeSession();
+ spark = hiveTableSupport.spark();
+ sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
+ catalog =
+ (HiveCatalog)
+ CatalogUtil.loadCatalog(
+ HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf);
+
+ try {
+ catalog.createNamespace(Namespace.of(new String[] {"default"}));
+ } catch (AlreadyExistsException ignore) {
+ }
+ }
+
+ @Parameterized.Parameters(
+ name =
+ "formatVersion = {0}, catalogName = {1}, implementation = {2}, config = {3}, fileFormat = {4}")
+ public static Object[][] parameters() {
+ // ignore ORC and AVRO, ch backend only supports PARQUET
+ return new Object[][] {
+ {
+ SparkCatalogConfig.HIVE.catalogName(),
+ SparkCatalogConfig.HIVE.implementation(),
+ CATALOG_PROPS,
+ FileFormat.PARQUET
+ }
+ };
+ }
+
+ public TestPositionDeletesTableGluten(
+ String catalogName, String implementation, Map config, FileFormat format) {
+ super(catalogName, implementation, config, format);
+ }
+}
From 0f9d2240fc68e3496703737368e9a3c4fbccdb6c Mon Sep 17 00:00:00 2001
From: jlf <1251489546@qq.com>
Date: Wed, 16 Apr 2025 17:12:16 +0800
Subject: [PATCH 3/4] [CH][draft] run iceberg UT with diff spark version
---
backends-clickhouse/pom.xml | 281 +++++++++++++++---
.../execution/iceberg/TestFlinkUpsert.java | 0
.../TestPositionDeletesTableGluten.java | 11 +
.../ClickHouseIcebergHiveTableSupport.scala | 0
.../TestPositionDeletesTableGluten.java | 89 ++++++
.../ClickHouseIcebergHiveTableSupport.scala | 101 +++++++
6 files changed, 444 insertions(+), 38 deletions(-)
rename backends-clickhouse/{src-iceberg => src-spark33}/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java (100%)
rename backends-clickhouse/{src-iceberg => src-spark33}/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java (92%)
rename backends-clickhouse/{src-iceberg => src-spark33}/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala (100%)
create mode 100644 backends-clickhouse/src-spark35/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
create mode 100644 backends-clickhouse/src-spark35/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
diff --git a/backends-clickhouse/pom.xml b/backends-clickhouse/pom.xml
index 0b35dc01535f..f095d499034c 100644
--- a/backends-clickhouse/pom.xml
+++ b/backends-clickhouse/pom.xml
@@ -43,6 +43,181 @@
+
+ spark-3.3
+
+ false
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+
+
+ add-spark33-sources
+ generate-sources
+
+ add-source
+
+
+
+ ${project.basedir}/src-spark33/main/scala
+ ${project.basedir}/src-spark33/main/java
+
+
+
+
+ add-spark33-resources
+ generate-resources
+
+ add-resource
+
+
+
+
+ ${project.basedir}/src-spark33/main/resources
+
+
+
+
+
+ add-spark33-test-sources
+ generate-test-sources
+
+ add-test-source
+
+
+
+ ${project.basedir}/src-spark33/test/scala
+ ${project.basedir}/src-spark33/test/java
+
+
+
+
+ add-spark33-test-resources
+ generate-test-resources
+
+ add-test-resource
+
+
+
+
+ ${project.basedir}/src-spark33/test/resources
+
+
+
+
+
+
+
+
+
+
+ spark-3.5
+
+ 1.7.36
+
+
+ false
+
+
+
+ org.apache.hadoop
+ hadoop-common
+ ${hadoop.version}
+ provided
+
+
+ org.apache.logging.log4j
+ log4j-slf4j-impl
+
+
+
+
+ org.apache.hadoop
+ hadoop-client
+ ${hadoop.version}
+ test
+
+
+ org.apache.logging.log4j
+ log4j-slf4j-impl
+
+
+
+
+ com.google.guava
+ guava
+ ${guava.version}
+ provided
+
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+
+
+ add-spark35-sources
+ generate-sources
+
+ add-source
+
+
+
+ ${project.basedir}/src-spark35/main/scala
+ ${project.basedir}/src-spark35/main/java
+
+
+
+
+ add-spark35-resources
+ generate-resources
+
+ add-resource
+
+
+
+
+ ${project.basedir}/src-spark35/main/resources
+
+
+
+
+
+ add-spark35-test-sources
+ generate-test-sources
+
+ add-test-source
+
+
+
+ ${project.basedir}/src-spark35/test/scala
+ ${project.basedir}/src-spark35/test/java
+
+
+
+
+ add-spark35-test-resources
+ generate-test-resources
+
+ add-test-resource
+
+
+
+
+ ${project.basedir}/src-spark35/test/resources
+
+
+
+
+
+
+
+
+
iceberg
@@ -74,6 +249,12 @@
test-jar
test
+
+ org.apache.iceberg
+ iceberg-api
+ ${iceberg.version}
+ test
+
org.apache.iceberg
iceberg-flink-runtime-1.16
@@ -87,6 +268,12 @@
test-jar
test
+
+ org.apache.iceberg
+ iceberg-flink-1.16
+ ${iceberg.version}
+ test
+
org.apache.iceberg
iceberg-hive-metastore
@@ -94,6 +281,12 @@
test-jar
test
+
+ org.apache.iceberg
+ iceberg-hive-metastore
+ ${iceberg.version}
+ test
+
org.junit.jupiter
@@ -157,6 +350,12 @@
test-jar
test
+
+ org.apache.iceberg
+ iceberg-spark-${sparkbundle.version}_${scala.binary.version}
+ ${iceberg.version}
+ test
+
org.apache.iceberg
iceberg-data
@@ -164,6 +363,12 @@
test-jar
test
+
+ org.apache.iceberg
+ iceberg-data
+ ${iceberg.version}
+ test
+
@@ -344,44 +549,44 @@
test
- org.apache.hive.hcatalog
- hive-hcatalog-core
- 2.3.9
- test
-
-
- org.pentaho
- pentaho-aggdesigner-algorithm
-
-
- net.minidev
- json-smart
-
-
- org.apache.hive
- hive-exec
-
-
- guava
- com.google.guava
-
-
- hadoop-common
- org.apache.hadoop
-
-
- hadoop-hdfs
- org.apache.hadoop
-
-
- protobuf-java
- com.google.protobuf
-
-
- jdk.tools
- jdk.tools
-
-
+ org.apache.hive.hcatalog
+ hive-hcatalog-core
+ 2.3.9
+ test
+
+
+ org.pentaho
+ pentaho-aggdesigner-algorithm
+
+
+ net.minidev
+ json-smart
+
+
+ org.apache.hive
+ hive-exec
+
+
+ guava
+ com.google.guava
+
+
+ hadoop-common
+ org.apache.hadoop
+
+
+ hadoop-hdfs
+ org.apache.hadoop
+
+
+ protobuf-java
+ com.google.protobuf
+
+
+ jdk.tools
+ jdk.tools
+
+
org.apache.hadoop
diff --git a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java b/backends-clickhouse/src-spark33/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
similarity index 100%
rename from backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
rename to backends-clickhouse/src-spark33/test/java/org/apache/gluten/execution/iceberg/TestFlinkUpsert.java
diff --git a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java b/backends-clickhouse/src-spark33/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
similarity index 92%
rename from backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
rename to backends-clickhouse/src-spark33/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
index c13c2fea70c2..103cdd7438db 100644
--- a/backends-clickhouse/src-iceberg/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
+++ b/backends-clickhouse/src-spark33/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
@@ -28,6 +28,7 @@
import org.apache.iceberg.spark.SparkCatalogConfig;
import org.apache.iceberg.spark.source.TestPositionDeletesTable;
import org.apache.spark.api.java.JavaSparkContext;
+import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.runners.Parameterized;
@@ -58,6 +59,16 @@ public static void startMetastoreAndSpark() {
}
}
+ @AfterClass
+ public static void stopMetastoreAndSpark() throws Exception {
+ catalog = null;
+ if (metastore != null) {
+ metastore.stop();
+ metastore = null;
+ }
+ hiveTableSupport.clean();
+ }
+
@Parameterized.Parameters(
name =
"formatVersion = {0}, catalogName = {1}, implementation = {2}, config = {3}, fileFormat = {4}")
diff --git a/backends-clickhouse/src-iceberg/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala b/backends-clickhouse/src-spark33/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
similarity index 100%
rename from backends-clickhouse/src-iceberg/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
rename to backends-clickhouse/src-spark33/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
diff --git a/backends-clickhouse/src-spark35/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java b/backends-clickhouse/src-spark35/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
new file mode 100644
index 000000000000..3421a094fae6
--- /dev/null
+++ b/backends-clickhouse/src-spark35/test/java/org/apache/gluten/execution/iceberg/TestPositionDeletesTableGluten.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gluten.execution.iceberg;
+
+import java.util.Map;
+import org.apache.iceberg.CatalogUtil;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.ParameterizedTestExtension;
+import org.apache.iceberg.Parameters;
+import org.apache.iceberg.catalog.Namespace;
+import org.apache.iceberg.exceptions.AlreadyExistsException;
+import org.apache.iceberg.hive.HiveCatalog;
+import org.apache.iceberg.hive.TestHiveMetastore;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.spark.SparkCatalogConfig;
+import org.apache.iceberg.spark.source.TestPositionDeletesTable;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.extension.ExtendWith;
+
+@ExtendWith({ParameterizedTestExtension.class})
+public class TestPositionDeletesTableGluten extends TestPositionDeletesTable {
+ private static final Map CATALOG_PROPS =
+ ImmutableMap.of("type", "hive", "default-namespace", "default", "cache-enabled", "false");
+ private static ClickHouseIcebergHiveTableSupport hiveTableSupport;
+
+ @BeforeAll
+ public static void startMetastoreAndSpark() {
+ metastore = new TestHiveMetastore();
+ metastore.start();
+ hiveConf = metastore.hiveConf();
+ hiveTableSupport = new ClickHouseIcebergHiveTableSupport();
+ hiveTableSupport.initSparkConf(
+ hiveConf.get("hive.metastore.uris"), SparkCatalogConfig.HIVE.catalogName(), null);
+ hiveTableSupport.initializeSession();
+ spark = hiveTableSupport.spark();
+ sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
+ catalog =
+ (HiveCatalog)
+ CatalogUtil.loadCatalog(
+ HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf);
+
+ try {
+ catalog.createNamespace(Namespace.of(new String[] {"default"}));
+ } catch (AlreadyExistsException ignore) {
+ }
+ }
+
+ @AfterAll
+ public static void stopMetastoreAndSpark() throws Exception {
+ catalog = null;
+ if (metastore != null) {
+ metastore.stop();
+ metastore = null;
+ }
+ hiveTableSupport.clean();
+ }
+
+ public TestPositionDeletesTableGluten() {}
+
+ @Parameters(name = "catalogName = {1}, implementation = {2}, config = {3}, fileFormat = {4}")
+ public static Object[][] parameters() {
+ // ignore ORC and AVRO, ch backend only supports PARQUET
+ return new Object[][] {
+ {
+ SparkCatalogConfig.HIVE.catalogName(),
+ SparkCatalogConfig.HIVE.implementation(),
+ CATALOG_PROPS,
+ FileFormat.PARQUET
+ }
+ };
+ }
+}
diff --git a/backends-clickhouse/src-spark35/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala b/backends-clickhouse/src-spark35/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
new file mode 100644
index 000000000000..c9e6a8ee5eef
--- /dev/null
+++ b/backends-clickhouse/src-spark35/test/scala/org/apache/gluten/execution/iceberg/ClickHouseIcebergHiveTableSupport.scala
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gluten.execution.iceberg
+
+import com.google.common.base.Strings
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig
+
+class ClickHouseIcebergHiveTableSupport {
+
+ private val sparkConf: SparkConf = new SparkConf()
+
+ private var _hiveSpark: SparkSession = _
+
+ def spark: SparkSession = _hiveSpark
+
+ def initSparkConf(url: String, catalog: String, path: String): SparkConf = {
+ import org.apache.gluten.backendsapi.clickhouse.CHConfig._
+
+ sparkConf
+ .set("spark.plugins", "org.apache.gluten.GlutenPlugin")
+ .set("spark.memory.offHeap.enabled", "true")
+ .set("spark.memory.offHeap.size", "536870912")
+ .set("spark.sql.catalogImplementation", "hive")
+ .set("spark.sql.adaptive.enabled", "true")
+ .set("spark.sql.files.maxPartitionBytes", "1g")
+ .set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
+ .set("spark.sql.shuffle.partitions", "5")
+ .set("spark.sql.adaptive.enabled", "false")
+ .set("spark.sql.files.minPartitionNum", "1")
+ .set(ClickHouseConfig.CLICKHOUSE_WORKER_ID, "1")
+ .set("spark.gluten.sql.columnar.iterator", "true")
+ .set("spark.gluten.sql.columnar.hashagg.enablefinal", "true")
+ .set("spark.gluten.sql.enable.native.validation", "false")
+ .set("spark.gluten.sql.parquet.maxmin.index", "true")
+ .set("spark.hive.exec.dynamic.partition.mode", "nonstrict")
+ .set("spark.gluten.supported.hive.udfs", "my_add")
+ .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
+ .set("spark.sql.adaptive.enabled", "true")
+ .set("spark.sql.shuffle.partitions", "2")
+ .set("spark.memory.offHeap.size", "2g")
+ .set("spark.unsafe.exceptionOnMemoryLeak", "true")
+ .set("spark.sql.autoBroadcastJoinThreshold", "-1")
+ .setCHConfig("use_local_format", true)
+ .set("spark.sql.extensions",
+ "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
+ .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
+ .set("spark.sql.catalog.spark_catalog.type", "hive")
+ .setMaster("local[*]")
+ if (!Strings.isNullOrEmpty(url)) {
+ sparkConf.set("spark.hadoop.hive.metastore.uris", url)
+ }
+ if (!Strings.isNullOrEmpty(catalog)) {
+ sparkConf.set("spark.sql.catalog." + catalog, "org.apache.iceberg.spark.SparkCatalog")
+ .set("spark.sql.catalog." + catalog + ".type", "hive")
+ }
+ if (!Strings.isNullOrEmpty(path)) {
+ sparkConf.set("spark.sql.warehouse.dir", path)
+ }
+ sparkConf
+ }
+
+ def initializeSession(): Unit = {
+ if (_hiveSpark == null) {
+ _hiveSpark =
+ SparkSession
+ .builder()
+ .config(sparkConf)
+ .enableHiveSupport()
+ .getOrCreate()
+ }
+ }
+
+ def clean(): Unit = {
+ try {
+ if (_hiveSpark != null) {
+ _hiveSpark.stop()
+ _hiveSpark = null
+ }
+ } finally {
+ SparkSession.clearActiveSession()
+ SparkSession.clearDefaultSession()
+ }
+ }
+}
From 765948d7133b274447cf736474781728e641d313 Mon Sep 17 00:00:00 2001
From: jlf <1251489546@qq.com>
Date: Tue, 22 Apr 2025 13:56:29 +0800
Subject: [PATCH 4/4] Fixed occasional UT anomalies
---
.../GlutenClickHouseCacheBaseTestSuite.scala | 4 ++-
.../apache/gluten/utils/CacheTestHelper.scala | 33 +++++++++----------
2 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/cache/GlutenClickHouseCacheBaseTestSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/cache/GlutenClickHouseCacheBaseTestSuite.scala
index bb72e34e3e19..47307cf2fa11 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/cache/GlutenClickHouseCacheBaseTestSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/cache/GlutenClickHouseCacheBaseTestSuite.scala
@@ -37,7 +37,9 @@ abstract class GlutenClickHouseCacheBaseTestSuite
override protected val queriesResults: String = rootPath + "queries-output"
// Abstract methods to be implemented by subclasses
- protected def cleanupCache(): Unit = cacheHelper.deleteCache(spark, tablesPath)
+ protected def cleanupCache(): Unit =
+ cacheHelper.deleteCache(spark, s"$tablesPath/lineitem", s"$tablesPath/$SPARK_DIR_NAME")
+
protected def copyDataIfNeeded(): Unit
// Initialize the cache helper - accessible to subclasses
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/utils/CacheTestHelper.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/utils/CacheTestHelper.scala
index 6cdc16e2a194..4b694086a905 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/utils/CacheTestHelper.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/utils/CacheTestHelper.scala
@@ -45,22 +45,21 @@ class CacheTestHelper(val TMP_PREFIX: String) {
}
/** Delete cache files for all tables in the data path */
- def deleteCache(spark: SparkSession, dataPath: String): Unit = {
- val targetFile = new Path(dataPath)
- val fs = targetFile.getFileSystem(spark.sessionState.newHadoopConf())
- fs.listStatus(targetFile)
- .foreach(
- table => {
- if (table.isDirectory) {
- fs.listStatus(table.getPath)
- .foreach(
- data => {
- if (data.isFile) {
- CHNativeCacheManager
- .removeFiles(data.getPath.toUri.getPath.substring(1), CACHE_NAME)
- }
- })
- }
- })
+ def deleteCache(spark: SparkSession, dataPaths: String*): Unit = {
+ dataPaths.foreach(
+ dataPath => {
+ val targetFile = new Path(dataPath)
+ val fs = targetFile.getFileSystem(spark.sessionState.newHadoopConf())
+ if (fs.isDirectory(targetFile)) {
+ fs.listStatus(targetFile)
+ .foreach(
+ data => {
+ if (data.isFile) {
+ CHNativeCacheManager
+ .removeFiles(data.getPath.toUri.getPath.substring(1), CACHE_NAME)
+ }
+ })
+ }
+ })
}
}