3 changes: 3 additions & 0 deletions .gitignore
@@ -100,3 +100,6 @@ dist/
metastore_db/

.ipynb_checkpoints

# For Spark warehouse
gluten-ut/*/spark-warehouse/
4 changes: 4 additions & 0 deletions backends-clickhouse/pom.xml
@@ -273,6 +273,10 @@
<artifactId>protobuf-java</artifactId>
<groupId>com.google.protobuf</groupId>
</exclusion>
<exclusion>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
@@ -504,6 +504,8 @@ class ClickHouseTestSettings extends BackendTestSettings {
"session window groupBy with multiple keys statement - keys overlapped with sessions")
.excludeCH("SPARK-36465: filter out events with negative/zero gap duration")
.excludeCH("SPARK-36724: Support timestamp_ntz as a type of time column for SessionWindow")
.excludeCH(
"SPARK-49836 using window fn with window as parameter should preserve parent operator")
enableSuite[GlutenDataFrameSetOperationsSuite]
.exclude("SPARK-37371: UnionExec should support columnar if all children support columnar")
// Result depends on the implementation for nondeterministic expression rand.
@@ -1659,6 +1661,10 @@ class ClickHouseTestSettings extends BackendTestSettings {
.excludeCH("full outer join with unique keys using SortMergeJoin (whole-stage-codegen off)")
.excludeCH("full outer join with unique keys using SortMergeJoin (whole-stage-codegen on)")
.excludeCH("SPARK-32717: AQEOptimizer should respect excludedRules configuration")
.excludeCH("SPARK-46037: ShuffledHashJoin build left with left outer join, "
+ "codegen off (whole-stage-codegen off)")
.excludeCH("SPARK-46037: ShuffledHashJoin build left with left outer join, "
+ "codegen off (whole-stage-codegen on)")
enableSuite[GlutenOuterJoinSuiteForceShjOn]
.excludeCH("basic left outer join using ShuffledHashJoin (whole-stage-codegen off)")
.excludeCH("basic left outer join using ShuffledHashJoin (whole-stage-codegen on)")
@@ -1684,6 +1690,10 @@ class ClickHouseTestSettings extends BackendTestSettings {
.excludeCH("full outer join with unique keys using ShuffledHashJoin (whole-stage-codegen on)")
.excludeCH("full outer join with unique keys using SortMergeJoin (whole-stage-codegen off)")
.excludeCH("full outer join with unique keys using SortMergeJoin (whole-stage-codegen on)")
.excludeCH("SPARK-46037: ShuffledHashJoin build left with left outer join, "
+ "codegen off (whole-stage-codegen off)")
.excludeCH("SPARK-46037: ShuffledHashJoin build left with left outer join, "
+ "codegen off (whole-stage-codegen on)")
enableSuite[GlutenParametersSuite]
enableSuite[GlutenParquetCodecSuite]
// codec not supported in native
6 changes: 4 additions & 2 deletions pom.xml
@@ -278,6 +278,7 @@
</activation>
<properties>
<java.version>1.8</java.version>
<iceberg.version>1.5.0</iceberg.version>
Contributor


Does it work well on the CH backend? For the Velox backend, the Iceberg version must be 1.8.0.

Member


For Velox backend tests, we have already upgraded to JDK 17. I think the intention here is to enable JDK 8 + Spark 3.5.5.

Contributor


If we try to build with JDK 8 and Spark 3.5.5, Iceberg should not be included, so the changes here don't seem necessary to me.
Besides, we need to define iceberg.version in the Spark profiles instead of the JDK profiles, so that we can use different Iceberg versions for different Spark versions.
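As a sketch of that suggestion (illustrative only: the profile ids and the Spark/Iceberg pairings below are hypothetical, not taken from this PR), iceberg.version would move into the per-Spark profiles so each Spark line pins its own compatible Iceberg release:

```xml
<!-- Hypothetical sketch: iceberg.version pinned per Spark profile -->
<!-- rather than per JDK profile; version pairings are illustrative. -->
<profile>
  <id>spark-3.4</id>
  <properties>
    <spark.version>3.4.4</spark.version>
    <!-- whichever Iceberg release this Spark line is validated against -->
    <iceberg.version>1.5.0</iceberg.version>
  </properties>
</profile>
<profile>
  <id>spark-3.5</id>
  <properties>
    <spark.version>3.5.5</spark.version>
    <iceberg.version>1.8.0</iceberg.version>
  </properties>
</profile>
```

With this layout, activating a Spark profile selects the Iceberg version, and the JDK profiles only control java.version and JDK-specific properties.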

Contributor Author


If the Iceberg and Spark versions are bound together, then after upgrading Spark to 3.5.5 it will no longer be possible to use JDK 8, even though Spark 3.5.5 itself still supports JDK 8.
Using different JDK and Spark versions is what lets us support different Iceberg versions.

Contributor

@jackylee-ch jackylee-ch Mar 18, 2025


I don't follow. I have run Iceberg 1.5.0 with vanilla Spark 3.5.5 before, and it doesn't work. So if we want to use vanilla Spark 3.5.5 with JDK 8, Iceberg cannot be involved.

Member


> If we try to build with JDK 8 and Spark 3.5.5, Iceberg should not be included. The changes here don't seem necessary to me. Besides, we need to define iceberg.version in the Spark profiles instead of the JDK profiles so that we can use different Iceberg versions for different Spark versions.

+1
It seems we should not bind the JDK version to the Iceberg version, as this will also impact Spark-344.

Contributor


@jackylee-ch I made a quick patch to test Iceberg 1.5 + Spark 3.5.5 + JDK 8: #9001

It looks like all the unit tests passed: https://github.com/apache/incubator-gluten/actions/runs/13859404046/job/38785108628

It seems you were using #8890 to test it; sorry about that, as I have removed -Piceberg from the spark-3.5.5 + JDK 8 GA workflow.

BTW, the current PR has already hit the problem I described before: vanilla Spark core dumps in VeloxTPCHIcebergSuite.
https://github.com/apache/incubator-gluten/actions/runs/13916861429/job/38941504564?pr=9031

Member


@jackylee-ch Thanks for pointing that out; I hadn't realized this. So it looks like the Velox backend does not support the combination JDK 8 + Spark 3.5.5 + Iceberg 1.5.

Not sure if this also applies to the CH backend. In case the CH backend can work with this combination, could we add a dedicated profile so it won't impact the Velox backend?

Cc: @baibaichen

Contributor

@jackylee-ch jackylee-ch Mar 18, 2025


Unfortunately, this core dump occurs in a vanilla Spark 3.5.5 environment with Iceberg 1.5.0, and it is unrelated to the Velox backend. Similar problems are also encountered with the CH backend.
@jlfsdtc you can double-check this.

Contributor Author


> If we try to build with JDK 8 and Spark 3.5.5, Iceberg should not be included. The changes here don't seem necessary to me. Besides, we need to define iceberg.version in the Spark profiles instead of the JDK profiles so that we can use different Iceberg versions for different Spark versions.
>
> +1 It seems we should not bind the JDK version to the Iceberg version, as this will also impact Spark-344.

Ah, I missed that. Spark-344 is bound to a different Iceberg version.

</properties>
</profile>
<profile>
@@ -288,6 +289,7 @@
<properties>
<java.version>11</java.version>
<caffeine.version>3.1.8</caffeine.version>
<iceberg.version>1.8.0</iceberg.version>
</properties>
</profile>
<profile>
@@ -298,6 +300,7 @@
<properties>
<java.version>17</java.version>
<caffeine.version>3.1.8</caffeine.version>
<iceberg.version>1.8.0</iceberg.version>
</properties>
</profile>
<profile>
@@ -347,8 +350,7 @@
<properties>
<sparkbundle.version>3.5</sparkbundle.version>
<sparkshim.artifactId>spark-sql-columnar-shims-spark35</sparkshim.artifactId>
<spark.version>3.5.2</spark.version>
<iceberg.version>1.5.0</iceberg.version>
<spark.version>3.5.5</spark.version>
<delta.package.name>delta-spark</delta.package.name>
<delta.version>3.2.0</delta.version>
<delta.binary.version>32</delta.binary.version>
@@ -149,7 +149,8 @@ class Spark35Shims extends SparkShims {
override def filesGroupedToBuckets(
selectedPartitions: Array[PartitionDirectory]): Map[Int, Array[PartitionedFile]] = {
selectedPartitions
.flatMap(p => p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, p.values)))
.flatMap(
p => p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values)))
.groupBy {
f =>
BucketingUtils
@@ -418,6 +419,7 @@ class Spark35Shims extends SparkShims {
PartitionedFileUtil.splitFiles(
sparkSession,
FileStatusWithMetadata(file, metadata),
filePath,
isSplitable,
maxSplitBytes,
partitionValues)
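The extra argument threaded through these shims mirrors the upstream API change this diff is adapting to: between Spark 3.5.2 and 3.5.5, `PartitionedFileUtil.getPartitionedFile` (and `splitFiles`) take the file path as an explicit parameter instead of deriving it from the file status. A minimal sketch of the call-site difference, using simplified stand-in types (`FileStatusLike` and `PartitionedFileLike` are hypothetical, not the real Spark classes):

```scala
// Simplified stand-ins for Spark's internal types; the real API lives in
// org.apache.spark.sql.execution.PartitionedFileUtil.
case class FileStatusLike(path: String, length: Long)
case class PartitionedFileLike(path: String, start: Long, length: Long)

// Spark 3.5.2-style helper: the path is derived inside the call.
def getPartitionedFileV352(f: FileStatusLike, partitionValues: Seq[Any]): PartitionedFileLike =
  PartitionedFileLike(f.path, 0L, f.length)

// Spark 3.5.5-style helper: the caller passes the path explicitly,
// which is why the shim now threads f.getPath through.
def getPartitionedFileV355(
    f: FileStatusLike,
    filePath: String,
    partitionValues: Seq[Any]): PartitionedFileLike =
  PartitionedFileLike(filePath, 0L, f.length)

// Both shapes produce the same PartitionedFile; only the call site changes.
val f = FileStatusLike("/data/part-00000.parquet", 128L)
assert(getPartitionedFileV352(f, Nil) == getPartitionedFileV355(f, f.path, Nil))
```

Because only the signature changed, the shim can support both Spark versions by adjusting the call site, with no behavioral difference in the produced splits.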
@@ -20,7 +20,7 @@ import org.apache.gluten.sql.shims.{SparkShimDescriptor, SparkShims}
import org.apache.gluten.sql.shims.spark35.SparkShimProvider.DESCRIPTOR

object SparkShimProvider {
val DESCRIPTOR = SparkShimDescriptor(3, 5, 2)
val DESCRIPTOR = SparkShimDescriptor(3, 5, 5)
}

class SparkShimProvider extends org.apache.gluten.sql.shims.SparkShimProvider {
@@ -181,7 +181,8 @@ abstract class AbstractFileSourceScanExec(
logInfo(s"Planning with ${bucketSpec.numBuckets} buckets")
val filesGroupedToBuckets =
selectedPartitions
.flatMap(p => p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, p.values)))
.flatMap(
p => p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values)))
.groupBy {
f =>
BucketingUtils
@@ -264,14 +265,14 @@
partition =>
partition.files.flatMap {
file =>
if (shouldProcess(file.getPath)) {
val isSplitable = relation.fileFormat.isSplitable(
relation.sparkSession,
relation.options,
file.getPath)
val filePath = file.getPath
if (shouldProcess(filePath)) {
val isSplitable =
relation.fileFormat.isSplitable(relation.sparkSession, relation.options, filePath)
PartitionedFileUtil.splitFiles(
sparkSession = relation.sparkSession,
file = file,
filePath,
isSplitable = isSplitable,
maxSplitBytes = maxSplitBytes,
partitionValues = partition.values