From c580251b22b57b06df4573260e328dc27c250503 Mon Sep 17 00:00:00 2001 From: Yaniv Kunda Date: Sun, 23 Mar 2025 18:26:32 +0200 Subject: [PATCH 1/3] upgraded spark 3.5.4 to 3.5.5 --- .github/workflows/spark_sql_test.yml | 2 +- benchmarks/Dockerfile | 2 +- docs/source/contributor-guide/spark-sql-tests.md | 6 +++--- docs/source/user-guide/configs.md | 6 +++--- docs/source/user-guide/kubernetes.md | 10 +++++----- pom.xml | 2 +- .../spark/sql/comet/shims/ShimCometScanExec.scala | 4 ++-- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml index 8d60f07692..a752531a50 100644 --- a/.github/workflows/spark_sql_test.yml +++ b/.github/workflows/spark_sql_test.yml @@ -45,7 +45,7 @@ jobs: matrix: os: [ubuntu-24.04] java-version: [11] - spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.4'}] + spark-version: [{short: '3.4', full: '3.4.3'}, {short: '3.5', full: '3.5.5'}] module: - {name: "catalyst", args1: "catalyst/test", args2: ""} - {name: "sql/core-1", args1: "", args2: sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest} diff --git a/benchmarks/Dockerfile b/benchmarks/Dockerfile index 18727809ec..704c863d20 100644 --- a/benchmarks/Dockerfile +++ b/benchmarks/Dockerfile @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM apache/datafusion-comet:0.7.0-spark3.5.4-scala2.12-java11 +FROM apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11 RUN apt update \ && apt install -y git python3 python3-pip \ diff --git a/docs/source/contributor-guide/spark-sql-tests.md b/docs/source/contributor-guide/spark-sql-tests.md index de0584b6ea..1776a4a86e 100644 --- a/docs/source/contributor-guide/spark-sql-tests.md +++ b/docs/source/contributor-guide/spark-sql-tests.md @@ -72,11 +72,11 @@ of Apache Spark to enable Comet when running tests. 
This is a highly manual process and the process can vary depending on the changes in the new version of Spark, but here is a general guide to the process. We typically start by applying a patch from a previous version of Spark. For example, when enabling the tests -for Spark version 3.5.4 we may start by applying the existing diff for 3.4.3 first. +for Spark version 3.5.5 we may start by applying the existing diff for 3.4.3 first. ```shell cd git/apache/spark -git checkout v3.5.4 +git checkout v3.5.5 git apply --reject --whitespace=fix ../datafusion-comet/dev/diffs/3.4.3.diff ``` @@ -118,7 +118,7 @@ wiggle --replace ./sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.sc ## Generating The Diff File ```shell -git diff v3.5.4 > ../datafusion-comet/dev/diffs/3.5.4.diff +git diff v3.5.5 > ../datafusion-comet/dev/diffs/3.5.5.diff ``` ## Running Tests in CI diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index d0f95480d6..529103d8b0 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -71,9 +71,9 @@ Comet provides the following configuration settings. | spark.comet.explain.verbose.enabled | When this setting is enabled, Comet will provide a verbose tree representation of the extended information. | false | | spark.comet.explainFallback.enabled | When this setting is enabled, Comet will provide logging explaining the reason(s) why a query stage cannot be executed natively. Set this to false to reduce the amount of logging. | false | | spark.comet.expression.allowIncompatible | Comet is not currently fully compatible with Spark for all expressions. Set this config to true to allow them anyway. For more information, refer to the Comet Compatibility Guide (https://datafusion.apache.org/comet/user-guide/compatibility.html). | false | -| spark.comet.memory.overhead.factor | Fraction of executor memory to be allocated as additional memory for Comet when running in on-heap mode. 
For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 0.2 | -| spark.comet.memory.overhead.min | Minimum amount of additional memory to be allocated per executor process for Comet, in MiB, when running in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 402653184b | -| spark.comet.memoryOverhead | The amount of additional memory to be allocated per executor process for Comet, in MiB, when running in on-heap mode. This config is optional. If this is not specified, it will be set to `spark.comet.memory.overhead.factor` * `spark.executor.memory`. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | | +| spark.comet.memory.overhead.factor | Fraction of executor memory to be allocated as additional memory for Comet when running Spark in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 0.2 | +| spark.comet.memory.overhead.min | Minimum amount of additional memory to be allocated per executor process for Comet, in MiB, when running Spark in on-heap mode. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | 402653184b | +| spark.comet.memoryOverhead | The amount of additional memory to be allocated per executor process for Comet, in MiB, when running Spark in on-heap mode. This config is optional. If this is not specified, it will be set to `spark.comet.memory.overhead.factor` * `spark.executor.memory`. For more information, refer to the Comet Tuning Guide (https://datafusion.apache.org/comet/user-guide/tuning.html). | | | spark.comet.metrics.updateInterval | The interval in milliseconds to update metrics. If interval is negative, metrics will be updated upon task completion. 
| 3000 | | spark.comet.nativeLoadRequired | Whether to require Comet native library to load successfully when Comet is enabled. If not, Comet will silently fallback to Spark when it fails to load the native lib. Otherwise, an error will be thrown and the Spark job will be aborted. | false | | spark.comet.parquet.enable.directBuffer | Whether to use Java direct byte buffer when reading Parquet. | false | diff --git a/docs/source/user-guide/kubernetes.md b/docs/source/user-guide/kubernetes.md index 0d1418f838..9cb7bab86a 100644 --- a/docs/source/user-guide/kubernetes.md +++ b/docs/source/user-guide/kubernetes.md @@ -66,10 +66,10 @@ metadata: spec: type: Scala mode: cluster - image: apache/datafusion-comet:0.7.0-spark3.5.4-scala2.12-java11 + image: apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11 imagePullPolicy: IfNotPresent mainClass: org.apache.spark.examples.SparkPi - mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.4.jar + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.5.jar sparkConf: "spark.executor.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar" "spark.driver.extraClassPath": "/opt/spark/jars/comet-spark-spark3.5_2.12-0.7.0.jar" @@ -80,17 +80,17 @@ spec: "spark.comet.exec.shuffle.enabled": "true" "spark.comet.exec.shuffle.mode": "auto" "spark.shuffle.manager": "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager" - sparkVersion: 3.5.4 + sparkVersion: 3.5.5 driver: labels: - version: 3.5.4 + version: 3.5.5 cores: 1 coreLimit: 1200m memory: 512m serviceAccount: spark-operator-spark executor: labels: - version: 3.5.4 + version: 3.5.5 instances: 1 cores: 1 coreLimit: 1200m diff --git a/pom.xml b/pom.xml index 259760d51f..fa0dfc7c32 100644 --- a/pom.xml +++ b/pom.xml @@ -556,7 +556,7 @@ under the License. 
spark-3.5 2.12.18 - 3.5.4 + 3.5.5 3.5 1.13.1 2.0.7 diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala index cee444e3f0..684953e2db 100644 --- a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala +++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimCometScanExec.scala @@ -55,7 +55,7 @@ trait ShimCometScanExec { protected def isNeededForSchema(sparkSchema: StructType): Boolean = false protected def getPartitionedFile(f: FileStatusWithMetadata, p: PartitionDirectory): PartitionedFile = - PartitionedFileUtil.getPartitionedFile(f, p.values) + PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values) protected def splitFiles(sparkSession: SparkSession, file: FileStatusWithMetadata, @@ -63,7 +63,7 @@ trait ShimCometScanExec { isSplitable: Boolean, maxSplitBytes: Long, partitionValues: InternalRow): Seq[PartitionedFile] = - PartitionedFileUtil.splitFiles(sparkSession, file, isSplitable, maxSplitBytes, partitionValues) + PartitionedFileUtil.splitFiles(sparkSession, file, filePath, isSplitable, maxSplitBytes, partitionValues) protected def getPushedDownFilters(relation: HadoopFsRelation , dataFilters: Seq[Expression]): Seq[Filter] = { val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation) From 2cddf5b0828a8496e0d145306daf62843a1d1a9d Mon Sep 17 00:00:00 2001 From: Yaniv Kunda Date: Mon, 24 Mar 2025 00:01:42 +0200 Subject: [PATCH 2/3] generated 3.5.5 diff file --- dev/diffs/{3.5.4.diff => 3.5.5.diff} | 196 ++++++++++----------------- 1 file changed, 74 insertions(+), 122 deletions(-) rename dev/diffs/{3.5.4.diff => 3.5.5.diff} (97%) diff --git a/dev/diffs/3.5.4.diff b/dev/diffs/3.5.5.diff similarity index 97% rename from dev/diffs/3.5.4.diff rename to dev/diffs/3.5.5.diff index dfa762739c..622e246f5a 100644 --- a/dev/diffs/3.5.4.diff +++ b/dev/diffs/3.5.5.diff 
@@ -1,5 +1,5 @@ diff --git a/pom.xml b/pom.xml -index 8dc47f391f9..8a3e72133a8 100644 +index 9b009c3a..8d933af9 100644 --- a/pom.xml +++ b/pom.xml @@ -152,6 +152,8 @@ @@ -11,7 +11,7 @@ index 8dc47f391f9..8a3e72133a8 100644