From 2031f5bea5c83800b28190bee40895da7c1d25a0 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 11:12:01 +0800 Subject: [PATCH 01/42] [SPARK-36579][SQL] Make spark source stagingDir can use user defined --- .../apache/spark/sql/internal/SQLConf.scala | 12 ++ .../SQLHadoopMapReduceCommitProtocol.scala | 7 ++ .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ++++++++++++++++++ .../sql/sources/PartitionedWriteSuite.scala | 27 ++++ .../sql/hive/execution/SaveAsHiveFile.scala | 94 +------------- 5 files changed, 166 insertions(+), 89 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index c6d9558d53eda..d8dbc36a4f9cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1216,6 +1216,16 @@ object SQLConf { .booleanConf .createWithDefault(true) + val FILE_COMMIT_STAGING_DIR = + buildConf("spark.sql.source.stagingDir") + .doc("The staging directory of Spark job. Spark uses it to deal with files with " + + "absolute output path, or writing data into partitioned directory when " + + "dynamic partition overwrite mode.") + .version("3.3.0") + .internal() + .stringConf + .createWithDefault(".spark-staging") + // The output committer class used by data sources. The specified class needs to be a // subclass of org.apache.hadoop.mapreduce.OutputCommitter. val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass") @@ -3824,6 +3834,8 @@ class SQLConf extends Serializable with Logging { def partitionColumnTypeInferenceEnabled: Boolean = getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE) + def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR) + def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS) def parallelPartitionDiscoveryThreshold: Int = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 144be2316f091..37a401d842abd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -23,7 +23,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.util.SQLFileCommitProtocolUtils /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -36,6 +38,11 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { + override val stagingDir: Path = + SQLFileCommitProtocolUtils.newVersionExternalTempPath( + new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(), + SQLConf.get.fileCommitStagingDir, "spark", jobId) + override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala new file mode 100644 index 0000000000000..1b2bfb42f0e35 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.spark.sql.util + +import java.net.URI +import java.text.SimpleDateFormat +import java.util.{Date, Locale, Random} + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.spark.internal.Logging + +object SQLFileCommitProtocolUtils extends Logging { + + def newVersionExternalTempPath( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val extURI = path.toUri + if (extURI.getScheme == "viewfs") { + getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) + } else { + new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") + } + } + + + private def getExtTmpPathRelTo( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") // Hive uses 10000 + } + + private def getExternalScratchDir( + extURI: URI, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + getStagingDir( + new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), + hadoopConf, + stagingDir, + engineType, + jobId) + } + + private def getStagingDir( + inputPath: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val inputPathName: String = inputPath.toString + val fs: FileSystem = inputPath.getFileSystem(hadoopConf) + var stagingPathName: String = + if (inputPathName.indexOf(stagingDir) == -1) { + new Path(inputPathName, stagingDir).toString + } else { + inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) + } + + // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the + // staging directory needs to avoid being deleted when users set hive.exec.stagingdir + // under the table directory. + if (isSubDir(new Path(stagingPathName), inputPath, fs) && + !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { + logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + + "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + + "directory.") + stagingPathName = new Path(inputPathName, ".hive-staging").toString + } + + val dir = fs.makeQualified( + new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) + logDebug("Created staging dir = " + dir + " for path = " + inputPath) + dir + } + + private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { + val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR + val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR + path1.startsWith(path2) + } + + def executionId(engineType: String): String = { + val rand: Random = new Random + val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) + s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index b9266429f81a5..4751767bfd239 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -189,6 +189,33 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { + withTempDir { stagingDir => + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString, + SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) { + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) + } + } + } + } + } } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index ec189344f4fa5..581c97ebcbfdd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.hive.execution import java.io.IOException import java.net.URI -import java.text.SimpleDateFormat -import java.util.{Date, Locale, Random} +import java.util.Locale import scala.util.control.NonFatal @@ -40,6 +39,7 @@ import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.HiveVersion +import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._ // Base trait from which all hive insert statement physical execution extends. private[hive] trait SaveAsHiveFile extends DataWritingCommand { @@ -129,7 +129,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - newVersionExternalTempPath(path, hadoopConf, stagingDir) + newVersionExternalTempPath( + path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) } else { throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion) } @@ -159,7 +160,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { hadoopConf: Configuration, scratchDir: String): Path = { val extURI: URI = path.toUri - val scratchPath = new Path(scratchDir, executionId) + val scratchPath = new Path(scratchDir, executionId("hive")) var dirPath = new Path( extURI.getScheme, extURI.getAuthority, @@ -180,90 +181,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { } dirPath } - - // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2 - private def newVersionExternalTempPath( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val extURI: URI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000") - } - } - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir) - } - - private[hive] def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir: Path = - fs.makeQualified( - new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - try { - if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) { - throw new IllegalStateException("Cannot create staging directory '" + dir.toString + "'") - } - createdTempDir = Some(dir) - fs.deleteOnExit(dir) - } catch { - case e: IOException => - throw QueryExecutionErrors.cannotCreateStagingDirError( - s"'${dir.toString}': ${e.getMessage}", e) - } - dir - } - - // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir(). - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - private def executionId: String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } } From c29f55ed00c7b50aa86a7d86b499739d7d377c96 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 14:27:08 +0800 Subject: [PATCH 02/42] Update SQLHadoopMapReduceCommitProtocol.scala --- .../datasources/SQLHadoopMapReduceCommitProtocol.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 37a401d842abd..a6f86256de2c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.util.SQLFileCommitProtocolUtils @@ -40,7 +40,7 @@ class SQLHadoopMapReduceCommitProtocol( override val stagingDir: Path = SQLFileCommitProtocolUtils.newVersionExternalTempPath( - new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(), + new Path(path), SparkContext.getActive.get.hadoopConfiguration, SQLConf.get.fileCommitStagingDir, "spark", jobId) override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { From 2604c9f2c95f50e90f77c300442344816baea3d3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 15:07:15 +0800 Subject: [PATCH 03/42] Update --- .../spark/internal/io/FileCommitProtocol.scala | 5 +++++ .../internal/io/HadoopMapReduceCommitProtocol.scala | 8 ++++++++ .../InsertIntoHadoopFsRelationCommand.scala | 13 ++----------- .../streaming/ManifestFileCommitProtocol.scala | 2 ++ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 5cd7397ea358f..b968a51d53d34 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -50,6 +50,11 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ + /** + * The output path of this committer. + */ + def outputPath: Path + /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index c061d617fce4b..2434ffa91d7a3 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -160,6 +160,14 @@ class HadoopMapReduceCommitProtocol( f"part-$split%05d-$jobId$ext" } + override def outputPath: Path = { + if (dynamicPartitionOverwrite) { + stagingDir + } else { + new Path(path) + } + } + override def setupJob(jobContext: JobContext): Unit = { // Setup IDs val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 267b360b474ca..7edb78471e0ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand( val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, - outputPath = outputPath.toString, + outputPath = qualifiedOutputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { @@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand( } } - // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files - // will be renamed from staging path to final output path during commit job - val committerOutputPath = if (dynamicPartitionOverwrite) { - FileCommitProtocol.getStagingDir(outputPath.toString, jobId) - .makeQualified(fs.getUri, fs.getWorkingDirectory) - } else { - qualifiedOutputPath - } - val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committerOutputPath.toString, customPartitionLocations, outputColumns), + committer.outputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 46ce33687890d..30709266f1806 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } + override def outputPath: Path = new Path(path) + override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] From 71f6b17c91b92a85913f816085290a0212e46939 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 19:27:56 +0800 Subject: [PATCH 04/42] fix ut --- .../sql/util/SQLFileCommitProtocolUtils.scala | 2 +- .../spark/sql/sources/InsertSuite.scala | 2 +- .../sql/hive/execution/SaveAsHiveFile.scala | 4 +++- .../apache/spark/sql/hive/InsertSuite.scala | 21 ++++++++++++------- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala index 1b2bfb42f0e35..0d5455a411c2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala @@ -69,7 +69,7 @@ object SQLFileCommitProtocolUtils extends Logging { jobId) } - private def getStagingDir( + def getStagingDir( inputPath: Path, hadoopConf: Configuration, stagingDir: String, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index d3c21032d0b32..ddcbfe44fbef8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -1073,6 +1073,6 @@ class RenameFromSparkStagingToFinalDirAlwaysTurnsFalseFilesystem extends RawLoca } private def isSparkStagingDir(path: Path): Boolean = { - path.toString.contains(".spark-staging-") + path.toString.contains(".spark-staging_") } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 581c97ebcbfdd..12a9ed3d4d311 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -129,8 +129,10 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - newVersionExternalTempPath( + val externalTempPath = newVersionExternalTempPath( path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) + createdTempDir = Some(externalTempPath.getParent) + externalTempPath } else { throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index b715f484fa02a..307d30d88296d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -21,16 +21,17 @@ import java.io.File import com.google.common.io.Files import org.apache.hadoop.fs.Path +import org.apache.hadoop.hive.ql.exec.TaskRunner import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.SparkException import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.hive.execution.InsertIntoHiveTable import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SQLFileCommitProtocolUtils import org.apache.spark.util.Utils case class TestData(key: Int, value: String) @@ -556,25 +557,29 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val conf = spark.sessionState.newHadoopConf() val inputPath = new Path("/tmp/b/c") var stagingDir = "tmp/b" - val saveHiveFile = InsertIntoHiveTable(null, Map.empty, null, false, false, null) - val getStagingDir = PrivateMethod[Path](Symbol("getStagingDir")) - var path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + val id = TaskRunner.getTaskRunnerID + var path = SQLFileCommitProtocolUtils. + getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b_hive_") != -1) stagingDir = "tmp/b/c" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = "d/e" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = ".d/e" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1) stagingDir = "/tmp/c/" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/c_hive_") != -1) } From 30113d28dc4a0a619e4bd732b360c84542eb25b3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 13:38:01 +0800 Subject: [PATCH 05/42] Update SaveAsHiveFile.scala --- .../sql/hive/execution/SaveAsHiveFile.scala | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 298fadcb01fac..e669f9cfcab2f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -189,90 +189,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { } dirPath } - - // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2 - private def newVersionExternalTempPath( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val extURI: URI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path, hadoopConf, stagingDir) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000") - } - } - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir) - } - - private[hive] def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir: Path = - fs.makeQualified( - new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - try { - if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) { - throw new IllegalStateException("Cannot create staging directory '" + dir.toString + "'") - } - createdTempDir = Some(dir) - fs.deleteOnExit(dir) - } catch { - case e: IOException => - throw QueryExecutionErrors.cannotCreateStagingDirError( - s"'${dir.toString}': ${e.getMessage}", e) - } - dir - } - - // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir(). - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - private def executionId: String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } } From 6f405dc89d507d36da03d6a97d9aa268926f7bfc Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 14:15:46 +0800 Subject: [PATCH 06/42] update --- .../spark/internal/config/package.scala | 10 ++ .../internal/io/FileCommitProtocol.scala | 103 ++++++++++++++-- .../io/HadoopMapReduceCommitProtocol.scala | 11 +- .../apache/spark/sql/internal/SQLConf.scala | 12 -- .../InsertIntoHadoopFsRelationCommand.scala | 13 +- .../SQLHadoopMapReduceCommitProtocol.scala | 7 -- .../ManifestFileCommitProtocol.scala | 2 - .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ------------------ .../sql/sources/PartitionedWriteSuite.scala | 3 +- .../sql/hive/execution/SaveAsHiveFile.scala | 5 +- .../apache/spark/sql/hive/InsertSuite.scala | 12 +- 11 files changed, 129 insertions(+), 164 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 4b7ba9b19f4de..8e24105d7742b 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -2258,4 +2258,14 @@ package object config { .version("3.2.0") .stringConf .createOptional + + val EXEC_STAGING_DIR = + ConfigBuilder("spark.exec.stagingDir") + .doc("The staging directory of Spark job. Spark uses it to deal with files with " + + "absolute output path, or writing data into partitioned directory when " + + "dynamic partition overwrite mode.") + .version("3.3.0") + .internal() + .stringConf + .createWithDefault(".spark-staging") } diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index b968a51d53d34..958a9fa8aa034 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -17,11 +17,18 @@ package org.apache.spark.internal.io +import java.net.URI +import java.text.SimpleDateFormat +import java.util.{Date, Locale, Random} + +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ +import org.apache.spark.SparkContext import org.apache.spark.annotation.Unstable import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.util.Utils @@ -50,11 +57,6 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ - /** - * The output path of this committer. - */ - def outputPath: Path - /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ @@ -232,8 +234,95 @@ object FileCommitProtocol extends Logging { } } - def getStagingDir(path: String, jobId: String): Path = { - new Path(path, ".spark-staging-" + jobId) + def getStagingDir(path: String, jobId: String, conf: Configuration): Path = { + newVersionExternalTempPath( + new Path(path), conf, + SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId) + } + + + def newVersionExternalTempPath( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val extURI = path.toUri + if (extURI.getScheme == "viewfs") { + getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) + } else { + new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") + } + } + + + private def getExtTmpPathRelTo( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") // Hive uses 10000 + } + + private def getExternalScratchDir( + extURI: URI, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + getStagingDir( + new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), + hadoopConf, + stagingDir, + engineType, + jobId) + } + + def getStagingDir( + inputPath: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val inputPathName: String = inputPath.toString + val fs: FileSystem = inputPath.getFileSystem(hadoopConf) + var stagingPathName: String = + if (inputPathName.indexOf(stagingDir) == -1) { + new Path(inputPathName, stagingDir).toString + } else { + inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) + } + + // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the + // staging directory needs to avoid being deleted when users set hive.exec.stagingdir + // under the table directory. + if (isSubDir(new Path(stagingPathName), inputPath, fs) && + !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { + logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + + "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + + "directory.") + stagingPathName = new Path(inputPathName, ".hive-staging").toString + } + + val dir = fs.makeQualified( + new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) + logDebug("Created staging dir = " + dir + " for path = " + inputPath) + dir + } + + private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { + val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR + val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR + path1.startsWith(path2) + } + + def executionId(engineType: String): String = { + val rand: Random = new Random + val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) + s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index e1c0367e46b98..9796d805be35f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -104,7 +104,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected def stagingDir = getStagingDir(path, jobId) + protected var stagingDir: Path = _ protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() @@ -113,6 +113,7 @@ class HadoopMapReduceCommitProtocol( case c: Configurable => c.setConf(context.getConfiguration) case _ => () } + stagingDir = getStagingDir(path, jobId, context.getConfiguration) format.getOutputCommitter(context) } @@ -170,14 +171,6 @@ class HadoopMapReduceCommitProtocol( f"${spec.prefix}part-$split%05d-$jobId${spec.suffix}" } - override def outputPath: Path = { - if (dynamicPartitionOverwrite) { - stagingDir - } else { - new Path(path) - } - } - override def setupJob(jobContext: JobContext): Unit = { // Setup IDs val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0defec5becd7e..98aad1c9c83f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1219,16 +1219,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val FILE_COMMIT_STAGING_DIR = - buildConf("spark.sql.source.stagingDir") - .doc("The staging directory of Spark job. Spark uses it to deal with files with " + - "absolute output path, or writing data into partitioned directory when " + - "dynamic partition overwrite mode.") - .version("3.3.0") - .internal() - .stringConf - .createWithDefault(".spark-staging") - // The output committer class used by data sources. The specified class needs to be a // subclass of org.apache.hadoop.mapreduce.OutputCommitter. val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass") @@ -3842,8 +3832,6 @@ class SQLConf extends Serializable with Logging { def partitionColumnTypeInferenceEnabled: Boolean = getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE) - def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR) - def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS) def parallelPartitionDiscoveryThreshold: Int = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 7edb78471e0ff..f1a0e0dedbddd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand( val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, - outputPath = qualifiedOutputPath.toString, + outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { @@ -162,6 +162,15 @@ case class InsertIntoHadoopFsRelationCommand( } } + // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files + // will be renamed from staging path to final output path during commit job + val committerOutputPath = if (dynamicPartitionOverwrite) { + FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf) + .makeQualified(fs.getUri, fs.getWorkingDirectory) + } else { + qualifiedOutputPath + } + val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -169,7 +178,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committer.outputPath.toString, customPartitionLocations, outputColumns), + committerOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index a6f86256de2c3..144be2316f091 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,11 +21,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter -import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.util.SQLFileCommitProtocolUtils /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -38,11 +36,6 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - override val stagingDir: Path = - SQLFileCommitProtocolUtils.newVersionExternalTempPath( - new Path(path), SparkContext.getActive.get.hadoopConfiguration, - SQLConf.get.fileCommitStagingDir, "spark", jobId) - override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 30709266f1806..46ce33687890d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,8 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } - override def outputPath: Path = new Path(path) - override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala deleted file mode 100644 index 0d5455a411c2d..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.spark.sql.util - -import java.net.URI -import java.text.SimpleDateFormat -import java.util.{Date, Locale, Random} - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} - -import org.apache.spark.internal.Logging - -object SQLFileCommitProtocolUtils extends Logging { - - def newVersionExternalTempPath( - path: Path, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - val extURI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), - "-ext-10000") - } - } - - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), - "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir, - engineType, - jobId) - } - - def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir = fs.makeQualified( - new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - dir - } - - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - def executionId(engineType: String): String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 4751767bfd239..86894945591da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.TestUtils import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -194,7 +195,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { withTempDir { stagingDir => withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> SQLConf.PartitionOverwriteMode.DYNAMIC.toString, - SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) { + EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { withTempDir { d => withTable("t") { sql( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index e669f9cfcab2f..8c6c10fee7935 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -40,7 +40,6 @@ import org.apache.spark.sql.execution.datasources.{BucketingUtils, FileFormatWri import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.HiveVersion -import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._ // Base trait from which all hive insert statement physical execution extends. private[hive] trait SaveAsHiveFile extends DataWritingCommand { @@ -135,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - val externalTempPath = newVersionExternalTempPath( + val externalTempPath = FileCommitProtocol.newVersionExternalTempPath( path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) createdTempDir = Some(externalTempPath.getParent) externalTempPath @@ -168,7 +167,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { hadoopConf: Configuration, scratchDir: String): Path = { val extURI: URI = path.toUri - val scratchPath = new Path(scratchDir, executionId("hive")) + val scratchPath = new Path(scratchDir, FileCommitProtocol.executionId("hive")) var dirPath = new Path( extURI.getScheme, extURI.getAuthority, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index f0355eaf18b68..1dfeedcd9a18b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -26,13 +26,13 @@ import org.apache.hadoop.hive.ql.exec.TaskRunner import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.SparkException +import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.SQLFileCommitProtocolUtils import org.apache.spark.util.Utils case class TestData(key: Int, value: String) @@ -543,27 +543,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val inputPath = new Path("/tmp/b/c") var stagingDir = "tmp/b" val id = TaskRunner.getTaskRunnerID - var path = SQLFileCommitProtocolUtils. + var path = FileCommitProtocol. getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b_hive_") != -1) stagingDir = "tmp/b/c" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = "d/e" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = ".d/e" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1) stagingDir = "/tmp/c/" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/c_hive_") != -1) } From 361263b9b11e5668286268fc306b399b8760c48a Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 14:43:35 +0800 Subject: [PATCH 07/42] update --- .../spark/internal/io/FileCommitProtocol.scala | 9 ++++++--- .../internal/io/HadoopMapReduceCommitProtocol.scala | 13 +++++++++++-- .../InsertIntoHadoopFsRelationCommand.scala | 11 +---------- .../streaming/ManifestFileCommitProtocol.scala | 2 ++ 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 958a9fa8aa034..73b7fd62a25b6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ -import org.apache.spark.SparkContext import org.apache.spark.annotation.Unstable import org.apache.spark.internal.Logging import org.apache.spark.internal.config.EXEC_STAGING_DIR @@ -57,6 +56,11 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ + /** + * The output path of this committer. + */ + def outputPath: Path + /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ @@ -236,8 +240,7 @@ object FileCommitProtocol extends Logging { def getStagingDir(path: String, jobId: String, conf: Configuration): Path = { newVersionExternalTempPath( - new Path(path), conf, - SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId) + new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 9796d805be35f..86d087cdb5cc2 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.mapred.SparkHadoopMapRedUtil @@ -104,7 +105,8 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected var stagingDir: Path = _ + protected var stagingDir: Path = + getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() @@ -113,10 +115,17 @@ class HadoopMapReduceCommitProtocol( case c: Configurable => c.setConf(context.getConfiguration) case _ => () } - stagingDir = getStagingDir(path, jobId, context.getConfiguration) format.getOutputCommitter(context) } + override def outputPath: Path = { + if (dynamicPartitionOverwrite) { + stagingDir + } else { + new Path(path) + } + } + override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { newTaskTempFile(taskContext, dir, FileNameSpec("", ext)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index f1a0e0dedbddd..382c1e3c16384 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand( } } - // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files - // will be renamed from staging path to final output path during commit job - val committerOutputPath = if (dynamicPartitionOverwrite) { - FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf) - .makeQualified(fs.getUri, fs.getWorkingDirectory) - } else { - qualifiedOutputPath - } - val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committerOutputPath.toString, customPartitionLocations, outputColumns), + committer.outputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 46ce33687890d..30709266f1806 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } + override def outputPath: Path = new Path(path) + override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] From 9ee6ee545c1c6490afaec2febbaf68cb3888586e Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 14:55:26 +0800 Subject: [PATCH 08/42] update --- .../sql/sources/PartitionedWriteSuite.scala | 28 ------- .../sql/sources/StagingInsertSuite.scala | 74 +++++++++++++++++++ 2 files changed, 74 insertions(+), 28 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 86894945591da..b9266429f81a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -25,7 +25,6 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.TestUtils import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -190,33 +189,6 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } - - test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { - withTempDir { stagingDir => - withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> - SQLConf.PartitionOverwriteMode.DYNAMIC.toString, - EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { - withTempDir { d => - withTable("t") { - sql( - s""" - | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) - | LOCATION '${d.getAbsolutePath}' - """.stripMargin) - - val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") - df.write - .partitionBy("p1") - .mode("overwrite") - .saveAsTable("t") - checkAnswer(sql("SELECT * FROM t"), df) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) - } - } - } - } - } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala new file mode 100644 index 0000000000000..72c6534dc8751 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources + +import org.apache.spark.SparkConf +import org.apache.spark.internal.config.EXEC_STAGING_DIR +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.Utils + +class StagingInsertSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + val stagingDir = Utils.createTempDir() + + override def sparkConf: SparkConf = + super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath) + + override def beforeAll(): Unit = { + super.beforeAll() + stagingDir.delete() + } + + override def afterAll(): Unit = { + try { + Utils.deleteRecursively(stagingDir) + } finally { + super.afterAll() + } + } + + test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { + withTempDir { stagingDir => + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString, + EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) + } + } + } + } + } +} From 7773fb24bc5071b76b97b5ebbe8befc7c80d1e71 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 15:24:13 +0800 Subject: [PATCH 09/42] update --- .../internal/io/FileCommitProtocol.scala | 9 +++-- .../io/HadoopMapReduceCommitProtocol.scala | 6 +-- ...FileCommitProtocolInstantiationSuite.scala | 12 +++--- .../InsertIntoHadoopFsRelationCommand.scala | 1 + .../SQLHadoopMapReduceCommitProtocol.scala | 4 +- .../execution/datasources/v2/FileWrite.scala | 3 +- .../execution/streaming/FileStreamSink.scala | 3 +- .../execution/metric/SQLMetricsSuite.scala | 4 +- .../sql/sources/PartitionedWriteSuite.scala | 6 ++- .../sql/sources/StagingInsertSuite.scala | 37 +++++++++---------- .../sql/hive/execution/SaveAsHiveFile.scala | 3 +- 11 files changed, 50 insertions(+), 38 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 73b7fd62a25b6..d6f7370c9a582 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -215,6 +215,7 @@ object FileCommitProtocol extends Logging { className: String, jobId: String, outputPath: String, + conf: Configuration = new Configuration(), dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + @@ -224,9 +225,11 @@ object FileCommitProtocol extends Logging { // dynamicPartitionOverwrite: Boolean). // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) - logDebug("Using (String, String, Boolean) constructor") - ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + val ctor = clazz.getDeclaredConstructor( + classOf[String], classOf[String], classOf[Configuration], classOf[Boolean]) + logDebug("Using (String, String, Configuration, Boolean) constructor") + ctor.newInstance( + jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => logDebug("Falling back to (String, String) constructor") diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 86d087cdb5cc2..9cdfe851c89d7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -24,12 +24,12 @@ import scala.collection.mutable import scala.util.Try import org.apache.hadoop.conf.Configurable +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.mapred.SparkHadoopMapRedUtil @@ -68,6 +68,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, + conf: Configuration = new Configuration(), dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { @@ -105,8 +106,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected var stagingDir: Path = - getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration) + protected var stagingDir: Path = getStagingDir(path, jobId, conf) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 778f748f83950..1535ca4c6624e 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.internal.io +import org.apache.hadoop.conf.Configuration + import org.apache.spark.SparkFunSuite /** @@ -58,8 +60,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[Other].getCanonicalName, "job", - "path", - false) + "path") } } @@ -68,8 +69,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[NoMatchingArgs].getCanonicalName, "job", - "path", - false) + "path") } } @@ -83,6 +83,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[ClassicConstructorCommitProtocol].getCanonicalName, "job", "path", + new Configuration(), dynamic).asInstanceOf[ClassicConstructorCommitProtocol] } @@ -97,6 +98,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[FullConstructorCommitProtocol].getCanonicalName, "job", "path", + new Configuration(), dynamic).asInstanceOf[FullConstructorCommitProtocol] } @@ -121,7 +123,7 @@ private class FullConstructorCommitProtocol( arg2: String, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) { def this(arg1: String, arg2: String) = { this(arg1, arg2, false, 2) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 382c1e3c16384..fff93a579a628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -109,6 +109,7 @@ case class InsertIntoHadoopFsRelationCommand( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, outputPath = outputPath.toString, + hadoopConf, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 144be2316f091..dde68006402d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter @@ -32,8 +33,9 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, + conf: Configuration, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala index ccc467aae1f15..d822b1bb6ef12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala @@ -61,7 +61,8 @@ trait FileWrite extends Write { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = paths.head) + outputPath = paths.head, + hadoopConf) lazy val description = createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 5058a1dfc3baf..8db2ce027fdfd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -150,7 +150,8 @@ class FileStreamSink( val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, - outputPath = path) + outputPath = path, + sparkSession.sessionState.newHadoopConf()) committer match { case manifestCommitter: ManifestFileCommitProtocol => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 32428fbde0016..9638924b394db 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -834,7 +835,8 @@ case class CustomFileCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol( + jobId, path, new Configuration(), dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index b9266429f81a5..df41b93f45fad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.sources import java.io.File import java.sql.Timestamp +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} @@ -35,7 +36,7 @@ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String) - extends SQLHadoopMapReduceCommitProtocol(jobId, path) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration()) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -199,7 +200,8 @@ private class PartitionFileExistCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol( + jobId, path, new Configuration(), dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala index 72c6534dc8751..162cef0cca3f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -46,27 +46,24 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession { } test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { - withTempDir { stagingDir => - withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> - SQLConf.PartitionOverwriteMode.DYNAMIC.toString, - EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { - withTempDir { d => - withTable("t") { - sql( - s""" - | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) - | LOCATION '${d.getAbsolutePath}' - """.stripMargin) + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString) { + withTempDir { d => + withTable("t") { + sql( + s""" + |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + |LOCATION '${d.getAbsolutePath}' + """.stripMargin) - val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") - df.write - .partitionBy("p1") - .mode("overwrite") - .saveAsTable("t") - checkAnswer(sql("SELECT * FROM t"), df) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) - } + val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 8c6c10fee7935..734a0aa64df2a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -83,7 +83,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputLocation) + outputPath = outputLocation, + hadoopConf) val options = bucketSpec .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true")) From a3b3c51f326fcf285eb870a2874c774a1498d145 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 15:42:17 +0800 Subject: [PATCH 10/42] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index fc5d0a0b3a7f5..d8b783c460f58 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -19,10 +19,10 @@ package org.apache.spark.internal.io.cloud import java.io.IOException +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory} - import org.apache.spark.internal.io.FileNameSpec import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol @@ -51,7 +51,7 @@ class PathOutputCommitProtocol( jobId: String, dest: String, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable { + extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable { if (dynamicPartitionOverwrite) { // until there's explicit extensions to the PathOutputCommitProtocols From b4d60e4b431ebfcc843f826086b1d09135c823c5 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 22:29:45 +0800 Subject: [PATCH 11/42] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index d8b783c460f58..c6497fc47d16f 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory} + import org.apache.spark.internal.io.FileNameSpec import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol From 2c4180886d06e61304774900c45db3d45c18c39c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 10:22:05 +0800 Subject: [PATCH 12/42] Update CommitterBindingSuite.scala --- .../spark/internal/io/cloud/CommitterBindingSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala index 546f54229ea59..13317b61276dc 100644 --- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala +++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala @@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite { test("local filesystem instantiation") { val instance = FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", false) + jobId, "file:///tmp", new Configuration(), false) val protocol = instance.asInstanceOf[PathOutputCommitProtocol] assert("file:///tmp" === protocol.destination) @@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite { val cause = intercept[InvocationTargetException] { FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", true) + jobId, "file:///tmp", new Configuration(), true) }.getCause if (cause == null || !cause.isInstanceOf[IOException] || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) { From 592682235822113a73befe3247f2b7e1bebbd81f Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:10:21 +0800 Subject: [PATCH 13/42] update --- .../internal/io/FileCommitProtocol.scala | 21 +++++++++++++++++-- .../io/HadoopMapReduceCommitProtocol.scala | 4 +++- .../InsertIntoHadoopFsRelationCommand.scala | 2 +- .../ManifestFileCommitProtocol.scala | 4 +++- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index d6f7370c9a582..73e6abcba867e 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -57,9 +57,26 @@ abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ /** - * The output path of this committer. + * Get the final directory where work will be placed once the job + * is committed. This may be null, in which case, there is no output + * path to write data to. */ - def outputPath: Path + def getOutputPath(): Path = null + + /** + * Get the directory that the task should write results into. + * Warning: there's no guarantee that this work path is on the same + * FS as the final output, or that it's visible across machines. + * May be null. + */ + def getWorkPath(): Path = null + + /** + * Predicate: is there an output path? + */ + def hasOutputPath(): Boolean = { + getOutputPath() != null; + } /** * Setups up a job. Must be called on the driver before any other methods can be invoked. diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 9cdfe851c89d7..aa511d385b78a 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -118,7 +118,9 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } - override def outputPath: Path = { + override def getOutputPath(): Path = new Path(path) + + override def getWorkPath: Path = { if (dynamicPartitionOverwrite) { stagingDir } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index fff93a579a628..1a46b60fe111e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -170,7 +170,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committer.outputPath.toString, customPartitionLocations, outputColumns), + committer.getWorkPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 30709266f1806..09681f882853f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,7 +56,9 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } - override def outputPath: Path = new Path(path) + override def getOutputPath: Path = new Path(path) + + override def getWorkPath(): Path = new Path(path) override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") From 6cdee5868655c972f3aebf9294ae0e1680501447 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:38:22 +0800 Subject: [PATCH 14/42] update --- .../apache/spark/internal/io/FileCommitProtocol.scala | 9 +++++---- .../internal/io/HadoopMapRedCommitProtocol.scala | 2 +- .../internal/io/HadoopMapReduceCommitProtocol.scala | 5 ++--- .../io/FileCommitProtocolInstantiationSuite.scala | 6 +----- .../internal/io/cloud/CommitterBindingSuite.scala | 4 ++-- .../InsertIntoHadoopFsRelationCommand.scala | 8 ++++++-- .../SQLHadoopMapReduceCommitProtocol.scala | 5 ++--- .../sql/execution/datasources/v2/FileWrite.scala | 3 +-- .../sql/execution/streaming/FileStreamSink.scala | 3 +-- .../spark/sql/execution/metric/SQLMetricsSuite.scala | 4 ++-- .../spark/sql/sources/PartitionedWriteSuite.scala | 11 +++++++---- .../spark/sql/hive/execution/SaveAsHiveFile.scala | 3 +-- 12 files changed, 31 insertions(+), 32 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 73e6abcba867e..7029f6f3eb1f1 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -232,8 +232,8 @@ object FileCommitProtocol extends Logging { className: String, jobId: String, outputPath: String, - conf: Configuration = new Configuration(), - dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { + dynamicPartitionOverwrite: Boolean = false, + stagingDir: Option[String] = None): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + s" dynamic=$dynamicPartitionOverwrite") @@ -245,8 +245,9 @@ object FileCommitProtocol extends Logging { val ctor = clazz.getDeclaredConstructor( classOf[String], classOf[String], classOf[Configuration], classOf[Boolean]) logDebug("Using (String, String, Configuration, Boolean) constructor") - ctor.newInstance( - jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + ctor.newInstance(jobId, outputPath, + stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), + dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => logDebug("Falling back to (String, String) constructor") diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index af0aa41518766..30c85e1570d63 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} * Unlike Hadoop's OutputCommitter, this implementation is serializable. */ class HadoopMapRedCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path) { + extends HadoopMapReduceCommitProtocol(jobId, path, "") { override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index aa511d385b78a..20b2be17a305d 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -24,7 +24,6 @@ import scala.collection.mutable import scala.util.Try import org.apache.hadoop.conf.Configurable -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter @@ -68,7 +67,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - conf: Configuration = new Configuration(), + stagingPath: String = "", dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { @@ -106,7 +105,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected var stagingDir: Path = getStagingDir(path, jobId, conf) + protected var stagingDir: Path = new Path(stagingPath) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 1535ca4c6624e..87ffe0b69a469 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.internal.io -import org.apache.hadoop.conf.Configuration - import org.apache.spark.SparkFunSuite /** @@ -83,7 +81,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[ClassicConstructorCommitProtocol].getCanonicalName, "job", "path", - new Configuration(), dynamic).asInstanceOf[ClassicConstructorCommitProtocol] } @@ -98,7 +95,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[FullConstructorCommitProtocol].getCanonicalName, "job", "path", - new Configuration(), dynamic).asInstanceOf[FullConstructorCommitProtocol] } @@ -123,7 +119,7 @@ private class FullConstructorCommitProtocol( arg2: String, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) { def this(arg1: String, arg2: String) = { this(arg1, arg2, false, 2) diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala index 13317b61276dc..546f54229ea59 100644 --- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala +++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala @@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite { test("local filesystem instantiation") { val instance = FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", new Configuration(), false) + jobId, "file:///tmp", false) val protocol = instance.asInstanceOf[PathOutputCommitProtocol] assert("file:///tmp" === protocol.destination) @@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite { val cause = intercept[InvocationTargetException] { FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", new Configuration(), true) + jobId, "file:///tmp", true) }.getCause if (cause == null || !cause.isInstanceOf[IOException] || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 1a46b60fe111e..dc6ad7eb8b95f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -19,7 +19,9 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec @@ -105,12 +107,14 @@ case class InsertIntoHadoopFsRelationCommand( } val jobId = java.util.UUID.randomUUID().toString + val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf, + hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, outputPath = outputPath.toString, - hadoopConf, - dynamicPartitionOverwrite = dynamicPartitionOverwrite) + dynamicPartitionOverwrite = dynamicPartitionOverwrite, + stagingDir = Some(tmpLocation.toString)) val doInsertion = if (mode == SaveMode.Append) { true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index dde68006402d7..680c6d63d38ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.datasources -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter @@ -33,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - conf: Configuration, + stagingPath: String = "", dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala index d822b1bb6ef12..ccc467aae1f15 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala @@ -61,8 +61,7 @@ trait FileWrite extends Write { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = paths.head, - hadoopConf) + outputPath = paths.head) lazy val description = createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 8db2ce027fdfd..5058a1dfc3baf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -150,8 +150,7 @@ class FileStreamSink( val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, - outputPath = path, - sparkSession.sessionState.newHadoopConf()) + outputPath = path) committer match { case manifestCommitter: ManifestFileCommitProtocol => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 9638924b394db..46d44e3b1a95f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,7 +22,6 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -834,9 +833,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils case class CustomFileCommitProtocol( jobId: String, path: String, + stagingPath: String, dynamicPartitionOverwrite: Boolean = false) extends SQLHadoopMapReduceCommitProtocol( - jobId, path, new Configuration(), dynamicPartitionOverwrite) { + jobId, path, stagingPath, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index df41b93f45fad..96ab46df1f50c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.sources import java.io.File import java.sql.Timestamp -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} @@ -35,8 +34,11 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils -private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration()) +private class OnlyDetectCustomPathFileCommitProtocol( + jobId: String, + path: String, + stagingPath: String) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -199,9 +201,10 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { private class PartitionFileExistCommitProtocol( jobId: String, path: String, + stagingPath: String, dynamicPartitionOverwrite: Boolean) extends SQLHadoopMapReduceCommitProtocol( - jobId, path, new Configuration(), dynamicPartitionOverwrite) { + jobId, path, stagingPath, dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 734a0aa64df2a..8c6c10fee7935 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -83,8 +83,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputLocation, - hadoopConf) + outputPath = outputLocation) val options = bucketSpec .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true")) From 824ec0474c5df84fa589394e0f82f7d57e65f224 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:39:50 +0800 Subject: [PATCH 15/42] Update FileCommitProtocol.scala --- .../scala/org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 7029f6f3eb1f1..4050edfc61481 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -243,7 +243,7 @@ object FileCommitProtocol extends Logging { // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[Configuration], classOf[Boolean]) + classOf[String], classOf[String], classOf[String], classOf[Boolean]) logDebug("Using (String, String, Configuration, Boolean) constructor") ctor.newInstance(jobId, outputPath, stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), From 8c8a174814c9e38cdb790ca77975ce3cd7d64264 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:48:25 +0800 Subject: [PATCH 16/42] update --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 6 ------ .../apache/spark/sql/sources/PartitionedWriteSuite.scala | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 4050edfc61481..23f640b40af87 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -27,7 +27,6 @@ import org.apache.hadoop.mapreduce._ import org.apache.spark.annotation.Unstable import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.util.Utils @@ -259,11 +258,6 @@ object FileCommitProtocol extends Logging { } } - def getStagingDir(path: String, jobId: String, conf: Configuration): Path = { - newVersionExternalTempPath( - new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) - } - def newVersionExternalTempPath( path: Path, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 96ab46df1f50c..0fb0d85ced2b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -37,8 +37,9 @@ import org.apache.spark.util.Utils private class OnlyDetectCustomPathFileCommitProtocol( jobId: String, path: String, - stagingPath: String) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath) + stagingPath: String, + dynamicPartitionOverwrite: Boolean) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) with Serializable with Logging { override def newTaskTempFileAbsPath( From 8d7ce6e58800791042a4a3f12c16327abbbe903b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:51:54 +0800 Subject: [PATCH 17/42] update --- .../apache/spark/internal/io/HadoopMapRedCommitProtocol.scala | 4 +++- .../datasources/SQLHadoopMapReduceCommitProtocol.scala | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index 30c85e1570d63..d250601349429 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -17,6 +17,7 @@ package org.apache.spark.internal.io +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} @@ -27,7 +28,8 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} * Unlike Hadoop's OutputCommitter, this implementation is serializable. */ class HadoopMapRedCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path, "") { + extends HadoopMapReduceCommitProtocol( + jobId, path, new Path(path, ".spark-staging-" + jobId).toString) { override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 680c6d63d38ed..7ed76643a4eb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String = "", + stagingPath: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) with Serializable with Logging { From da6a0b9ccbd06bc1f08558e74d00facf2bf487f8 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:58:41 +0800 Subject: [PATCH 18/42] update --- .../spark/internal/io/HadoopMapRedCommitProtocol.scala | 4 +--- .../internal/io/HadoopMapReduceCommitProtocol.scala | 9 ++++++++- .../io/FileCommitProtocolInstantiationSuite.scala | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index d250601349429..af0aa41518766 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -17,7 +17,6 @@ package org.apache.spark.internal.io -import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} @@ -28,8 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} * Unlike Hadoop's OutputCommitter, this implementation is serializable. */ class HadoopMapRedCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol( - jobId, path, new Path(path, ".spark-staging-" + jobId).toString) { + extends HadoopMapReduceCommitProtocol(jobId, path) { override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 20b2be17a305d..6f2b73d34c0c6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -67,10 +67,17 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String = "", + stagingPath: String, dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { + def this(jobId: String, path: String) = + this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString) + + def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = + this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString, + dynamicPartitionOverwrite) + import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 87ffe0b69a469..77f42577517e8 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -119,7 +119,7 @@ private class FullConstructorCommitProtocol( arg2: String, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { def this(arg1: String, arg2: String) = { this(arg1, arg2, false, 2) From 63e466c5ade9078489e0990acdd5c5f1dd97ac10 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 18:07:59 +0800 Subject: [PATCH 19/42] update --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- .../internal/io/FileCommitProtocolInstantiationSuite.scala | 6 ++++-- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 3 +-- .../apache/spark/sql/execution/metric/SQLMetricsSuite.scala | 3 +-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 23f640b40af87..daa6addc0a9f6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging { classOf[String], classOf[String], classOf[String], classOf[Boolean]) logDebug("Using (String, String, Configuration, Boolean) constructor") ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), + stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString), dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 77f42577517e8..778f748f83950 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -58,7 +58,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[Other].getCanonicalName, "job", - "path") + "path", + false) } } @@ -67,7 +68,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[NoMatchingArgs].getCanonicalName, "job", - "path") + "path", + false) } } diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index c6497fc47d16f..fc5d0a0b3a7f5 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -19,7 +19,6 @@ package org.apache.spark.internal.io.cloud import java.io.IOException -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory} @@ -52,7 +51,7 @@ class PathOutputCommitProtocol( jobId: String, dest: String, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable { + extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable { if (dynamicPartitionOverwrite) { // until there's explicit extensions to the PathOutputCommitProtocols diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 46d44e3b1a95f..f90c5064b0fcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -835,8 +835,7 @@ case class CustomFileCommitProtocol( path: String, stagingPath: String, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol( - jobId, path, stagingPath, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) From fd2ac5f62e28959ce4c6bd5f92718a5eba19c213 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 18:18:37 +0800 Subject: [PATCH 20/42] update --- .../spark/internal/io/FileCommitProtocol.scala | 8 ++++---- .../io/HadoopMapReduceCommitProtocol.scala | 16 ++++++---------- .../InsertIntoHadoopFsRelationCommand.scala | 2 +- .../SQLHadoopMapReduceCommitProtocol.scala | 4 ++-- .../sql/execution/metric/SQLMetricsSuite.scala | 5 +++-- .../sql/sources/PartitionedWriteSuite.scala | 9 ++++----- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index daa6addc0a9f6..151fcdbec17a5 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -232,7 +232,7 @@ object FileCommitProtocol extends Logging { jobId: String, outputPath: String, dynamicPartitionOverwrite: Boolean = false, - stagingDir: Option[String] = None): FileCommitProtocol = { + stagingDir: Option[Path] = None): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + s" dynamic=$dynamicPartitionOverwrite") @@ -242,10 +242,10 @@ object FileCommitProtocol extends Logging { // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[String], classOf[Boolean]) - logDebug("Using (String, String, Configuration, Boolean) constructor") + classOf[String], classOf[String], classOf[Path], classOf[Boolean]) + logDebug("Using (String, String, Path, Boolean) constructor") ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString), + stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 6f2b73d34c0c6..27e155c5733c7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -40,6 +40,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop + * @param stagingDir The staging directory of this write job. Spark uses it to deal with + * files with absolute output path, or writing data into partitioned directory + * with dynamicPartitionOverwrite=true * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime * dynamically. Suppose final path is /path/to/outputPath, output * path of [[FileOutputCommitter]] is an intermediate path, e.g. @@ -67,16 +70,15 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { def this(jobId: String, path: String) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString) + this(jobId, path, new Path(path, ".spark-staging-" + jobId)) def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString, - dynamicPartitionOverwrite) + this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite) import FileCommitProtocol._ @@ -108,12 +110,6 @@ class HadoopMapReduceCommitProtocol( */ @transient private var partitionPaths: mutable.Set[String] = null - /** - * The staging directory of this write job. Spark uses it to deal with files with absolute output - * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. - */ - protected var stagingDir: Path = new Path(stagingPath) - protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() // If OutputFormat is Configurable, we should set conf to it. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index dc6ad7eb8b95f..6605bb6c1e870 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -114,7 +114,7 @@ case class InsertIntoHadoopFsRelationCommand( jobId = jobId, outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite, - stagingDir = Some(tmpLocation.toString)) + stagingDir = Some(tmpLocation)) val doInsertion = if (mode == SaveMode.Append) { true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 7ed76643a4eb2..34ce32e710c48 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -32,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index f90c5064b0fcc..08c86e6edbb08 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -833,9 +834,9 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils case class CustomFileCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 0fb0d85ced2b9..4fffe8f560345 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -37,9 +37,9 @@ import org.apache.spark.util.Utils private class OnlyDetectCustomPathFileCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -202,10 +202,9 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { private class PartitionFileExistCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol( - jobId, path, stagingPath, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") From e2951f128b91574e8019bc9e8f3a1bf666f73008 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 08:33:31 +0800 Subject: [PATCH 21/42] fix UT --- .../spark/internal/io/FileCommitProtocol.scala | 9 ++++++++- .../internal/io/HadoopMapReduceCommitProtocol.scala | 5 +++-- .../io/FileCommitProtocolInstantiationSuite.scala | 13 ++++++++----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 151fcdbec17a5..ab6dbd97f11d5 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging { classOf[String], classOf[String], classOf[Path], classOf[Boolean]) logDebug("Using (String, String, Path, Boolean) constructor") ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), + stagingDir.getOrElse(getStagingDir(outputPath, jobId)), dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => @@ -258,6 +258,13 @@ object FileCommitProtocol extends Logging { } } + def getStagingDir(path: String, jobId: String): Path = { + try { + new Path(path, ".spark-staging-" + jobId) + } catch { + case _: Exception => null + } + } def newVersionExternalTempPath( path: Path, diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 27e155c5733c7..4c8dd50c146cc 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir import org.apache.spark.mapred.SparkHadoopMapRedUtil /** @@ -75,10 +76,10 @@ class HadoopMapReduceCommitProtocol( extends FileCommitProtocol with Serializable with Logging { def this(jobId: String, path: String) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId)) + this(jobId, path, getStagingDir(path, jobId)) def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite) + this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) import FileCommitProtocol._ diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 778f748f83950..26f92cb669b6b 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.internal.io +import org.apache.hadoop.fs.Path + import org.apache.spark.SparkFunSuite /** @@ -119,16 +121,17 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String) private class FullConstructorCommitProtocol( arg1: String, arg2: String, + arg3: Path, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) { - def this(arg1: String, arg2: String) = { - this(arg1, arg2, false, 2) + def this(arg1: String, arg2: String, arg3: Path) = { + this(arg1, arg2, arg3, false, 2) } - def this(arg1: String, arg2: String, b: Boolean) = { - this(arg1, arg2, false, 3) + def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = { + this(arg1, arg2, arg3, false, 3) } } From c8d0c33b25cf06409798bdc18417887e0f1d1d1c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 08:49:20 +0800 Subject: [PATCH 22/42] complicated --- .../internal/io/FileCommitProtocol.scala | 21 +++++++++++++------ ...FileCommitProtocolInstantiationSuite.scala | 13 +++++------- .../SQLHadoopMapReduceCommitProtocol.scala | 7 +++++++ .../sql/sources/PartitionedWriteSuite.scala | 11 +++------- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index ab6dbd97f11d5..7c67e3441b33a 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -249,12 +249,21 @@ object FileCommitProtocol extends Logging { dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => - logDebug("Falling back to (String, String) constructor") - require(!dynamicPartitionOverwrite, - "Dynamic Partition Overwrite is enabled but" + - s" the committer ${className} does not have the appropriate constructor") - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) + try { + logDebug("Falling back to (String, String, Boolean) constructor") + val ctor = clazz.getDeclaredConstructor( + classOf[String], classOf[String], classOf[Boolean]) + ctor.newInstance(jobId, outputPath, + dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + } catch { + case _: NoSuchMethodException => + logDebug("Falling back to (String, String) constructor") + require(!dynamicPartitionOverwrite, + "Dynamic Partition Overwrite is enabled but" + + s" the committer ${className} does not have the appropriate constructor") + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) + } } } diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 26f92cb669b6b..778f748f83950 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.internal.io -import org.apache.hadoop.fs.Path - import org.apache.spark.SparkFunSuite /** @@ -121,17 +119,16 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String) private class FullConstructorCommitProtocol( arg1: String, arg2: String, - arg3: Path, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { - def this(arg1: String, arg2: String, arg3: Path) = { - this(arg1, arg2, arg3, false, 2) + def this(arg1: String, arg2: String) = { + this(arg1, arg2, false, 2) } - def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = { - this(arg1, arg2, arg3, false, 3) + def this(arg1: String, arg2: String, b: Boolean) = { + this(arg1, arg2, false, 3) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 34ce32e710c48..a6ecffdcaeb5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -22,6 +22,7 @@ import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf @@ -37,6 +38,12 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { + def this(jobId: String, path: String) = + this(jobId, path, getStagingDir(path, jobId), false) + + def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = + this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) + override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 4fffe8f560345..b9266429f81a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -34,12 +34,8 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils -private class OnlyDetectCustomPathFileCommitProtocol( - jobId: String, - path: String, - stagingDir: Path, - dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) +private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String) + extends SQLHadoopMapReduceCommitProtocol(jobId, path) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -202,9 +198,8 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { private class PartitionFileExistCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") From eafb8dd5b81c587280adddcd9183ee26b2e08c75 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 10:49:26 +0800 Subject: [PATCH 23/42] update --- .../internal/io/FileCommitProtocol.scala | 20 ++++++++++--------- .../io/HadoopMapReduceCommitProtocol.scala | 18 ++++------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 7c67e3441b33a..b32a3979a8d5f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -21,6 +21,8 @@ import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} +import scala.util.Try + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ @@ -71,7 +73,12 @@ abstract class FileCommitProtocol extends Logging { def getWorkPath(): Path = null /** - * Predicate: is there an output path? + * Checks whether there are files to be committed to a valid output location. + * + * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, + * it is necessary to check whether a valid output path is specified. + * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for + * committers not writing to distributed file systems. */ def hasOutputPath(): Boolean = { getOutputPath() != null; @@ -268,11 +275,7 @@ object FileCommitProtocol extends Logging { } def getStagingDir(path: String, jobId: String): Path = { - try { - new Path(path, ".spark-staging-" + jobId) - } catch { - case _: Exception => null - } + Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null) } def newVersionExternalTempPath( @@ -290,15 +293,14 @@ object FileCommitProtocol extends Logging { } } - private def getExtTmpPathRelTo( path: Path, hadoopConf: Configuration, stagingDir: String, engineType: String, jobId: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), - "-ext-10000") // Hive uses 10000 + // Hive uses 10000 + new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), "-ext-10000") } private def getExternalScratchDir( diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 4c8dd50c146cc..fff87da244c0c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -86,16 +86,6 @@ class HadoopMapReduceCommitProtocol( /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @transient private var committer: OutputCommitter = _ - /** - * Checks whether there are files to be committed to a valid output location. - * - * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, - * it is necessary to check whether a valid output path is specified. - * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for - * committers not writing to distributed file systems. - */ - private val hasValidPath = Try { new Path(path) }.isSuccess - /** * Tracks files staged by this task for absolute output paths. These outputs are not managed by * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. @@ -121,13 +111,13 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } - override def getOutputPath(): Path = new Path(path) + override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null) override def getWorkPath: Path = { if (dynamicPartitionOverwrite) { stagingDir } else { - new Path(path) + getOutputPath() } } @@ -206,7 +196,7 @@ class HadoopMapReduceCommitProtocol( override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - if (hasValidPath) { + if (hasOutputPath()) { val (allAbsPathFiles, allPartitionPaths) = taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip val fs = stagingDir.getFileSystem(jobContext.getConfiguration) @@ -270,7 +260,7 @@ class HadoopMapReduceCommitProtocol( logWarning(s"Exception while aborting ${jobContext.getJobID}", e) } try { - if (hasValidPath) { + if (hasOutputPath()) { val fs = stagingDir.getFileSystem(jobContext.getConfiguration) fs.delete(stagingDir, true) } From ff2bfb89ea40c2626b430aa6c06614e6a9228503 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 11:30:10 +0800 Subject: [PATCH 24/42] revert API change --- .../internal/io/FileCommitProtocol.scala | 27 ------------------- .../io/HadoopMapReduceCommitProtocol.scala | 24 ++++++++--------- .../InsertIntoHadoopFsRelationCommand.scala | 10 ++++++- .../ManifestFileCommitProtocol.scala | 4 --- .../execution/metric/SQLMetricsSuite.scala | 4 +-- 5 files changed, 22 insertions(+), 47 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index b32a3979a8d5f..f2f26297e02c7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -57,33 +57,6 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ - /** - * Get the final directory where work will be placed once the job - * is committed. This may be null, in which case, there is no output - * path to write data to. - */ - def getOutputPath(): Path = null - - /** - * Get the directory that the task should write results into. - * Warning: there's no guarantee that this work path is on the same - * FS as the final output, or that it's visible across machines. - * May be null. - */ - def getWorkPath(): Path = null - - /** - * Checks whether there are files to be committed to a valid output location. - * - * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, - * it is necessary to check whether a valid output path is specified. - * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for - * committers not writing to distributed file systems. - */ - def hasOutputPath(): Boolean = { - getOutputPath() != null; - } - /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index fff87da244c0c..35bad427253ed 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -86,6 +86,16 @@ class HadoopMapReduceCommitProtocol( /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @transient private var committer: OutputCommitter = _ + /** + * Checks whether there are files to be committed to a valid output location. + * + * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, + * it is necessary to check whether a valid output path is specified. + * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for + * committers not writing to distributed file systems. + */ + private val hasValidPath = Try { new Path(path) }.isSuccess + /** * Tracks files staged by this task for absolute output paths. These outputs are not managed by * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. @@ -111,16 +121,6 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } - override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null) - - override def getWorkPath: Path = { - if (dynamicPartitionOverwrite) { - stagingDir - } else { - getOutputPath() - } - } - override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { newTaskTempFile(taskContext, dir, FileNameSpec("", ext)) @@ -196,7 +196,7 @@ class HadoopMapReduceCommitProtocol( override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - if (hasOutputPath()) { + if (hasValidPath) { val (allAbsPathFiles, allPartitionPaths) = taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip val fs = stagingDir.getFileSystem(jobContext.getConfiguration) @@ -260,7 +260,7 @@ class HadoopMapReduceCommitProtocol( logWarning(s"Exception while aborting ${jobContext.getJobID}", e) } try { - if (hasOutputPath()) { + if (hasValidPath) { val fs = stagingDir.getFileSystem(jobContext.getConfiguration) fs.delete(stagingDir, true) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 6605bb6c1e870..e0461766097c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -167,6 +167,14 @@ case class InsertIntoHadoopFsRelationCommand( } } + // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files + // will be renamed from staging path to final output path during commit job + val committerOutputPath = if (dynamicPartitionOverwrite) { + tmpLocation + } else { + qualifiedOutputPath + } + val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -174,7 +182,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committer.getWorkPath.toString, customPartitionLocations, outputColumns), + committerOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 09681f882853f..46ce33687890d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,10 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } - override def getOutputPath: Path = new Path(path) - - override def getWorkPath(): Path = new Path(path) - override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 08c86e6edbb08..32428fbde0016 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,7 +22,6 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random -import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -834,9 +833,8 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils case class CustomFileCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) From c5a1d16d55591edfc539490d193a77eb38a80ea4 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 11:36:11 +0800 Subject: [PATCH 25/42] update --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- .../datasources/InsertIntoHadoopFsRelationCommand.scala | 3 +-- .../org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index f2f26297e02c7..c421c44414b7f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -251,7 +251,7 @@ object FileCommitProtocol extends Logging { Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null) } - def newVersionExternalTempPath( + def externalTempPath( path: Path, hadoopConf: Configuration, stagingDir: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index e0461766097c6..45ba105c501fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -21,7 +21,6 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.FileCommitProtocol -import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec @@ -107,7 +106,7 @@ case class InsertIntoHadoopFsRelationCommand( } val jobId = java.util.UUID.randomUUID().toString - val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf, + val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf, hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 8c6c10fee7935..c3e9deacd7e80 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -134,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - val externalTempPath = FileCommitProtocol.newVersionExternalTempPath( + val externalTempPath = FileCommitProtocol.externalTempPath( path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) createdTempDir = Some(externalTempPath.getParent) externalTempPath From 8b93bad3608cbf20992beec4451c0f6f20183269 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 19 Oct 2021 23:40:46 +0800 Subject: [PATCH 26/42] Update package.scala --- .../main/scala/org/apache/spark/internal/config/package.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 8e24105d7742b..64faac9b4a53d 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -2263,7 +2263,8 @@ package object config { ConfigBuilder("spark.exec.stagingDir") .doc("The staging directory of Spark job. Spark uses it to deal with files with " + "absolute output path, or writing data into partitioned directory when " + - "dynamic partition overwrite mode.") + "dynamic partition overwrite mode. " + + "Default value means staging dir is under table path.") .version("3.3.0") .internal() .stringConf From a0e0ec9d4cc9a863f1691f991ed151a06a110d9c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 20 Oct 2021 16:23:18 +0800 Subject: [PATCH 27/42] follow comment --- .../internal/io/FileCommitProtocol.scala | 52 +++++++++---------- .../io/HadoopMapReduceCommitProtocol.scala | 27 ++++++---- .../InsertIntoHadoopFsRelationCommand.scala | 16 +----- .../SQLHadoopMapReduceCommitProtocol.scala | 16 +++--- 4 files changed, 51 insertions(+), 60 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index c421c44414b7f..8c96fcbe8a44d 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -21,8 +21,6 @@ import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} -import scala.util.Try - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ @@ -57,6 +55,21 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ + /** + * Get the final directory where work will be placed once the job + * is org.apache.hadoop.shaded.com.itted. This may be null, in which case, there is no output + * path to write data to. + */ + def getOutputPath: Path = null + + /** + * Get the directory that the task should write results into. + * Warning: there's no guarantee that this work path is on the same + * FS as the final output, or that it's visible across machines. + * May be null. + */ + def getWorkPath: Path = null + /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ @@ -211,8 +224,7 @@ object FileCommitProtocol extends Logging { className: String, jobId: String, outputPath: String, - dynamicPartitionOverwrite: Boolean = false, - stagingDir: Option[Path] = None): FileCommitProtocol = { + dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + s" dynamic=$dynamicPartitionOverwrite") @@ -221,34 +233,22 @@ object FileCommitProtocol extends Logging { // dynamicPartitionOverwrite: Boolean). // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { - val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[Path], classOf[Boolean]) - logDebug("Using (String, String, Path, Boolean) constructor") - ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(getStagingDir(outputPath, jobId)), - dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) + logDebug("Using (String, String, Boolean) constructor") + ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => - try { - logDebug("Falling back to (String, String, Boolean) constructor") - val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[Boolean]) - ctor.newInstance(jobId, outputPath, - dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) - } catch { - case _: NoSuchMethodException => - logDebug("Falling back to (String, String) constructor") - require(!dynamicPartitionOverwrite, - "Dynamic Partition Overwrite is enabled but" + - s" the committer ${className} does not have the appropriate constructor") - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) - } + logDebug("Falling back to (String, String) constructor") + require(!dynamicPartitionOverwrite, + "Dynamic Partition Overwrite is enabled but" + + s" the committer ${className} does not have the appropriate constructor") + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) } } def getStagingDir(path: String, jobId: String): Path = { - Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null) + new Path(path, ".spark-staging-" + jobId) } def externalTempPath( diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 35bad427253ed..33f5249c936df 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -30,7 +30,6 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir import org.apache.spark.mapred.SparkHadoopMapRedUtil /** @@ -41,9 +40,6 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop - * @param stagingDir The staging directory of this write job. Spark uses it to deal with - * files with absolute output path, or writing data into partitioned directory - * with dynamicPartitionOverwrite=true * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime * dynamically. Suppose final path is /path/to/outputPath, output * path of [[FileOutputCommitter]] is an intermediate path, e.g. @@ -71,16 +67,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { - def this(jobId: String, path: String) = - this(jobId, path, getStagingDir(path, jobId)) - - def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) - import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @@ -111,6 +100,12 @@ class HadoopMapReduceCommitProtocol( */ @transient private var partitionPaths: mutable.Set[String] = null + /** + * The staging directory of this write job. Spark uses it to deal with files with absolute output + * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. + */ + protected val stagingDir = getStagingDir(path, jobId) + protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() // If OutputFormat is Configurable, we should set conf to it. @@ -121,6 +116,16 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } + override def getOutputPath: Path = { + if (dynamicPartitionOverwrite) { + stagingDir + } else { + new Path(path) + } + } + + override def getWorkPath: Path = getOutputPath + override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { newTaskTempFile(taskContext, dir, FileNameSpec("", ext)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 45ba105c501fa..f2a8c2372b551 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} @@ -106,14 +105,11 @@ case class InsertIntoHadoopFsRelationCommand( } val jobId = java.util.UUID.randomUUID().toString - val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf, - hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, outputPath = outputPath.toString, - dynamicPartitionOverwrite = dynamicPartitionOverwrite, - stagingDir = Some(tmpLocation)) + dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { true @@ -166,14 +162,6 @@ case class InsertIntoHadoopFsRelationCommand( } } - // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files - // will be renamed from staging path to final output path during commit job - val committerOutputPath = if (dynamicPartitionOverwrite) { - tmpLocation - } else { - qualifiedOutputPath - } - val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -181,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committerOutputPath.toString, customPartitionLocations, outputColumns), + committer.getOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index a6ecffdcaeb5a..8430a77716630 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,9 +21,10 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir -import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol +import org.apache.spark.internal.config.EXEC_STAGING_DIR +import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol} import org.apache.spark.sql.internal.SQLConf /** @@ -33,16 +34,13 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - def this(jobId: String, path: String) = - this(jobId, path, getStagingDir(path, jobId), false) - - def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) + override val stagingDir: Path = + FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, + SQLConf.get.getConfString(EXEC_STAGING_DIR.key), "spark", jobId) override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) From 872cd01a9741488f76f8983901b52820946afc46 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 20 Oct 2021 16:26:08 +0800 Subject: [PATCH 28/42] update --- .../apache/spark/internal/io/FileCommitProtocol.scala | 9 +++++---- .../internal/io/HadoopMapReduceCommitProtocol.scala | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 8c96fcbe8a44d..bdbef8d63bfbc 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -56,9 +56,9 @@ abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ /** - * Get the final directory where work will be placed once the job - * is org.apache.hadoop.shaded.com.itted. This may be null, in which case, there is no output - * path to write data to. + * Get the final directory where the result data will be placed once the job + * is committed. This may be null, in which case, there is no output + * path to write data to and won't write any data. */ def getOutputPath: Path = null @@ -66,7 +66,8 @@ abstract class FileCommitProtocol extends Logging { * Get the directory that the task should write results into. * Warning: there's no guarantee that this work path is on the same * FS as the final output, or that it's visible across machines. - * May be null. + * May be null, in which case, there is no output path to write data to + * and won't write any data. */ def getWorkPath: Path = null diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 33f5249c936df..e40de79fc024e 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -104,7 +104,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected val stagingDir = getStagingDir(path, jobId) + protected def stagingDir = getStagingDir(path, jobId) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() From d664932164333398fd0a9b9c979b9907d58d8e02 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 20 Oct 2021 18:36:17 +0800 Subject: [PATCH 29/42] update --- .../org/apache/spark/internal/config/package.scala | 11 ----------- .../org/apache/spark/sql/internal/SQLConf.scala | 13 +++++++++++++ .../SQLHadoopMapReduceCommitProtocol.scala | 4 ++-- .../spark/sql/sources/StagingInsertSuite.scala | 3 +-- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 64faac9b4a53d..4b7ba9b19f4de 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -2258,15 +2258,4 @@ package object config { .version("3.2.0") .stringConf .createOptional - - val EXEC_STAGING_DIR = - ConfigBuilder("spark.exec.stagingDir") - .doc("The staging directory of Spark job. Spark uses it to deal with files with " + - "absolute output path, or writing data into partitioned directory when " + - "dynamic partition overwrite mode. " + - "Default value means staging dir is under table path.") - .version("3.3.0") - .internal() - .stringConf - .createWithDefault(".spark-staging") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 98aad1c9c83f0..ccffc89f6d98d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3394,6 +3394,17 @@ object SQLConf { .booleanConf .createWithDefault(false) + val EXEC_STAGING_DIR = + ConfigBuilder("spark.sql.exec.stagingDir") + .doc("The staging directory of Spark job. Spark uses it to deal with files with " + + "absolute output path, or writing data into partitioned directory when " + + "dynamic partition overwrite mode. " + + "Default value means staging dir is under table path.") + .version("3.3.0") + .internal() + .stringConf + .createWithDefault(".spark-staging") + /** * Holds information about keys that have been deprecated. * @@ -4115,6 +4126,8 @@ class SQLConf extends Serializable with Logging { def inferDictAsStruct: Boolean = getConf(SQLConf.INFER_NESTED_DICT_AS_STRUCT) + def stagingDir: String = getConf(SQLConf.EXEC_STAGING_DIR) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 8430a77716630..337fd34c37df2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -23,9 +23,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.EXEC_STAGING_DIR /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -40,7 +40,7 @@ class SQLHadoopMapReduceCommitProtocol( override val stagingDir: Path = FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, - SQLConf.get.getConfString(EXEC_STAGING_DIR.key), "spark", jobId) + SQLConf.get.stagingDir, "spark", jobId) override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala index 162cef0cca3f8..bd935d2567269 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.sources import org.apache.spark.SparkConf -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -30,7 +29,7 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession { val stagingDir = Utils.createTempDir() override def sparkConf: SparkConf = - super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath) + super.sparkConf.set(SQLConf.EXEC_STAGING_DIR, stagingDir.getAbsolutePath) override def beforeAll(): Unit = { super.beforeAll() From 997ba25dcaf52d3417739b1517f0038d05139cdb Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 20 Oct 2021 21:29:23 +0800 Subject: [PATCH 30/42] Update InsertSuite.scala --- .../org/apache/spark/sql/hive/InsertSuite.scala | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index 1dfeedcd9a18b..2289d90e7cd06 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -543,28 +543,23 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val inputPath = new Path("/tmp/b/c") var stagingDir = "tmp/b" val id = TaskRunner.getTaskRunnerID - var path = FileCommitProtocol. - getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) + var path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b_hive_") != -1) stagingDir = "tmp/b/c" - path = FileCommitProtocol.getStagingDir( - inputPath, conf, stagingDir, "hive", id.toString) + path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = "d/e" - path = FileCommitProtocol.getStagingDir( - inputPath, conf, stagingDir, "hive", id.toString) + path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = ".d/e" - path = FileCommitProtocol.getStagingDir( - inputPath, conf, stagingDir, "hive", id.toString) + path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1) stagingDir = "/tmp/c/" - path = FileCommitProtocol.getStagingDir( - inputPath, conf, stagingDir, "hive", id.toString) + path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/c_hive_") != -1) } From 44d63de2f83fb37887c32ebc2d8cc789f1ceed9b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 21 Oct 2021 00:05:57 +0800 Subject: [PATCH 31/42] Update SQLHadoopMapReduceCommitProtocol.scala --- .../execution/datasources/SQLHadoopMapReduceCommitProtocol.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 337fd34c37df2..9e0b2a1a00552 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -25,7 +25,6 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.EXEC_STAGING_DIR /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual From ed588842aa47337c40efd93618d436d025a0c7ac Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 21 Oct 2021 10:39:27 +0800 Subject: [PATCH 32/42] Update SQLConf.scala --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 19da5300e9e30..d87971966b4cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3412,8 +3412,7 @@ object SQLConf { .booleanConf .createWithDefault(false) - val EXEC_STAGING_DIR = - ConfigBuilder("spark.sql.exec.stagingDir") + val EXEC_STAGING_DIR = buildConf("spark.sql.exec.stagingDir") .doc("The staging directory of Spark job. Spark uses it to deal with files with " + "absolute output path, or writing data into partitioned directory when " + "dynamic partition overwrite mode. " + From 632d7255c4ddb1aeb367ed6c0123121348975e7e Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Sun, 24 Oct 2021 01:50:12 +0800 Subject: [PATCH 33/42] Update SQLConf.scala --- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index d87971966b4cb..ba7ff98dc8018 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3414,12 +3414,13 @@ object SQLConf { val EXEC_STAGING_DIR = buildConf("spark.sql.exec.stagingDir") .doc("The staging directory of Spark job. Spark uses it to deal with files with " + - "absolute output path, or writing data into partitioned directory when " + - "dynamic partition overwrite mode. " + - "Default value means staging dir is under table path.") + "absolute output path, or writing data into partitioned directory " + + "when dynamic partition overwrite mode is on. " + + "Default value means staging directory is under table path.") .version("3.3.0") .internal() .stringConf + .checkValue(!_.isEmpty, "Should not pass an empty string as staging diretory.") .createWithDefault(".spark-staging") val LEGACY_USE_V1_COMMAND = From cc8a4c3f6a315023a0d749d69580ba2cce3e2e64 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 10 Mar 2022 10:18:57 +0800 Subject: [PATCH 34/42] Update SQLConf.scala --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 88d1f9883c0e9..7f66b6986cee4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3562,7 +3562,7 @@ object SQLConf { .version("3.3.0") .internal() .stringConf - .checkValue(!_.isEmpty, "Should not pass an empty string as staging diretory.") + .checkValue(!_.isEmpty, "Should not pass an empty string as staging directory.") .createWithDefault(".spark-staging") val LEGACY_USE_V1_COMMAND = From 1c5a70a2c9e2e3a00a198119eb8047c1b7213465 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 10 Mar 2022 10:56:18 +0800 Subject: [PATCH 35/42] Update FileCommitProtocol.scala --- .../scala/org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 096d139a2a0bf..b33899936b7d6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -321,7 +321,7 @@ object FileCommitProtocol extends Logging { val dir = fs.makeQualified( new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) + logDebug(s"Created staging dir = $dir for path = $inputPath") dir } From 1e8424ecbd56b4e14ccdd2e9ff161a5da92237d5 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 10 Mar 2022 11:49:20 +0800 Subject: [PATCH 36/42] Update SQLHadoopMapReduceCommitProtocol.scala --- .../datasources/SQLHadoopMapReduceCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 9e0b2a1a00552..192fbe6e94e39 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -37,7 +37,7 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - override val stagingDir: Path = + @transient override val stagingDir: Path = FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, SQLConf.get.stagingDir, "spark", jobId) From 3da83f1bca65e84ef9dca79308026c1a95b8ed57 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 10 Mar 2022 14:19:59 +0800 Subject: [PATCH 37/42] Update SQLHadoopMapReduceCommitProtocol.scala --- .../datasources/SQLHadoopMapReduceCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 192fbe6e94e39..8f682d0fb005a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -37,7 +37,7 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - @transient override val stagingDir: Path = + @transient override lazy val stagingDir: Path = FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, SQLConf.get.stagingDir, "spark", jobId) From 0a729f0e5345958bd7bbf618e5cb2c9e32f01faa Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 14 Mar 2022 13:47:35 +0800 Subject: [PATCH 38/42] update --- .../internal/io/FileCommitProtocol.scala | 25 ++++++++++++++++--- .../SQLHadoopMapReduceCommitProtocol.scala | 2 +- .../sql/hive/execution/SaveAsHiveFile.scala | 4 +-- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index b33899936b7d6..cb24e9ec14fb7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -216,6 +216,9 @@ abstract class FileCommitProtocol extends Logging { object FileCommitProtocol extends Logging { + val USING_SPARK_COMMIT_METHOD = "spark" + val USING_HIVE_COMMIT_METHOD = "hive" + class TaskCommitMessage(val obj: Any) extends Serializable object EmptyTaskCommitMessage extends TaskCommitMessage(null) @@ -262,7 +265,8 @@ object FileCommitProtocol extends Logging { jobId: String): Path = { val extURI = path.toUri if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) + new Path(getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") } else { new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), "-ext-10000") @@ -275,8 +279,7 @@ object FileCommitProtocol extends Logging { stagingDir: String, engineType: String, jobId: String): Path = { - // Hive uses 10000 - new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), "-ext-10000") + getStagingDir(path, hadoopConf, stagingDir, engineType, jobId) } private def getExternalScratchDir( @@ -321,6 +324,18 @@ object FileCommitProtocol extends Logging { val dir = fs.makeQualified( new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) + + if (engineType == "SPARK") { + val stagingFS = dir.getFileSystem(hadoopConf) + // SPARK-36579: Current SQLHadoopMapReduceCommitProtocol's dynamic partition overwriting uses + // rename operation to move partition's directories. This operation is not supported between + // different FileSystems. + if (equalsFileSystem(fs, stagingFS)) { + logDebug(s"The staging dir '$stagingPathName' should be in a same filesystem " + + s"with table location if we set `spark.sql.exec.stagingDir` under the table " + + "directory.") + } + } logDebug(s"Created staging dir = $dir for path = $inputPath") dir } @@ -336,6 +351,10 @@ object FileCommitProtocol extends Logging { val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) } + + def equalsFileSystem(fs1: FileSystem, fs2: FileSystem): Boolean = { + fs1.getUri == fs2.getUri + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 8f682d0fb005a..134910dcd75cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -39,7 +39,7 @@ class SQLHadoopMapReduceCommitProtocol( @transient override lazy val stagingDir: Path = FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, - SQLConf.get.stagingDir, "spark", jobId) + SQLConf.get.stagingDir, FileCommitProtocol.USING_SPARK_COMMIT_METHOD, jobId) override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index c3e9deacd7e80..e4ffdffe46cdd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -134,8 +134,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - val externalTempPath = FileCommitProtocol.externalTempPath( - path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) + val externalTempPath = FileCommitProtocol.externalTempPath(path, hadoopConf, stagingDir, + FileCommitProtocol.USING_HIVE_COMMIT_METHOD, TaskRunner.getTaskRunnerID.toString) createdTempDir = Some(externalTempPath.getParent) externalTempPath } else { From 9392bdb61e57dfae3c385c841b1f74ee2b985baa Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 14 Mar 2022 13:55:04 +0800 Subject: [PATCH 39/42] update --- .../spark/internal/io/FileCommitProtocol.scala | 17 +++++++++-------- .../sql/hive/execution/SaveAsHiveFile.scala | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index cb24e9ec14fb7..a8e4ad551059f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -277,22 +277,22 @@ object FileCommitProtocol extends Logging { path: Path, hadoopConf: Configuration, stagingDir: String, - engineType: String, + commitMethod: String, jobId: String): Path = { - getStagingDir(path, hadoopConf, stagingDir, engineType, jobId) + getStagingDir(path, hadoopConf, stagingDir, commitMethod, jobId) } private def getExternalScratchDir( extURI: URI, hadoopConf: Configuration, stagingDir: String, - engineType: String, + commitMethod: String, jobId: String): Path = { getStagingDir( new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), hadoopConf, stagingDir, - engineType, + commitMethod, jobId) } @@ -300,7 +300,7 @@ object FileCommitProtocol extends Logging { inputPath: Path, hadoopConf: Configuration, stagingDir: String, - engineType: String, + commitMethod: String, jobId: String): Path = { val inputPathName: String = inputPath.toString val fs: FileSystem = inputPath.getFileSystem(hadoopConf) @@ -319,13 +319,13 @@ object FileCommitProtocol extends Logging { logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString + stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString } val dir = fs.makeQualified( - new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) + new Path(stagingPathName + "_" + executionId(commitMethod) + "-" + jobId)) - if (engineType == "SPARK") { + if (commitMethod == USING_SPARK_COMMIT_METHOD) { val stagingFS = dir.getFileSystem(hadoopConf) // SPARK-36579: Current SQLHadoopMapReduceCommitProtocol's dynamic partition overwriting uses // rename operation to move partition's directories. This operation is not supported between @@ -334,6 +334,7 @@ object FileCommitProtocol extends Logging { logDebug(s"The staging dir '$stagingPathName' should be in a same filesystem " + s"with table location if we set `spark.sql.exec.stagingDir` under the table " + "directory.") + stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString } } logDebug(s"Created staging dir = $dir for path = $inputPath") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index e4ffdffe46cdd..5900e03b08b87 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -134,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - val externalTempPath = FileCommitProtocol.externalTempPath(path, hadoopConf, stagingDir, + val externalTempPath = FileCommitProtocol.externalTempPath(path, hadoopConf, stagingDir, FileCommitProtocol.USING_HIVE_COMMIT_METHOD, TaskRunner.getTaskRunnerID.toString) createdTempDir = Some(externalTempPath.getParent) externalTempPath From 1178d1ba15c810fd429bfc4fd9383e3d22a4cf3b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 14 Mar 2022 15:25:42 +0800 Subject: [PATCH 40/42] update --- .../internal/io/FileCommitProtocol.scala | 11 ++++++---- .../io/HadoopMapReduceCommitProtocol.scala | 1 + .../SQLHadoopMapReduceCommitProtocol.scala | 1 + ...la => CommitProtocolStagingDirSuite.scala} | 22 ++++++++++++++----- 4 files changed, 26 insertions(+), 9 deletions(-) rename sql/core/src/test/scala/org/apache/spark/sql/sources/{StagingInsertSuite.scala => CommitProtocolStagingDirSuite.scala} (78%) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index a8e4ad551059f..43cff753a44a2 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -322,11 +322,8 @@ object FileCommitProtocol extends Logging { stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString } - val dir = fs.makeQualified( - new Path(stagingPathName + "_" + executionId(commitMethod) + "-" + jobId)) - if (commitMethod == USING_SPARK_COMMIT_METHOD) { - val stagingFS = dir.getFileSystem(hadoopConf) + val stagingFS = new Path(stagingPathName).getFileSystem(hadoopConf) // SPARK-36579: Current SQLHadoopMapReduceCommitProtocol's dynamic partition overwriting uses // rename operation to move partition's directories. This operation is not supported between // different FileSystems. @@ -337,6 +334,12 @@ object FileCommitProtocol extends Logging { stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString } } + val dir = commitMethod match { + case USING_SPARK_COMMIT_METHOD => + new Path(stagingPathName + "-" + jobId) + case USING_HIVE_COMMIT_METHOD => + fs.makeQualified(new Path(stagingPathName + "_" + executionId(commitMethod) + "-" + jobId)) + } logDebug(s"Created staging dir = $dir for path = $inputPath") dir } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 9da91af77bd33..27d5a48f5961d 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -238,6 +238,7 @@ class HadoopMapReduceCommitProtocol( // a parent that exists, otherwise we may get unexpected result on the rename. fs.mkdirs(finalPartPath.getParent) } + println(s"When commit job ${stagingDir}") val stagingPartPath = new Path(stagingDir, part) if (!fs.rename(stagingPartPath, finalPartPath)) { throw new IOException(s"Failed to rename $stagingPartPath to $finalPartPath when " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 134910dcd75cc..47312404fbffb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -61,6 +61,7 @@ class SQLHadoopMapReduceCommitProtocol( // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path) + println(s"committerOutputPath = ${committerOutputPath}") committer = ctor.newInstance(committerOutputPath, context) } else { // The specified output committer is just an OutputCommitter. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala similarity index 78% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala index bd935d2567269..de9c643998f56 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala @@ -23,22 +23,24 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils -class StagingInsertSuite extends QueryTest with SharedSparkSession { + +abstract class CommitProtocolStagingDirBaseSuite extends QueryTest with SharedSparkSession { import testImplicits._ - val stagingDir = Utils.createTempDir() + def stagingDir: String + def cleanStagingDir(): Unit override def sparkConf: SparkConf = - super.sparkConf.set(SQLConf.EXEC_STAGING_DIR, stagingDir.getAbsolutePath) + super.sparkConf.set(SQLConf.EXEC_STAGING_DIR, stagingDir) override def beforeAll(): Unit = { super.beforeAll() - stagingDir.delete() + cleanStagingDir() } override def afterAll(): Unit = { try { - Utils.deleteRecursively(stagingDir) + cleanStagingDir() } finally { super.afterAll() } @@ -68,3 +70,13 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession { } } } + +class LocalStagingDirSuite extends CommitProtocolStagingDirBaseSuite { + val stagingDirFile = Utils.createTempDir() + override val stagingDir = stagingDirFile.getAbsolutePath + + println(s"stagingDirFile = ${stagingDirFile}") + override def cleanStagingDir(): Unit = { + Utils.deleteRecursively(stagingDirFile) + } +} From 9dba59069a71e405c37a3ef606a4ea88c8a17f18 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 14 Mar 2022 15:26:32 +0800 Subject: [PATCH 41/42] update --- .../apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala | 1 - .../execution/datasources/SQLHadoopMapReduceCommitProtocol.scala | 1 - .../apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala | 1 - 3 files changed, 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 27d5a48f5961d..9da91af77bd33 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -238,7 +238,6 @@ class HadoopMapReduceCommitProtocol( // a parent that exists, otherwise we may get unexpected result on the rename. fs.mkdirs(finalPartPath.getParent) } - println(s"When commit job ${stagingDir}") val stagingPartPath = new Path(stagingDir, part) if (!fs.rename(stagingPartPath, finalPartPath)) { throw new IOException(s"Failed to rename $stagingPartPath to $finalPartPath when " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 47312404fbffb..134910dcd75cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -61,7 +61,6 @@ class SQLHadoopMapReduceCommitProtocol( // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path) - println(s"committerOutputPath = ${committerOutputPath}") committer = ctor.newInstance(committerOutputPath, context) } else { // The specified output committer is just an OutputCommitter. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala index de9c643998f56..6704bf322f457 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala @@ -75,7 +75,6 @@ class LocalStagingDirSuite extends CommitProtocolStagingDirBaseSuite { val stagingDirFile = Utils.createTempDir() override val stagingDir = stagingDirFile.getAbsolutePath - println(s"stagingDirFile = ${stagingDirFile}") override def cleanStagingDir(): Unit = { Utils.deleteRecursively(stagingDirFile) } From b998ec9d7d5b514940b39daa0a32f1d4e8bf9e87 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 14 Mar 2022 15:38:25 +0800 Subject: [PATCH 42/42] Update InsertSuite.scala --- .../test/scala/org/apache/spark/sql/sources/InsertSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index d7c2f44b79506..b553e6ed566b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -1116,6 +1116,6 @@ class RenameFromSparkStagingToFinalDirAlwaysTurnsFalseFilesystem extends RawLoca } private def isSparkStagingDir(path: Path): Boolean = { - path.toString.contains(".spark-staging_") + path.toString.contains(".spark-staging-") } }