From 6331e56f1a75750eebd1c8217b48e4d77773bae0 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 16:14:26 +0800 Subject: [PATCH 01/53] [36562][SQL] Add new NewSQLHadoopMapReduceCommitProtocol resolve conflict --- .../io/HadoopMapReduceCommitProtocol.scala | 2 +- .../datasources/NewFileOutputCommitter.scala | 585 ++++++++++++++++++ .../NewSQLHadoopMapReduceCommitProtocol.scala | 66 ++ .../sql/sources/PartitionedWriteSuite.scala | 51 +- 4 files changed, 702 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index c061d617fce4b..f279d0514da66 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -73,7 +73,7 @@ class HadoopMapReduceCommitProtocol( import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ - @transient private var committer: OutputCommitter = _ + @transient protected var committer: OutputCommitter = _ /** * Checks whether there are files to be committed to a valid output location. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala new file mode 100644 index 0000000000000..1a8760f8c6f6d --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala @@ -0,0 +1,585 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.{FileNotFoundException, IOException} +import java.net.URI +import java.text.SimpleDateFormat +import java.util.{Date, Locale, Random} + +import com.google.common.base.Preconditions +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter} +import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext, TaskAttemptID} +import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter +import org.apache.hadoop.util.{DurationInfo, Progressable} + +import org.apache.spark.internal.Logging + +class NewFileOutputCommitter( + stagingDir: Path, + outputPath: Path, + context: TaskAttemptContext) + extends PathOutputCommitter(outputPath, context) with Logging { + + val PENDING_DIR_NAME = "_temporary" + val SUCCEEDED_FILE_NAME = "_SUCCESS" + val SUCCESSFUL_JOB_OUTPUT_DIR_MARKER = "mapreduce.fileoutputcommitter.marksuccessfuljobs" + val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION = "mapreduce.fileoutputcommitter.algorithm.version" + val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT = 2 + val FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED = "mapreduce.fileoutputcommitter.cleanup.skipped" + val FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT = false + val FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED = + "mapreduce.fileoutputcommitter.cleanup-failures.ignored" + val FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT = false + val FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS = "mapreduce.fileoutputcommitter.failures.attempts" + val FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS_DEFAULT = 1 + val FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED = + "mapreduce.fileoutputcommitter.task.cleanup.enabled" + val FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED_DEFAULT = false + private val workPath: Path = if (getOutputPath != null) { + Preconditions.checkNotNull(getTaskAttemptPath(context, getOutputPath), + "Null task attempt path in %s and output path %s", context, outputPath) + } else { + null + } + private val algorithmVersion = context.getConfiguration.getInt( + FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT) + private val skipCleanup = context.getConfiguration.getBoolean( + FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED, FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT) + private val ignoreCleanupFailures = context.getConfiguration.getBoolean( + FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED, + FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT) + + private class CommittedTaskFilter extends PathFilter { + def accept(path: Path): Boolean = !(PENDING_DIR_NAME == path.getName) + } + + override def getOutputPath: Path = + outputPath.getFileSystem(context.getConfiguration).makeQualified(outputPath) + + def getPendingJobAttemptsPath(): Path = { + getPendingJobAttemptsPath(stagingDir) + } + + def getPendingJobAttemptsPath(out: Path): Path = { + new Path(out, PENDING_DIR_NAME) + } + + private def getAppAttemptId(context: JobContext) = { + context.getConfiguration.getInt("mapreduce.job.application.attempt.id", 0) + } + + def getJobAttemptPath(context: JobContext): Path = { + getJobAttemptPath(context, this.stagingDir) + } + + def getJobAttemptPath(context: JobContext, out: Path): Path = { + getJobAttemptPath(getAppAttemptId(context), out) + } + + protected def getJobAttemptPath(appAttemptId: Int): Path = { + getJobAttemptPath(appAttemptId, this.stagingDir) + } + + private def getJobAttemptPath(appAttemptId: Int, out: Path) = { + new Path(getPendingJobAttemptsPath(out), String.valueOf(appAttemptId)) + } + + private def getPendingTaskAttemptsPath(context: JobContext): Path = { + getPendingTaskAttemptsPath(context, this.stagingDir) + } + + private def getPendingTaskAttemptsPath(context: JobContext, out: Path) = { + new Path(getJobAttemptPath(context, out), "_temporary") + } + + def getTaskAttemptPath(context: TaskAttemptContext): Path = { + new Path(this.getPendingTaskAttemptsPath(context), String.valueOf(context.getTaskAttemptID)) + } + + def getTaskAttemptPath(context: TaskAttemptContext, out: Path): Path = { + new Path(getPendingTaskAttemptsPath(context, out), String.valueOf(context.getTaskAttemptID)) + } + + def getCommittedTaskPath(context: TaskAttemptContext): Path = { + this.getCommittedTaskPath(getAppAttemptId(context), context) + } + + def getCommittedTaskPath(context: TaskAttemptContext, out: Path): Path = { + getCommittedTaskPath(getAppAttemptId(context), context, out) + } + + protected def getCommittedTaskPath(appAttemptId: Int, context: TaskAttemptContext) = { + new Path(this.getJobAttemptPath(appAttemptId), + String.valueOf(context.getTaskAttemptID.getTaskID)) + } + + private def getCommittedTaskPath( + appAttemptId: Int, + context: TaskAttemptContext, + out: Path): Path = { + new Path(getJobAttemptPath(appAttemptId, out), + String.valueOf(context.getTaskAttemptID.getTaskID.toString)) + } + + @throws[IOException] + private def getAllCommittedTaskPaths(context: JobContext) = { + val jobAttemptPath = this.getJobAttemptPath(context) + val fs = jobAttemptPath.getFileSystem(context.getConfiguration) + fs.listStatus(jobAttemptPath, new CommittedTaskFilter) + } + + @throws[IOException] + override def setupJob(context: JobContext): Unit = { + if (hasOutputPath) { + val jobAttemptPath = getJobAttemptPath(context) + val fs = jobAttemptPath.getFileSystem(context.getConfiguration) + if (!fs.mkdirs(jobAttemptPath)) { + logError("Mkdirs failed to create " + jobAttemptPath) + } + } else { + logWarning("Output Path is null in setupJob()") + } + } + + + @throws[IOException] + override def commitJob(context: JobContext): Unit = { + val maxAttemptsOnFailure = if (this.isCommitJobRepeatable(context)) { + context.getConfiguration.getInt( + FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS, FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS_DEFAULT) + } else { + 1 + } + var attempt = 0 + var jobCommitNotFinished = true + + while (jobCommitNotFinished) { + try { + this.commitJobInternal(context) + jobCommitNotFinished = false + } catch { + case var6: Exception => + attempt += 1 + if (attempt >= maxAttemptsOnFailure) throw var6 + logWarning("Exception get thrown in job commit, retry (" + attempt + ") time.", var6) + } + } + } + + @throws[IOException] + protected def commitJobInternal(context: JobContext): Unit = { + if (!this.hasOutputPath) { + logWarning("Output Path is null in commitJob()") + } else { + val finalOutput = this.getOutputPath + val fs = finalOutput.getFileSystem(context.getConfiguration) + if (this.algorithmVersion == 1) { + val committedTaskPaths = this.getAllCommittedTaskPaths(context) + committedTaskPaths.foreach { path => + this.mergePaths(fs, path, finalOutput, context) + } + } + if (this.skipCleanup) { + logInfo("Skip cleanup the _temporary folders under job's output directory in commitJob.") + } else { + try this.cleanupJob(context) + catch { + case var8: IOException => + if (!this.ignoreCleanupFailures) throw var8 + logError("Error in cleanup job, manually cleanup is needed.", var8) + } + } + if (context.getConfiguration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) { + val markerPath = new Path(this.outputPath, "_SUCCESS") + if (this.isCommitJobRepeatable(context)) { + fs.create(markerPath, true).close() + } else { + fs.create(markerPath).close() + } + } + } + } + + @throws[IOException] + private def mergePaths(fs: FileSystem, from: FileStatus, to: Path, context: JobContext): Unit = { + val d: DurationInfo = + new DurationInfo(log, false, "Merging data from %s to %s", Array[AnyRef](from, to)) + var var6: Throwable = null + try { + this.reportProgress(context) + var toStat: FileStatus = null + try { + toStat = fs.getFileStatus(to) + } catch { + case _: FileNotFoundException => + toStat = null + } + if (from.isFile) { + if (toStat != null && !fs.delete(to, true)) { + throw new IOException("Failed to delete " + to) + } + if (!fs.rename(from.getPath, to)) { + throw new IOException("Failed to rename " + from + " to " + to) + } + } else { + if (from.isDirectory) { + if (toStat != null) { + if (!toStat.isDirectory) { + if (!fs.delete(to, true)) { + throw new IOException("Failed to delete " + to) + } + this.renameOrMerge(fs, from, to, context) + } else { + val var8: Array[FileStatus] = fs.listStatus(from.getPath) + val var9: Int = var8.length + for (var10 <- 0 until var9) { + val subFrom: FileStatus = var8(var10) + val subTo: Path = new Path(to, subFrom.getPath.getName) + this.mergePaths(fs, subFrom, subTo, context) + } + } + } else { + this.renameOrMerge(fs, from, to, context) + } + } + } + } catch { + case var22: Throwable => + var6 = var22 + throw var22 + } finally { + if (d != null) { + if (var6 != null) { + try d.close() + catch { + case var20: Throwable => + var6.addSuppressed(var20) + } + } + else { + d.close() + } + } + } + } + + private def reportProgress(context: JobContext): Unit = { + if (context.isInstanceOf[Progressable]) { + (context.asInstanceOf[Progressable]).progress() + } + } + + @throws[IOException] + private def renameOrMerge( + fs: FileSystem, + from: FileStatus, + to: Path, + context: JobContext): Unit = { + if (this.algorithmVersion == 1) { + if (!(fs.rename(from.getPath, to))) { + throw new IOException("Failed to rename " + from + " to " + to) + } + } + else { + fs.mkdirs(to) + val var5: Array[FileStatus] = fs.listStatus(from.getPath) + val var6: Int = var5.length + for (var7 <- 0 until var6) { + val subFrom: FileStatus = var5(var7) + val subTo: Path = new Path(to, subFrom.getPath.getName) + this.mergePaths(fs, subFrom, subTo, context) + } + } + } + + /** @deprecated */ + @deprecated + @throws[IOException] + override def cleanupJob(context: JobContext): Unit = { + if (this.hasOutputPath) { + val pendingJobAttemptsPath: Path = this.getPendingJobAttemptsPath + val fs: FileSystem = pendingJobAttemptsPath.getFileSystem(context.getConfiguration) + try fs.delete(pendingJobAttemptsPath, true) + catch { + case var5: FileNotFoundException => + if (!(this.isCommitJobRepeatable(context))) { + throw var5 + } + } + } else { + logWarning("Output Path is null in cleanupJob()") + } + } + + @throws[IOException] + override def abortJob(context: JobContext, state: JobStatus.State): Unit = { + this.cleanupJob(context) + } + + @throws[IOException] + override def setupTask(context: TaskAttemptContext): Unit = { + } + + @throws[IOException] + override def commitTask(context: TaskAttemptContext): Unit = { + this.commitTask(context, null.asInstanceOf[Path]) + } + + @throws[IOException] + def commitTask(context: TaskAttemptContext, taskAttemptPath: Path): Unit = { + val attemptId: TaskAttemptID = context.getTaskAttemptID + var currentTaskAttemptPath = taskAttemptPath + if (this.hasOutputPath) { + context.progress() + if (currentTaskAttemptPath == null) { + currentTaskAttemptPath = this.getTaskAttemptPath(context) + } + val fs: FileSystem = currentTaskAttemptPath.getFileSystem(context.getConfiguration) + var taskAttemptDirStatus: FileStatus = null + try { + taskAttemptDirStatus = fs.getFileStatus(currentTaskAttemptPath) + } catch { + case _: FileNotFoundException => + taskAttemptDirStatus = null + } + if (taskAttemptDirStatus != null) { + if (this.algorithmVersion == 1) { + val committedTaskPath = this.getCommittedTaskPath(context) + if (fs.exists(committedTaskPath) && !fs.delete(committedTaskPath, true)) { + throw new IOException("Could not delete " + committedTaskPath) + } + if (!fs.rename(currentTaskAttemptPath, committedTaskPath)) { + throw new IOException( + "Could not rename " + currentTaskAttemptPath + " to " + committedTaskPath) + } + logInfo("Saved output of task '" + attemptId + "' to " + committedTaskPath) + } else { + this.mergePaths(fs, taskAttemptDirStatus, this.outputPath, context) + logInfo("Saved output of task '" + attemptId + "' to " + this.outputPath) + if (context.getConfiguration.getBoolean( + "mapreduce.fileoutputcommitter.task.cleanup.enabled", false)) { + logDebug(String.format( + "Deleting the temporary directory of '%s': '%s'", attemptId, currentTaskAttemptPath)) + if (!fs.delete(currentTaskAttemptPath, true)) { + logWarning("Could not delete " + currentTaskAttemptPath) + } + } + } + } else { + logWarning("No Output found for " + attemptId) + } + } else { + logWarning("Output Path is null in commitTask()") + } + } + + @throws[IOException] + override def abortTask(context: TaskAttemptContext): Unit = { + this.abortTask(context, null.asInstanceOf[Path]) + } + + @throws[IOException] + def abortTask(context: TaskAttemptContext, taskAttemptPath: Path): Unit = { + var currentTaskAttemptPath = taskAttemptPath + if (this.hasOutputPath) { + context.progress() + if (currentTaskAttemptPath == null) { + currentTaskAttemptPath = this.getTaskAttemptPath(context) + } + val fs: FileSystem = currentTaskAttemptPath.getFileSystem(context.getConfiguration) + if (!fs.delete(currentTaskAttemptPath, true)) { + logWarning("Could not delete " + currentTaskAttemptPath) + } + } else { + logWarning("Output Path is null in abortTask()") + } + } + + @throws[IOException] + override def needsTaskCommit(context: TaskAttemptContext): Boolean = { + this.needsTaskCommit(context, null.asInstanceOf[Path]) + } + + @throws[IOException] + def needsTaskCommit(context: TaskAttemptContext, taskAttemptPath: Path): Boolean = { + var currentTaskAttemptPath = taskAttemptPath + if (this.hasOutputPath) { + if (currentTaskAttemptPath == null) { + currentTaskAttemptPath = this.getTaskAttemptPath(context) + } + val fs: FileSystem = currentTaskAttemptPath.getFileSystem(context.getConfiguration) + fs.exists(currentTaskAttemptPath) + } else { + false + } + } + + @deprecated override def isRecoverySupported: Boolean = { + true + } + + @throws[IOException] + override def isCommitJobRepeatable(context: JobContext): Boolean = { + this.algorithmVersion == 2 + } + + @throws[IOException] + override def recoverTask(context: TaskAttemptContext): Unit = { + if (this.hasOutputPath) { + context.progress() + val attemptId: TaskAttemptID = context.getTaskAttemptID + val previousAttempt: Int = getAppAttemptId(context) - 1 + if (previousAttempt < 0) { + throw new IOException("Cannot recover task output for first attempt...") + } + val previousCommittedTaskPath: Path = this.getCommittedTaskPath(previousAttempt, context) + val fs: FileSystem = previousCommittedTaskPath.getFileSystem(context.getConfiguration) + if (log.isDebugEnabled) { + logDebug("Trying to recover task from " + previousCommittedTaskPath) + } + if (this.algorithmVersion == 1) { + if (fs.exists(previousCommittedTaskPath)) { + val committedTaskPath: Path = this.getCommittedTaskPath(context) + if (!fs.delete(committedTaskPath, true) && fs.exists(committedTaskPath)) { + throw new IOException("Could not delete " + committedTaskPath) + } + val committedParent: Path = committedTaskPath.getParent + fs.mkdirs(committedParent) + if (!fs.rename(previousCommittedTaskPath, committedTaskPath)) { + throw new IOException( + "Could not rename " + previousCommittedTaskPath + " to " + committedTaskPath) + } + } + else { + logWarning(attemptId + " had no output to recover.") + } + } + else { + try { + val from: FileStatus = fs.getFileStatus(previousCommittedTaskPath) + logInfo("Recovering task for upgrading scenario, moving files from " + + previousCommittedTaskPath + " to " + this.outputPath) + this.mergePaths(fs, from, this.outputPath, context) + } catch { + case var8: FileNotFoundException => + + } + logInfo("Done recovering task " + attemptId) + } + } + else { + logWarning("Output Path is null in recoverTask()") + } + } + + override def getWorkPath: Path = workPath + + override def toString: String = { + val sb: StringBuilder = new StringBuilder("FileOutputCommitter{") + sb.append(super.toString).append("; ") + sb.append("outputPath=").append(this.outputPath) + sb.append(", workPath=").append(this.workPath) + sb.append(", algorithmVersion=").append(this.algorithmVersion) + sb.append(", skipCleanup=").append(this.skipCleanup) + sb.append(", ignoreCleanupFailures=").append(this.ignoreCleanupFailures) + sb.append('}') + sb.toString + } +} + +object NewFileOutputCommitter extends Logging { + def newVersionExternalTempPath( + jobId: String, + path: Path, + hadoopConf: Configuration, + stagingDir: String): Path = { + val extURI = path.toUri + if (extURI.getScheme == "viewfs") { + getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, jobId) + } else { + new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, jobId), "-ext-10000") + } + } + + + private def getExtTmpPathRelTo( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + jobId: String): Path = { + new Path(getStagingDir(path, hadoopConf, stagingDir, jobId), "-ext-10000") // Hive uses 10000 + } + + private def getExternalScratchDir( + extURI: URI, + hadoopConf: Configuration, + stagingDir: String, + jobId: String): Path = { + getStagingDir( + new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), + hadoopConf, + stagingDir, + jobId: String) + } + + private def getStagingDir( + inputPath: Path, + hadoopConf: Configuration, + stagingDir: String, + jobId: String): Path = { + val inputPathName: String = inputPath.toString + val fs: FileSystem = inputPath.getFileSystem(hadoopConf) + var stagingPathName: String = + if (inputPathName.indexOf(stagingDir) == -1) { + new Path(inputPathName, stagingDir).toString + } else { + inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) + } + + // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the + // staging directory needs to avoid being deleted when users set hive.exec.stagingdir + // under the table directory. + if (isSubDir(new Path(stagingPathName), inputPath, fs) && + !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { + logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + + "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + + "directory.") + stagingPathName = new Path(inputPathName, ".hive-staging").toString + } + + val dir = fs.makeQualified( + new Path(stagingPathName + "_" + executionId + "-" + jobId)) + logDebug("Created staging dir = " + dir + " for path = " + inputPath) + dir + } + + // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir(). + private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { + val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR + val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR + path1.startsWith(path2) + } + + private def executionId: String = { + val rand: Random = new Random + val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) + "spark_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala new file mode 100644 index 0000000000000..9416bcd728d73 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.internal.SQLConf + +/** + * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual + * Hadoop output committer using an option specified in SQLConf. + */ +class NewSQLHadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + with Serializable with Logging { + + private val committerStagingDir = NewFileOutputCommitter.newVersionExternalTempPath( + jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf, + SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir")) + + def currentCommitter: NewFileOutputCommitter = + committer.asInstanceOf[NewFileOutputCommitter] + + override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { + // The specified output committer is a FileOutputCommitter. + // So, we will use the FileOutputCommitter-specified constructor. + val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path) + val committer = + new NewFileOutputCommitter(committerStagingDir, committerOutputPath, context) + logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") + committer + } + + override def newTaskTempFile( + taskContext: TaskAttemptContext, + dir: Option[String], + ext: String): String = { + val filename = getFilename(taskContext, ext) + dir.map { d => + new Path(new Path(currentCommitter.getTaskAttemptPath(taskContext), d), filename).toString + }.getOrElse { + new Path(currentCommitter.getTaskAttemptPath(taskContext), filename).toString + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index b9266429f81a5..862e14e3996af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol +import org.apache.spark.sql.execution.datasources.{NewSQLHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -189,6 +189,55 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-36571") { + withTempDir { stagingDir => + withSQLConf( + "spark.sql.source.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir", + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2", + SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> + classOf[NewSQLHadoopMapReduceCommitProtocol].getName) { + withTempDir { d => + withTable("t") { + sql( + s""" + | create table t(c1 int, p1 int) using ORC + | location '${d.getAbsolutePath}' + """.stripMargin) + + val df = + Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) + .toDF("c1", "p1").repartition(1) + df.write + .mode("overwrite") + .format("orc") + .saveAsTable("t") + checkAnswer(sql("select * from t"), df) + } + + withTable("t") { + sql( + s""" + | create table t(c1 int, p1 int) using ORC PARTITIONED BY(p1) + | location '${d.getAbsolutePath}' + """.stripMargin) + + val df = + Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) + .toDF("c1", "p1").repartition(1) + + df.createOrReplaceTempView("view1") + sql("insert overwrite t partition(p1=2) select c1 from view1 ") + checkAnswer(sql("select * from t"), df) + } + } + } + } + } } /** From 4d80c24253fcc8257b97bc44ff6517a003efa816 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 16:21:29 +0800 Subject: [PATCH 02/53] Update PartitionedWriteSuite.scala --- .../org/apache/spark/sql/sources/PartitionedWriteSuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 862e14e3996af..af94c1a01bd9d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -216,7 +216,8 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { .saveAsTable("t") checkAnswer(sql("select * from t"), df) } - + } + withTempDir { d => withTable("t") { sql( s""" @@ -232,7 +233,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { df.createOrReplaceTempView("view1") sql("insert overwrite t partition(p1=2) select c1 from view1 ") - checkAnswer(sql("select * from t"), df) + checkAnswer(sql("select * from t where p1=2"), df) } } } From b692e0f2681c3e1e184a46e6dd3f188231488a52 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 16:28:51 +0800 Subject: [PATCH 03/53] Update PartitionedWriteSuite.scala --- .../sql/sources/PartitionedWriteSuite.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index af94c1a01bd9d..186b0e355a700 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -234,6 +234,23 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { df.createOrReplaceTempView("view1") sql("insert overwrite t partition(p1=2) select c1 from view1 ") checkAnswer(sql("select * from t where p1=2"), df) + sql("desc formatted t partition(p1=2)").show(100, 2000) + } + } + withTempDir { d => + withTable("t") { + sql( + s""" + | create table t(c1 int, p1 int) using parquet partitioned by (p1) + | location '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("select * from t"), df) } } } From e6dbae1b9f59b9fe3f29554522fb84d8f20afd84 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 16:45:49 +0800 Subject: [PATCH 04/53] Update NewSQLHadoopMapReduceCommitProtocol.scala --- .../datasources/NewSQLHadoopMapReduceCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala index 9416bcd728d73..1e71c21ece04c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala @@ -45,7 +45,7 @@ class NewSQLHadoopMapReduceCommitProtocol( override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. - val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path) + val committerOutputPath = new Path(path) val committer = new NewFileOutputCommitter(committerStagingDir, committerOutputPath, context) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") From 6419573557fb65501af60bd4c8e038afbfd4d549 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 17:34:50 +0800 Subject: [PATCH 05/53] Update NewSQLHadoopMapReduceCommitProtocol.scala --- .../datasources/NewSQLHadoopMapReduceCommitProtocol.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala index 1e71c21ece04c..45159e7b85c09 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} + import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.SparkSession From 44d1d8f8bae102ce647bf2ecb885d33c218b0e04 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 18:00:52 +0800 Subject: [PATCH 06/53] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index 2ca50878485c0..afeb116649f85 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -60,7 +60,7 @@ class PathOutputCommitProtocol( } /** The committer created. */ - @transient private var committer: PathOutputCommitter = _ + @transient protected var committer: PathOutputCommitter = _ require(dest != null, "Null destination specified") From e2c5318ecd5fc9d17b86bbaaf230a2dbe858bdaf Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 18:28:07 +0800 Subject: [PATCH 07/53] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index afeb116649f85..0d181c42a1d8e 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -60,7 +60,7 @@ class PathOutputCommitProtocol( } /** The committer created. */ - @transient protected var committer: PathOutputCommitter = _ + @transient override protected var committer: PathOutputCommitter = _ require(dest != null, "Null destination specified") From 9106d18615957f23ef4d16f6af6f9cf33598dcb3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 24 Aug 2021 19:23:34 +0800 Subject: [PATCH 08/53] update --- .../spark/internal/io/HadoopMapReduceCommitProtocol.scala | 4 +++- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 2 +- .../datasources/NewSQLHadoopMapReduceCommitProtocol.scala | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index f279d0514da66..a7baf74e51818 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -73,7 +73,7 @@ class HadoopMapReduceCommitProtocol( import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ - @transient protected var committer: OutputCommitter = _ + @transient private var committer: OutputCommitter = _ /** * Checks whether there are files to be committed to a valid output location. @@ -106,6 +106,8 @@ class HadoopMapReduceCommitProtocol( */ protected def stagingDir = getStagingDir(path, jobId) + protected def getCommitter = committer + protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() // If OutputFormat is Configurable, we should set conf to it. diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index 0d181c42a1d8e..2ca50878485c0 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -60,7 +60,7 @@ class PathOutputCommitProtocol( } /** The committer created. */ - @transient override protected var committer: PathOutputCommitter = _ + @transient private var committer: PathOutputCommitter = _ require(dest != null, "Null destination specified") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala index 45159e7b85c09..acb4415604488 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala @@ -41,7 +41,7 @@ class NewSQLHadoopMapReduceCommitProtocol( SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir")) def currentCommitter: NewFileOutputCommitter = - committer.asInstanceOf[NewFileOutputCommitter] + getCommitter.asInstanceOf[NewFileOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { // The specified output committer is a FileOutputCommitter. From 2031f5bea5c83800b28190bee40895da7c1d25a0 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 11:12:01 +0800 Subject: [PATCH 09/53] [SPARK-36579][SQL] Make spark source stagingDir can use user defined --- .../apache/spark/sql/internal/SQLConf.scala | 12 ++ .../SQLHadoopMapReduceCommitProtocol.scala | 7 ++ .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ++++++++++++++++++ .../sql/sources/PartitionedWriteSuite.scala | 27 ++++ .../sql/hive/execution/SaveAsHiveFile.scala | 94 +------------- 5 files changed, 166 insertions(+), 89 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index c6d9558d53eda..d8dbc36a4f9cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1216,6 +1216,16 @@ object SQLConf { .booleanConf .createWithDefault(true) + val FILE_COMMIT_STAGING_DIR = + buildConf("spark.sql.source.stagingDir") + .doc("The staging directory of Spark job. Spark uses it to deal with files with " + + "absolute output path, or writing data into partitioned directory when " + + "dynamic partition overwrite mode.") + .version("3.3.0") + .internal() + .stringConf + .createWithDefault(".spark-staging") + // The output committer class used by data sources. The specified class needs to be a // subclass of org.apache.hadoop.mapreduce.OutputCommitter. val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass") @@ -3824,6 +3834,8 @@ class SQLConf extends Serializable with Logging { def partitionColumnTypeInferenceEnabled: Boolean = getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE) + def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR) + def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS) def parallelPartitionDiscoveryThreshold: Int = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 144be2316f091..37a401d842abd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -23,7 +23,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.util.SQLFileCommitProtocolUtils /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -36,6 +38,11 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { + override val stagingDir: Path = + SQLFileCommitProtocolUtils.newVersionExternalTempPath( + new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(), + SQLConf.get.fileCommitStagingDir, "spark", jobId) + override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala new file mode 100644 index 0000000000000..1b2bfb42f0e35 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.spark.sql.util + +import java.net.URI +import java.text.SimpleDateFormat +import java.util.{Date, Locale, Random} + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.spark.internal.Logging + +object SQLFileCommitProtocolUtils extends Logging { + + def newVersionExternalTempPath( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val extURI = path.toUri + if (extURI.getScheme == "viewfs") { + getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) + } else { + new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") + } + } + + + private def getExtTmpPathRelTo( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") // Hive uses 10000 + } + + private def getExternalScratchDir( + extURI: URI, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + getStagingDir( + new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), + hadoopConf, + stagingDir, + engineType, + jobId) + } + + private def getStagingDir( + inputPath: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val inputPathName: String = inputPath.toString + val fs: FileSystem = inputPath.getFileSystem(hadoopConf) + var stagingPathName: String = + if (inputPathName.indexOf(stagingDir) == -1) { + new Path(inputPathName, stagingDir).toString + } else { + inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) + } + + // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the + // staging directory needs to avoid being deleted when users set hive.exec.stagingdir + // under the table directory. + if (isSubDir(new Path(stagingPathName), inputPath, fs) && + !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { + logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + + "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + + "directory.") + stagingPathName = new Path(inputPathName, ".hive-staging").toString + } + + val dir = fs.makeQualified( + new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) + logDebug("Created staging dir = " + dir + " for path = " + inputPath) + dir + } + + private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { + val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR + val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR + path1.startsWith(path2) + } + + def executionId(engineType: String): String = { + val rand: Random = new Random + val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) + s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index b9266429f81a5..4751767bfd239 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -189,6 +189,33 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { + withTempDir { stagingDir => + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString, + SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) { + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) + } + } + } + } + } } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index ec189344f4fa5..581c97ebcbfdd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.hive.execution import java.io.IOException import java.net.URI -import java.text.SimpleDateFormat -import java.util.{Date, Locale, Random} +import java.util.Locale import scala.util.control.NonFatal @@ -40,6 +39,7 @@ import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.HiveVersion +import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._ // Base trait from which all hive insert statement physical execution extends. private[hive] trait SaveAsHiveFile extends DataWritingCommand { @@ -129,7 +129,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - newVersionExternalTempPath(path, hadoopConf, stagingDir) + newVersionExternalTempPath( + path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) } else { throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion) } @@ -159,7 +160,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { hadoopConf: Configuration, scratchDir: String): Path = { val extURI: URI = path.toUri - val scratchPath = new Path(scratchDir, executionId) + val scratchPath = new Path(scratchDir, executionId("hive")) var dirPath = new Path( extURI.getScheme, extURI.getAuthority, @@ -180,90 +181,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { } dirPath } - - // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2 - private def newVersionExternalTempPath( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val extURI: URI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000") - } - } - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir) - } - - private[hive] def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir: Path = - fs.makeQualified( - new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - try { - if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) { - throw new IllegalStateException("Cannot create staging directory '" + dir.toString + "'") - } - createdTempDir = Some(dir) - fs.deleteOnExit(dir) - } catch { - case e: IOException => - throw QueryExecutionErrors.cannotCreateStagingDirError( - s"'${dir.toString}': ${e.getMessage}", e) - } - dir - } - - // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir(). - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - private def executionId: String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } } From c29f55ed00c7b50aa86a7d86b499739d7d377c96 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 14:27:08 +0800 Subject: [PATCH 10/53] Update SQLHadoopMapReduceCommitProtocol.scala --- .../datasources/SQLHadoopMapReduceCommitProtocol.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 37a401d842abd..a6f86256de2c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.util.SQLFileCommitProtocolUtils @@ -40,7 +40,7 @@ class SQLHadoopMapReduceCommitProtocol( override val stagingDir: Path = SQLFileCommitProtocolUtils.newVersionExternalTempPath( - new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(), + new Path(path), SparkContext.getActive.get.hadoopConfiguration, SQLConf.get.fileCommitStagingDir, "spark", jobId) override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { From 2604c9f2c95f50e90f77c300442344816baea3d3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 15:07:15 +0800 Subject: [PATCH 11/53] Update --- .../spark/internal/io/FileCommitProtocol.scala | 5 +++++ .../internal/io/HadoopMapReduceCommitProtocol.scala | 8 ++++++++ .../InsertIntoHadoopFsRelationCommand.scala | 13 ++----------- .../streaming/ManifestFileCommitProtocol.scala | 2 ++ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 5cd7397ea358f..b968a51d53d34 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -50,6 +50,11 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ + /** + * The output path of this committer. + */ + def outputPath: Path + /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index c061d617fce4b..2434ffa91d7a3 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -160,6 +160,14 @@ class HadoopMapReduceCommitProtocol( f"part-$split%05d-$jobId$ext" } + override def outputPath: Path = { + if (dynamicPartitionOverwrite) { + stagingDir + } else { + new Path(path) + } + } + override def setupJob(jobContext: JobContext): Unit = { // Setup IDs val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 267b360b474ca..7edb78471e0ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand( val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, - outputPath = outputPath.toString, + outputPath = qualifiedOutputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { @@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand( } } - // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files - // will be renamed from staging path to final output path during commit job - val committerOutputPath = if (dynamicPartitionOverwrite) { - FileCommitProtocol.getStagingDir(outputPath.toString, jobId) - .makeQualified(fs.getUri, fs.getWorkingDirectory) - } else { - qualifiedOutputPath - } - val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committerOutputPath.toString, customPartitionLocations, outputColumns), + committer.outputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 46ce33687890d..30709266f1806 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } + override def outputPath: Path = new Path(path) + override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] From 71f6b17c91b92a85913f816085290a0212e46939 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 25 Aug 2021 19:27:56 +0800 Subject: [PATCH 12/53] fix ut --- .../sql/util/SQLFileCommitProtocolUtils.scala | 2 +- .../spark/sql/sources/InsertSuite.scala | 2 +- .../sql/hive/execution/SaveAsHiveFile.scala | 4 +++- .../apache/spark/sql/hive/InsertSuite.scala | 21 ++++++++++++------- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala index 1b2bfb42f0e35..0d5455a411c2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala @@ -69,7 +69,7 @@ object SQLFileCommitProtocolUtils extends Logging { jobId) } - private def getStagingDir( + def getStagingDir( inputPath: Path, hadoopConf: Configuration, stagingDir: String, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index d3c21032d0b32..ddcbfe44fbef8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -1073,6 +1073,6 @@ class RenameFromSparkStagingToFinalDirAlwaysTurnsFalseFilesystem extends RawLoca } private def isSparkStagingDir(path: Path): Boolean = { - path.toString.contains(".spark-staging-") + path.toString.contains(".spark-staging_") } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 581c97ebcbfdd..12a9ed3d4d311 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -129,8 +129,10 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - newVersionExternalTempPath( + val externalTempPath = newVersionExternalTempPath( path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) + createdTempDir = Some(externalTempPath.getParent) + externalTempPath } else { throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index b715f484fa02a..307d30d88296d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -21,16 +21,17 @@ import java.io.File import com.google.common.io.Files import org.apache.hadoop.fs.Path +import org.apache.hadoop.hive.ql.exec.TaskRunner import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.SparkException import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.hive.execution.InsertIntoHiveTable import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.SQLFileCommitProtocolUtils import org.apache.spark.util.Utils case class TestData(key: Int, value: String) @@ -556,25 +557,29 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val conf = spark.sessionState.newHadoopConf() val inputPath = new Path("/tmp/b/c") var stagingDir = "tmp/b" - val saveHiveFile = InsertIntoHiveTable(null, Map.empty, null, false, false, null) - val getStagingDir = PrivateMethod[Path](Symbol("getStagingDir")) - var path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + val id = TaskRunner.getTaskRunnerID + var path = SQLFileCommitProtocolUtils. + getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b_hive_") != -1) stagingDir = "tmp/b/c" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = "d/e" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = ".d/e" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1) stagingDir = "/tmp/c/" - path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) + path = SQLFileCommitProtocolUtils.getStagingDir( + inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/c_hive_") != -1) } From 30113d28dc4a0a619e4bd732b360c84542eb25b3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 13:38:01 +0800 Subject: [PATCH 13/53] Update SaveAsHiveFile.scala --- .../sql/hive/execution/SaveAsHiveFile.scala | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 298fadcb01fac..e669f9cfcab2f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -189,90 +189,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { } dirPath } - - // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2 - private def newVersionExternalTempPath( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val extURI: URI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path, hadoopConf, stagingDir) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000") - } - } - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir) - } - - private[hive] def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir: Path = - fs.makeQualified( - new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - try { - if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) { - throw new IllegalStateException("Cannot create staging directory '" + dir.toString + "'") - } - createdTempDir = Some(dir) - fs.deleteOnExit(dir) - } catch { - case e: IOException => - throw QueryExecutionErrors.cannotCreateStagingDirError( - s"'${dir.toString}': ${e.getMessage}", e) - } - dir - } - - // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir(). - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - private def executionId: String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } } From 6f405dc89d507d36da03d6a97d9aa268926f7bfc Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 14:15:46 +0800 Subject: [PATCH 14/53] update --- .../spark/internal/config/package.scala | 10 ++ .../internal/io/FileCommitProtocol.scala | 103 ++++++++++++++-- .../io/HadoopMapReduceCommitProtocol.scala | 11 +- .../apache/spark/sql/internal/SQLConf.scala | 12 -- .../InsertIntoHadoopFsRelationCommand.scala | 13 +- .../SQLHadoopMapReduceCommitProtocol.scala | 7 -- .../ManifestFileCommitProtocol.scala | 2 - .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ------------------ .../sql/sources/PartitionedWriteSuite.scala | 3 +- .../sql/hive/execution/SaveAsHiveFile.scala | 5 +- .../apache/spark/sql/hive/InsertSuite.scala | 12 +- 11 files changed, 129 insertions(+), 164 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 4b7ba9b19f4de..8e24105d7742b 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -2258,4 +2258,14 @@ package object config { .version("3.2.0") .stringConf .createOptional + + val EXEC_STAGING_DIR = + ConfigBuilder("spark.exec.stagingDir") + .doc("The staging directory of Spark job. Spark uses it to deal with files with " + + "absolute output path, or writing data into partitioned directory when " + + "dynamic partition overwrite mode.") + .version("3.3.0") + .internal() + .stringConf + .createWithDefault(".spark-staging") } diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index b968a51d53d34..958a9fa8aa034 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -17,11 +17,18 @@ package org.apache.spark.internal.io +import java.net.URI +import java.text.SimpleDateFormat +import java.util.{Date, Locale, Random} + +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ +import org.apache.spark.SparkContext import org.apache.spark.annotation.Unstable import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.util.Utils @@ -50,11 +57,6 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ - /** - * The output path of this committer. - */ - def outputPath: Path - /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ @@ -232,8 +234,95 @@ object FileCommitProtocol extends Logging { } } - def getStagingDir(path: String, jobId: String): Path = { - new Path(path, ".spark-staging-" + jobId) + def getStagingDir(path: String, jobId: String, conf: Configuration): Path = { + newVersionExternalTempPath( + new Path(path), conf, + SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId) + } + + + def newVersionExternalTempPath( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val extURI = path.toUri + if (extURI.getScheme == "viewfs") { + getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) + } else { + new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") + } + } + + + private def getExtTmpPathRelTo( + path: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), + "-ext-10000") // Hive uses 10000 + } + + private def getExternalScratchDir( + extURI: URI, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + getStagingDir( + new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), + hadoopConf, + stagingDir, + engineType, + jobId) + } + + def getStagingDir( + inputPath: Path, + hadoopConf: Configuration, + stagingDir: String, + engineType: String, + jobId: String): Path = { + val inputPathName: String = inputPath.toString + val fs: FileSystem = inputPath.getFileSystem(hadoopConf) + var stagingPathName: String = + if (inputPathName.indexOf(stagingDir) == -1) { + new Path(inputPathName, stagingDir).toString + } else { + inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) + } + + // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the + // staging directory needs to avoid being deleted when users set hive.exec.stagingdir + // under the table directory. + if (isSubDir(new Path(stagingPathName), inputPath, fs) && + !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { + logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + + "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + + "directory.") + stagingPathName = new Path(inputPathName, ".hive-staging").toString + } + + val dir = fs.makeQualified( + new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) + logDebug("Created staging dir = " + dir + " for path = " + inputPath) + dir + } + + private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { + val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR + val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR + path1.startsWith(path2) + } + + def executionId(engineType: String): String = { + val rand: Random = new Random + val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) + s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index e1c0367e46b98..9796d805be35f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -104,7 +104,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected def stagingDir = getStagingDir(path, jobId) + protected var stagingDir: Path = _ protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() @@ -113,6 +113,7 @@ class HadoopMapReduceCommitProtocol( case c: Configurable => c.setConf(context.getConfiguration) case _ => () } + stagingDir = getStagingDir(path, jobId, context.getConfiguration) format.getOutputCommitter(context) } @@ -170,14 +171,6 @@ class HadoopMapReduceCommitProtocol( f"${spec.prefix}part-$split%05d-$jobId${spec.suffix}" } - override def outputPath: Path = { - if (dynamicPartitionOverwrite) { - stagingDir - } else { - new Path(path) - } - } - override def setupJob(jobContext: JobContext): Unit = { // Setup IDs val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0defec5becd7e..98aad1c9c83f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1219,16 +1219,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val FILE_COMMIT_STAGING_DIR = - buildConf("spark.sql.source.stagingDir") - .doc("The staging directory of Spark job. Spark uses it to deal with files with " + - "absolute output path, or writing data into partitioned directory when " + - "dynamic partition overwrite mode.") - .version("3.3.0") - .internal() - .stringConf - .createWithDefault(".spark-staging") - // The output committer class used by data sources. The specified class needs to be a // subclass of org.apache.hadoop.mapreduce.OutputCommitter. val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass") @@ -3842,8 +3832,6 @@ class SQLConf extends Serializable with Logging { def partitionColumnTypeInferenceEnabled: Boolean = getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE) - def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR) - def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS) def parallelPartitionDiscoveryThreshold: Int = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 7edb78471e0ff..f1a0e0dedbddd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand( val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, - outputPath = qualifiedOutputPath.toString, + outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { @@ -162,6 +162,15 @@ case class InsertIntoHadoopFsRelationCommand( } } + // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files + // will be renamed from staging path to final output path during commit job + val committerOutputPath = if (dynamicPartitionOverwrite) { + FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf) + .makeQualified(fs.getUri, fs.getWorkingDirectory) + } else { + qualifiedOutputPath + } + val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -169,7 +178,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committer.outputPath.toString, customPartitionLocations, outputColumns), + committerOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index a6f86256de2c3..144be2316f091 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,11 +21,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter -import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.util.SQLFileCommitProtocolUtils /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -38,11 +36,6 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - override val stagingDir: Path = - SQLFileCommitProtocolUtils.newVersionExternalTempPath( - new Path(path), SparkContext.getActive.get.hadoopConfiguration, - SQLConf.get.fileCommitStagingDir, "spark", jobId) - override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 30709266f1806..46ce33687890d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,8 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } - override def outputPath: Path = new Path(path) - override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala deleted file mode 100644 index 0d5455a411c2d..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.spark.sql.util - -import java.net.URI -import java.text.SimpleDateFormat -import java.util.{Date, Locale, Random} - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} - -import org.apache.spark.internal.Logging - -object SQLFileCommitProtocolUtils extends Logging { - - def newVersionExternalTempPath( - path: Path, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - val extURI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId), - "-ext-10000") - } - } - - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), - "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir, - engineType, - jobId) - } - - def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String, - engineType: String, - jobId: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir = fs.makeQualified( - new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - dir - } - - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - def executionId(engineType: String): String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 4751767bfd239..86894945591da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.TestUtils import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -194,7 +195,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { withTempDir { stagingDir => withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> SQLConf.PartitionOverwriteMode.DYNAMIC.toString, - SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) { + EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { withTempDir { d => withTable("t") { sql( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index e669f9cfcab2f..8c6c10fee7935 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -40,7 +40,6 @@ import org.apache.spark.sql.execution.datasources.{BucketingUtils, FileFormatWri import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.HiveVersion -import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._ // Base trait from which all hive insert statement physical execution extends. private[hive] trait SaveAsHiveFile extends DataWritingCommand { @@ -135,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - val externalTempPath = newVersionExternalTempPath( + val externalTempPath = FileCommitProtocol.newVersionExternalTempPath( path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) createdTempDir = Some(externalTempPath.getParent) externalTempPath @@ -168,7 +167,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { hadoopConf: Configuration, scratchDir: String): Path = { val extURI: URI = path.toUri - val scratchPath = new Path(scratchDir, executionId("hive")) + val scratchPath = new Path(scratchDir, FileCommitProtocol.executionId("hive")) var dirPath = new Path( extURI.getScheme, extURI.getAuthority, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index f0355eaf18b68..1dfeedcd9a18b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -26,13 +26,13 @@ import org.apache.hadoop.hive.ql.exec.TaskRunner import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.SparkException +import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.SQLFileCommitProtocolUtils import org.apache.spark.util.Utils case class TestData(key: Int, value: String) @@ -543,27 +543,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val inputPath = new Path("/tmp/b/c") var stagingDir = "tmp/b" val id = TaskRunner.getTaskRunnerID - var path = SQLFileCommitProtocolUtils. + var path = FileCommitProtocol. getStagingDir(inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b_hive_") != -1) stagingDir = "tmp/b/c" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = "d/e" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1) stagingDir = ".d/e" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1) stagingDir = "/tmp/c/" - path = SQLFileCommitProtocolUtils.getStagingDir( + path = FileCommitProtocol.getStagingDir( inputPath, conf, stagingDir, "hive", id.toString) assert(path.toString.indexOf("/tmp/c_hive_") != -1) } From 361263b9b11e5668286268fc306b399b8760c48a Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 14:43:35 +0800 Subject: [PATCH 15/53] update --- .../spark/internal/io/FileCommitProtocol.scala | 9 ++++++--- .../internal/io/HadoopMapReduceCommitProtocol.scala | 13 +++++++++++-- .../InsertIntoHadoopFsRelationCommand.scala | 11 +---------- .../streaming/ManifestFileCommitProtocol.scala | 2 ++ 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 958a9fa8aa034..73b7fd62a25b6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ -import org.apache.spark.SparkContext import org.apache.spark.annotation.Unstable import org.apache.spark.internal.Logging import org.apache.spark.internal.config.EXEC_STAGING_DIR @@ -57,6 +56,11 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ + /** + * The output path of this committer. + */ + def outputPath: Path + /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ @@ -236,8 +240,7 @@ object FileCommitProtocol extends Logging { def getStagingDir(path: String, jobId: String, conf: Configuration): Path = { newVersionExternalTempPath( - new Path(path), conf, - SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId) + new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 9796d805be35f..86d087cdb5cc2 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.mapred.SparkHadoopMapRedUtil @@ -104,7 +105,8 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected var stagingDir: Path = _ + protected var stagingDir: Path = + getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() @@ -113,10 +115,17 @@ class HadoopMapReduceCommitProtocol( case c: Configurable => c.setConf(context.getConfiguration) case _ => () } - stagingDir = getStagingDir(path, jobId, context.getConfiguration) format.getOutputCommitter(context) } + override def outputPath: Path = { + if (dynamicPartitionOverwrite) { + stagingDir + } else { + new Path(path) + } + } + override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { newTaskTempFile(taskContext, dir, FileNameSpec("", ext)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index f1a0e0dedbddd..382c1e3c16384 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand( } } - // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files - // will be renamed from staging path to final output path during commit job - val committerOutputPath = if (dynamicPartitionOverwrite) { - FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf) - .makeQualified(fs.getUri, fs.getWorkingDirectory) - } else { - qualifiedOutputPath - } - val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committerOutputPath.toString, customPartitionLocations, outputColumns), + committer.outputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 46ce33687890d..30709266f1806 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } + override def outputPath: Path = new Path(path) + override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] From 9ee6ee545c1c6490afaec2febbaf68cb3888586e Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 14:55:26 +0800 Subject: [PATCH 16/53] update --- .../sql/sources/PartitionedWriteSuite.scala | 28 ------- .../sql/sources/StagingInsertSuite.scala | 74 +++++++++++++++++++ 2 files changed, 74 insertions(+), 28 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 86894945591da..b9266429f81a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -25,7 +25,6 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.TestUtils import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -190,33 +189,6 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } - - test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { - withTempDir { stagingDir => - withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> - SQLConf.PartitionOverwriteMode.DYNAMIC.toString, - EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { - withTempDir { d => - withTable("t") { - sql( - s""" - | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) - | LOCATION '${d.getAbsolutePath}' - """.stripMargin) - - val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") - df.write - .partitionBy("p1") - .mode("overwrite") - .saveAsTable("t") - checkAnswer(sql("SELECT * FROM t"), df) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) - } - } - } - } - } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala new file mode 100644 index 0000000000000..72c6534dc8751 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources + +import org.apache.spark.SparkConf +import org.apache.spark.internal.config.EXEC_STAGING_DIR +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.Utils + +class StagingInsertSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + val stagingDir = Utils.createTempDir() + + override def sparkConf: SparkConf = + super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath) + + override def beforeAll(): Unit = { + super.beforeAll() + stagingDir.delete() + } + + override def afterAll(): Unit = { + try { + Utils.deleteRecursively(stagingDir) + } finally { + super.afterAll() + } + } + + test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { + withTempDir { stagingDir => + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString, + EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) + } + } + } + } + } +} From 7773fb24bc5071b76b97b5ebbe8befc7c80d1e71 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 15:24:13 +0800 Subject: [PATCH 17/53] update --- .../internal/io/FileCommitProtocol.scala | 9 +++-- .../io/HadoopMapReduceCommitProtocol.scala | 6 +-- ...FileCommitProtocolInstantiationSuite.scala | 12 +++--- .../InsertIntoHadoopFsRelationCommand.scala | 1 + .../SQLHadoopMapReduceCommitProtocol.scala | 4 +- .../execution/datasources/v2/FileWrite.scala | 3 +- .../execution/streaming/FileStreamSink.scala | 3 +- .../execution/metric/SQLMetricsSuite.scala | 4 +- .../sql/sources/PartitionedWriteSuite.scala | 6 ++- .../sql/sources/StagingInsertSuite.scala | 37 +++++++++---------- .../sql/hive/execution/SaveAsHiveFile.scala | 3 +- 11 files changed, 50 insertions(+), 38 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 73b7fd62a25b6..d6f7370c9a582 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -215,6 +215,7 @@ object FileCommitProtocol extends Logging { className: String, jobId: String, outputPath: String, + conf: Configuration = new Configuration(), dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + @@ -224,9 +225,11 @@ object FileCommitProtocol extends Logging { // dynamicPartitionOverwrite: Boolean). // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) - logDebug("Using (String, String, Boolean) constructor") - ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + val ctor = clazz.getDeclaredConstructor( + classOf[String], classOf[String], classOf[Configuration], classOf[Boolean]) + logDebug("Using (String, String, Configuration, Boolean) constructor") + ctor.newInstance( + jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => logDebug("Falling back to (String, String) constructor") diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 86d087cdb5cc2..9cdfe851c89d7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -24,12 +24,12 @@ import scala.collection.mutable import scala.util.Try import org.apache.hadoop.conf.Configurable +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.mapred.SparkHadoopMapRedUtil @@ -68,6 +68,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, + conf: Configuration = new Configuration(), dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { @@ -105,8 +106,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected var stagingDir: Path = - getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration) + protected var stagingDir: Path = getStagingDir(path, jobId, conf) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 778f748f83950..1535ca4c6624e 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.internal.io +import org.apache.hadoop.conf.Configuration + import org.apache.spark.SparkFunSuite /** @@ -58,8 +60,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[Other].getCanonicalName, "job", - "path", - false) + "path") } } @@ -68,8 +69,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[NoMatchingArgs].getCanonicalName, "job", - "path", - false) + "path") } } @@ -83,6 +83,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[ClassicConstructorCommitProtocol].getCanonicalName, "job", "path", + new Configuration(), dynamic).asInstanceOf[ClassicConstructorCommitProtocol] } @@ -97,6 +98,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[FullConstructorCommitProtocol].getCanonicalName, "job", "path", + new Configuration(), dynamic).asInstanceOf[FullConstructorCommitProtocol] } @@ -121,7 +123,7 @@ private class FullConstructorCommitProtocol( arg2: String, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) { def this(arg1: String, arg2: String) = { this(arg1, arg2, false, 2) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 382c1e3c16384..fff93a579a628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -109,6 +109,7 @@ case class InsertIntoHadoopFsRelationCommand( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, outputPath = outputPath.toString, + hadoopConf, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 144be2316f091..dde68006402d7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.datasources +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter @@ -32,8 +33,9 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, + conf: Configuration, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala index ccc467aae1f15..d822b1bb6ef12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala @@ -61,7 +61,8 @@ trait FileWrite extends Write { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = paths.head) + outputPath = paths.head, + hadoopConf) lazy val description = createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 5058a1dfc3baf..8db2ce027fdfd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -150,7 +150,8 @@ class FileStreamSink( val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, - outputPath = path) + outputPath = path, + sparkSession.sessionState.newHadoopConf()) committer match { case manifestCommitter: ManifestFileCommitProtocol => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 32428fbde0016..9638924b394db 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -834,7 +835,8 @@ case class CustomFileCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol( + jobId, path, new Configuration(), dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index b9266429f81a5..df41b93f45fad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.sources import java.io.File import java.sql.Timestamp +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} @@ -35,7 +36,7 @@ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String) - extends SQLHadoopMapReduceCommitProtocol(jobId, path) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration()) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -199,7 +200,8 @@ private class PartitionFileExistCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol( + jobId, path, new Configuration(), dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala index 72c6534dc8751..162cef0cca3f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -46,27 +46,24 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession { } test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { - withTempDir { stagingDir => - withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> - SQLConf.PartitionOverwriteMode.DYNAMIC.toString, - EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) { - withTempDir { d => - withTable("t") { - sql( - s""" - | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) - | LOCATION '${d.getAbsolutePath}' - """.stripMargin) + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString) { + withTempDir { d => + withTable("t") { + sql( + s""" + |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + |LOCATION '${d.getAbsolutePath}' + """.stripMargin) - val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") - df.write - .partitionBy("p1") - .mode("overwrite") - .saveAsTable("t") - checkAnswer(sql("SELECT * FROM t"), df) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) - } + val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 8c6c10fee7935..734a0aa64df2a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -83,7 +83,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputLocation) + outputPath = outputLocation, + hadoopConf) val options = bucketSpec .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true")) From a3b3c51f326fcf285eb870a2874c774a1498d145 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 15:42:17 +0800 Subject: [PATCH 18/53] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index fc5d0a0b3a7f5..d8b783c460f58 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -19,10 +19,10 @@ package org.apache.spark.internal.io.cloud import java.io.IOException +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory} - import org.apache.spark.internal.io.FileNameSpec import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol @@ -51,7 +51,7 @@ class PathOutputCommitProtocol( jobId: String, dest: String, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable { + extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable { if (dynamicPartitionOverwrite) { // until there's explicit extensions to the PathOutputCommitProtocols From b4d60e4b431ebfcc843f826086b1d09135c823c5 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 12 Oct 2021 22:29:45 +0800 Subject: [PATCH 19/53] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index d8b783c460f58..c6497fc47d16f 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory} + import org.apache.spark.internal.io.FileNameSpec import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol From 2c4180886d06e61304774900c45db3d45c18c39c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 10:22:05 +0800 Subject: [PATCH 20/53] Update CommitterBindingSuite.scala --- .../spark/internal/io/cloud/CommitterBindingSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala index 546f54229ea59..13317b61276dc 100644 --- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala +++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala @@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite { test("local filesystem instantiation") { val instance = FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", false) + jobId, "file:///tmp", new Configuration(), false) val protocol = instance.asInstanceOf[PathOutputCommitProtocol] assert("file:///tmp" === protocol.destination) @@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite { val cause = intercept[InvocationTargetException] { FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", true) + jobId, "file:///tmp", new Configuration(), true) }.getCause if (cause == null || !cause.isInstanceOf[IOException] || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) { From 454118d3e7037b99775c53d3d473984e19ed92cf Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 14:02:22 +0800 Subject: [PATCH 21/53] update --- ...cala => SQLPathHadoopMapReduceCommitProtocol.scala} | 10 +++++----- ...putCommitter.scala => SQLPathOutputCommitter.scala} | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{NewSQLHadoopMapReduceCommitProtocol.scala => SQLPathHadoopMapReduceCommitProtocol.scala} (90%) rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{NewFileOutputCommitter.scala => SQLPathOutputCommitter.scala} (99%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala similarity index 90% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index acb4415604488..2135b1ff82b8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -29,26 +29,26 @@ import org.apache.spark.sql.internal.SQLConf * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual * Hadoop output committer using an option specified in SQLConf. */ -class NewSQLHadoopMapReduceCommitProtocol( +class SQLPathHadoopMapReduceCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - private val committerStagingDir = NewFileOutputCommitter.newVersionExternalTempPath( + private val committerStagingDir = SQLPathOutputCommitter.newVersionExternalTempPath( jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf, SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir")) - def currentCommitter: NewFileOutputCommitter = - getCommitter.asInstanceOf[NewFileOutputCommitter] + def currentCommitter: SQLPathOutputCommitter = + getCommitter.asInstanceOf[SQLPathOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. val committerOutputPath = new Path(path) val committer = - new NewFileOutputCommitter(committerStagingDir, committerOutputPath, context) + new SQLPathOutputCommitter(committerStagingDir, committerOutputPath, context) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala index 1a8760f8c6f6d..f831594b49561 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.util.{DurationInfo, Progressable} import org.apache.spark.internal.Logging -class NewFileOutputCommitter( +class SQLPathOutputCommitter( stagingDir: Path, outputPath: Path, context: TaskAttemptContext) @@ -504,7 +504,7 @@ class NewFileOutputCommitter( } } -object NewFileOutputCommitter extends Logging { +object SQLPathOutputCommitter extends Logging { def newVersionExternalTempPath( jobId: String, path: Path, From 11d6d15d1fa19dd67d5ed3b0335d797e7c8fa02b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 14:04:39 +0800 Subject: [PATCH 22/53] Update PartitionedWriteSuite.scala --- .../org/apache/spark/sql/sources/PartitionedWriteSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 186b0e355a700..ab5216b1439cf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.{NewSQLHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol} +import org.apache.spark.sql.execution.datasources.{SQLPathHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -196,7 +196,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { "spark.sql.source.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir", "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2", SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> - classOf[NewSQLHadoopMapReduceCommitProtocol].getName) { + classOf[SQLPathHadoopMapReduceCommitProtocol].getName) { withTempDir { d => withTable("t") { sql( From 9c9826cf9988150792368a817e7b258a692e638d Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 14:20:30 +0800 Subject: [PATCH 23/53] update --- .../internal/io/HadoopMapReduceCommitProtocol.scala | 4 +--- .../SQLPathHadoopMapReduceCommitProtocol.scala | 9 +++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index a7baf74e51818..f279d0514da66 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -73,7 +73,7 @@ class HadoopMapReduceCommitProtocol( import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ - @transient private var committer: OutputCommitter = _ + @transient protected var committer: OutputCommitter = _ /** * Checks whether there are files to be committed to a valid output location. @@ -106,8 +106,6 @@ class HadoopMapReduceCommitProtocol( */ protected def stagingDir = getStagingDir(path, jobId) - protected def getCommitter = committer - protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() // If OutputFormat is Configurable, we should set conf to it. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 2135b1ff82b8f..930e242fd6f66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -40,8 +40,8 @@ class SQLPathHadoopMapReduceCommitProtocol( jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf, SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir")) - def currentCommitter: SQLPathOutputCommitter = - getCommitter.asInstanceOf[SQLPathOutputCommitter] + val sqlPathOutputCommitter: SQLPathOutputCommitter = + committer.asInstanceOf[SQLPathOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { // The specified output committer is a FileOutputCommitter. @@ -59,9 +59,10 @@ class SQLPathHadoopMapReduceCommitProtocol( ext: String): String = { val filename = getFilename(taskContext, ext) dir.map { d => - new Path(new Path(currentCommitter.getTaskAttemptPath(taskContext), d), filename).toString + new Path( + new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString }.getOrElse { - new Path(currentCommitter.getTaskAttemptPath(taskContext), filename).toString + new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), filename).toString } } } From 592682235822113a73befe3247f2b7e1bebbd81f Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:10:21 +0800 Subject: [PATCH 24/53] update --- .../internal/io/FileCommitProtocol.scala | 21 +++++++++++++++++-- .../io/HadoopMapReduceCommitProtocol.scala | 4 +++- .../InsertIntoHadoopFsRelationCommand.scala | 2 +- .../ManifestFileCommitProtocol.scala | 4 +++- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index d6f7370c9a582..73e6abcba867e 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -57,9 +57,26 @@ abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ /** - * The output path of this committer. + * Get the final directory where work will be placed once the job + * is committed. This may be null, in which case, there is no output + * path to write data to. */ - def outputPath: Path + def getOutputPath(): Path = null + + /** + * Get the directory that the task should write results into. + * Warning: there's no guarantee that this work path is on the same + * FS as the final output, or that it's visible across machines. + * May be null. + */ + def getWorkPath(): Path = null + + /** + * Predicate: is there an output path? + */ + def hasOutputPath(): Boolean = { + getOutputPath() != null; + } /** * Setups up a job. Must be called on the driver before any other methods can be invoked. diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 9cdfe851c89d7..aa511d385b78a 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -118,7 +118,9 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } - override def outputPath: Path = { + override def getOutputPath(): Path = new Path(path) + + override def getWorkPath: Path = { if (dynamicPartitionOverwrite) { stagingDir } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index fff93a579a628..1a46b60fe111e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -170,7 +170,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committer.outputPath.toString, customPartitionLocations, outputColumns), + committer.getWorkPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 30709266f1806..09681f882853f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,7 +56,9 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } - override def outputPath: Path = new Path(path) + override def getOutputPath: Path = new Path(path) + + override def getWorkPath(): Path = new Path(path) override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") From 6cdee5868655c972f3aebf9294ae0e1680501447 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:38:22 +0800 Subject: [PATCH 25/53] update --- .../apache/spark/internal/io/FileCommitProtocol.scala | 9 +++++---- .../internal/io/HadoopMapRedCommitProtocol.scala | 2 +- .../internal/io/HadoopMapReduceCommitProtocol.scala | 5 ++--- .../io/FileCommitProtocolInstantiationSuite.scala | 6 +----- .../internal/io/cloud/CommitterBindingSuite.scala | 4 ++-- .../InsertIntoHadoopFsRelationCommand.scala | 8 ++++++-- .../SQLHadoopMapReduceCommitProtocol.scala | 5 ++--- .../sql/execution/datasources/v2/FileWrite.scala | 3 +-- .../sql/execution/streaming/FileStreamSink.scala | 3 +-- .../spark/sql/execution/metric/SQLMetricsSuite.scala | 4 ++-- .../spark/sql/sources/PartitionedWriteSuite.scala | 11 +++++++---- .../spark/sql/hive/execution/SaveAsHiveFile.scala | 3 +-- 12 files changed, 31 insertions(+), 32 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 73e6abcba867e..7029f6f3eb1f1 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -232,8 +232,8 @@ object FileCommitProtocol extends Logging { className: String, jobId: String, outputPath: String, - conf: Configuration = new Configuration(), - dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { + dynamicPartitionOverwrite: Boolean = false, + stagingDir: Option[String] = None): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + s" dynamic=$dynamicPartitionOverwrite") @@ -245,8 +245,9 @@ object FileCommitProtocol extends Logging { val ctor = clazz.getDeclaredConstructor( classOf[String], classOf[String], classOf[Configuration], classOf[Boolean]) logDebug("Using (String, String, Configuration, Boolean) constructor") - ctor.newInstance( - jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + ctor.newInstance(jobId, outputPath, + stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), + dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => logDebug("Falling back to (String, String) constructor") diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index af0aa41518766..30c85e1570d63 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} * Unlike Hadoop's OutputCommitter, this implementation is serializable. */ class HadoopMapRedCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path) { + extends HadoopMapReduceCommitProtocol(jobId, path, "") { override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index aa511d385b78a..20b2be17a305d 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -24,7 +24,6 @@ import scala.collection.mutable import scala.util.Try import org.apache.hadoop.conf.Configurable -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter @@ -68,7 +67,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - conf: Configuration = new Configuration(), + stagingPath: String = "", dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { @@ -106,7 +105,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - protected var stagingDir: Path = getStagingDir(path, jobId, conf) + protected var stagingDir: Path = new Path(stagingPath) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 1535ca4c6624e..87ffe0b69a469 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.internal.io -import org.apache.hadoop.conf.Configuration - import org.apache.spark.SparkFunSuite /** @@ -83,7 +81,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[ClassicConstructorCommitProtocol].getCanonicalName, "job", "path", - new Configuration(), dynamic).asInstanceOf[ClassicConstructorCommitProtocol] } @@ -98,7 +95,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { classOf[FullConstructorCommitProtocol].getCanonicalName, "job", "path", - new Configuration(), dynamic).asInstanceOf[FullConstructorCommitProtocol] } @@ -123,7 +119,7 @@ private class FullConstructorCommitProtocol( arg2: String, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) { def this(arg1: String, arg2: String) = { this(arg1, arg2, false, 2) diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala index 13317b61276dc..546f54229ea59 100644 --- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala +++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala @@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite { test("local filesystem instantiation") { val instance = FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", new Configuration(), false) + jobId, "file:///tmp", false) val protocol = instance.asInstanceOf[PathOutputCommitProtocol] assert("file:///tmp" === protocol.destination) @@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite { val cause = intercept[InvocationTargetException] { FileCommitProtocol.instantiate( pathCommitProtocolClassname, - jobId, "file:///tmp", new Configuration(), true) + jobId, "file:///tmp", true) }.getCause if (cause == null || !cause.isInstanceOf[IOException] || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 1a46b60fe111e..dc6ad7eb8b95f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -19,7 +19,9 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec @@ -105,12 +107,14 @@ case class InsertIntoHadoopFsRelationCommand( } val jobId = java.util.UUID.randomUUID().toString + val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf, + hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, outputPath = outputPath.toString, - hadoopConf, - dynamicPartitionOverwrite = dynamicPartitionOverwrite) + dynamicPartitionOverwrite = dynamicPartitionOverwrite, + stagingDir = Some(tmpLocation.toString)) val doInsertion = if (mode == SaveMode.Append) { true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index dde68006402d7..680c6d63d38ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.datasources -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter @@ -33,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - conf: Configuration, + stagingPath: String = "", dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala index d822b1bb6ef12..ccc467aae1f15 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala @@ -61,8 +61,7 @@ trait FileWrite extends Write { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = paths.head, - hadoopConf) + outputPath = paths.head) lazy val description = createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index 8db2ce027fdfd..5058a1dfc3baf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -150,8 +150,7 @@ class FileStreamSink( val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, - outputPath = path, - sparkSession.sessionState.newHadoopConf()) + outputPath = path) committer match { case manifestCommitter: ManifestFileCommitProtocol => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 9638924b394db..46d44e3b1a95f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,7 +22,6 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -834,9 +833,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils case class CustomFileCommitProtocol( jobId: String, path: String, + stagingPath: String, dynamicPartitionOverwrite: Boolean = false) extends SQLHadoopMapReduceCommitProtocol( - jobId, path, new Configuration(), dynamicPartitionOverwrite) { + jobId, path, stagingPath, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index df41b93f45fad..96ab46df1f50c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.sources import java.io.File import java.sql.Timestamp -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} @@ -35,8 +34,11 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils -private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration()) +private class OnlyDetectCustomPathFileCommitProtocol( + jobId: String, + path: String, + stagingPath: String) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -199,9 +201,10 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { private class PartitionFileExistCommitProtocol( jobId: String, path: String, + stagingPath: String, dynamicPartitionOverwrite: Boolean) extends SQLHadoopMapReduceCommitProtocol( - jobId, path, new Configuration(), dynamicPartitionOverwrite) { + jobId, path, stagingPath, dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 734a0aa64df2a..8c6c10fee7935 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -83,8 +83,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputLocation, - hadoopConf) + outputPath = outputLocation) val options = bucketSpec .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true")) From 824ec0474c5df84fa589394e0f82f7d57e65f224 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:39:50 +0800 Subject: [PATCH 26/53] Update FileCommitProtocol.scala --- .../scala/org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 7029f6f3eb1f1..4050edfc61481 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -243,7 +243,7 @@ object FileCommitProtocol extends Logging { // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[Configuration], classOf[Boolean]) + classOf[String], classOf[String], classOf[String], classOf[Boolean]) logDebug("Using (String, String, Configuration, Boolean) constructor") ctor.newInstance(jobId, outputPath, stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), From 8c8a174814c9e38cdb790ca77975ce3cd7d64264 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:48:25 +0800 Subject: [PATCH 27/53] update --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 6 ------ .../apache/spark/sql/sources/PartitionedWriteSuite.scala | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 4050edfc61481..23f640b40af87 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -27,7 +27,6 @@ import org.apache.hadoop.mapreduce._ import org.apache.spark.annotation.Unstable import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.util.Utils @@ -259,11 +258,6 @@ object FileCommitProtocol extends Logging { } } - def getStagingDir(path: String, jobId: String, conf: Configuration): Path = { - newVersionExternalTempPath( - new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) - } - def newVersionExternalTempPath( path: Path, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 96ab46df1f50c..0fb0d85ced2b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -37,8 +37,9 @@ import org.apache.spark.util.Utils private class OnlyDetectCustomPathFileCommitProtocol( jobId: String, path: String, - stagingPath: String) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath) + stagingPath: String, + dynamicPartitionOverwrite: Boolean) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) with Serializable with Logging { override def newTaskTempFileAbsPath( From 8d7ce6e58800791042a4a3f12c16327abbbe903b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:51:54 +0800 Subject: [PATCH 28/53] update --- .../apache/spark/internal/io/HadoopMapRedCommitProtocol.scala | 4 +++- .../datasources/SQLHadoopMapReduceCommitProtocol.scala | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index 30c85e1570d63..d250601349429 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -17,6 +17,7 @@ package org.apache.spark.internal.io +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} @@ -27,7 +28,8 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} * Unlike Hadoop's OutputCommitter, this implementation is serializable. */ class HadoopMapRedCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path, "") { + extends HadoopMapReduceCommitProtocol( + jobId, path, new Path(path, ".spark-staging-" + jobId).toString) { override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 680c6d63d38ed..7ed76643a4eb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String = "", + stagingPath: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) with Serializable with Logging { From da6a0b9ccbd06bc1f08558e74d00facf2bf487f8 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 17:58:41 +0800 Subject: [PATCH 29/53] update --- .../spark/internal/io/HadoopMapRedCommitProtocol.scala | 4 +--- .../internal/io/HadoopMapReduceCommitProtocol.scala | 9 ++++++++- .../io/FileCommitProtocolInstantiationSuite.scala | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala index d250601349429..af0aa41518766 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala @@ -17,7 +17,6 @@ package org.apache.spark.internal.io -import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} @@ -28,8 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext} * Unlike Hadoop's OutputCommitter, this implementation is serializable. */ class HadoopMapRedCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol( - jobId, path, new Path(path, ".spark-staging-" + jobId).toString) { + extends HadoopMapReduceCommitProtocol(jobId, path) { override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = { val config = context.getConfiguration.asInstanceOf[JobConf] diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 20b2be17a305d..6f2b73d34c0c6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -67,10 +67,17 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String = "", + stagingPath: String, dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { + def this(jobId: String, path: String) = + this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString) + + def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = + this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString, + dynamicPartitionOverwrite) + import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 87ffe0b69a469..77f42577517e8 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -119,7 +119,7 @@ private class FullConstructorCommitProtocol( arg2: String, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { def this(arg1: String, arg2: String) = { this(arg1, arg2, false, 2) From 63e466c5ade9078489e0990acdd5c5f1dd97ac10 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 18:07:59 +0800 Subject: [PATCH 30/53] update --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- .../internal/io/FileCommitProtocolInstantiationSuite.scala | 6 ++++-- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 3 +-- .../apache/spark/sql/execution/metric/SQLMetricsSuite.scala | 3 +-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 23f640b40af87..daa6addc0a9f6 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging { classOf[String], classOf[String], classOf[String], classOf[Boolean]) logDebug("Using (String, String, Configuration, Boolean) constructor") ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), + stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString), dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 77f42577517e8..778f748f83950 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -58,7 +58,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[Other].getCanonicalName, "job", - "path") + "path", + false) } } @@ -67,7 +68,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { FileCommitProtocol.instantiate( classOf[NoMatchingArgs].getCanonicalName, "job", - "path") + "path", + false) } } diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index c6497fc47d16f..fc5d0a0b3a7f5 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -19,7 +19,6 @@ package org.apache.spark.internal.io.cloud import java.io.IOException -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory} @@ -52,7 +51,7 @@ class PathOutputCommitProtocol( jobId: String, dest: String, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable { + extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable { if (dynamicPartitionOverwrite) { // until there's explicit extensions to the PathOutputCommitProtocols diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 46d44e3b1a95f..f90c5064b0fcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -835,8 +835,7 @@ case class CustomFileCommitProtocol( path: String, stagingPath: String, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol( - jobId, path, stagingPath, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) From fd2ac5f62e28959ce4c6bd5f92718a5eba19c213 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 13 Oct 2021 18:18:37 +0800 Subject: [PATCH 31/53] update --- .../spark/internal/io/FileCommitProtocol.scala | 8 ++++---- .../io/HadoopMapReduceCommitProtocol.scala | 16 ++++++---------- .../InsertIntoHadoopFsRelationCommand.scala | 2 +- .../SQLHadoopMapReduceCommitProtocol.scala | 4 ++-- .../sql/execution/metric/SQLMetricsSuite.scala | 5 +++-- .../sql/sources/PartitionedWriteSuite.scala | 9 ++++----- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index daa6addc0a9f6..151fcdbec17a5 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -232,7 +232,7 @@ object FileCommitProtocol extends Logging { jobId: String, outputPath: String, dynamicPartitionOverwrite: Boolean = false, - stagingDir: Option[String] = None): FileCommitProtocol = { + stagingDir: Option[Path] = None): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + s" dynamic=$dynamicPartitionOverwrite") @@ -242,10 +242,10 @@ object FileCommitProtocol extends Logging { // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[String], classOf[Boolean]) - logDebug("Using (String, String, Configuration, Boolean) constructor") + classOf[String], classOf[String], classOf[Path], classOf[Boolean]) + logDebug("Using (String, String, Path, Boolean) constructor") ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString), + stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 6f2b73d34c0c6..27e155c5733c7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -40,6 +40,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop + * @param stagingDir The staging directory of this write job. Spark uses it to deal with + * files with absolute output path, or writing data into partitioned directory + * with dynamicPartitionOverwrite=true * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime * dynamically. Suppose final path is /path/to/outputPath, output * path of [[FileOutputCommitter]] is an intermediate path, e.g. @@ -67,16 +70,15 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { def this(jobId: String, path: String) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString) + this(jobId, path, new Path(path, ".spark-staging-" + jobId)) def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString, - dynamicPartitionOverwrite) + this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite) import FileCommitProtocol._ @@ -108,12 +110,6 @@ class HadoopMapReduceCommitProtocol( */ @transient private var partitionPaths: mutable.Set[String] = null - /** - * The staging directory of this write job. Spark uses it to deal with files with absolute output - * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. - */ - protected var stagingDir: Path = new Path(stagingPath) - protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() // If OutputFormat is Configurable, we should set conf to it. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index dc6ad7eb8b95f..6605bb6c1e870 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -114,7 +114,7 @@ case class InsertIntoHadoopFsRelationCommand( jobId = jobId, outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite, - stagingDir = Some(tmpLocation.toString)) + stagingDir = Some(tmpLocation)) val doInsertion = if (mode == SaveMode.Append) { true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 7ed76643a4eb2..34ce32e710c48 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -32,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index f90c5064b0fcc..08c86e6edbb08 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -833,9 +834,9 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils case class CustomFileCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 0fb0d85ced2b9..4fffe8f560345 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -37,9 +37,9 @@ import org.apache.spark.util.Utils private class OnlyDetectCustomPathFileCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -202,10 +202,9 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { private class PartitionFileExistCommitProtocol( jobId: String, path: String, - stagingPath: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol( - jobId, path, stagingPath, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") From e2951f128b91574e8019bc9e8f3a1bf666f73008 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 08:33:31 +0800 Subject: [PATCH 32/53] fix UT --- .../spark/internal/io/FileCommitProtocol.scala | 9 ++++++++- .../internal/io/HadoopMapReduceCommitProtocol.scala | 5 +++-- .../io/FileCommitProtocolInstantiationSuite.scala | 13 ++++++++----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 151fcdbec17a5..ab6dbd97f11d5 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging { classOf[String], classOf[String], classOf[Path], classOf[Boolean]) logDebug("Using (String, String, Path, Boolean) constructor") ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)), + stagingDir.getOrElse(getStagingDir(outputPath, jobId)), dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => @@ -258,6 +258,13 @@ object FileCommitProtocol extends Logging { } } + def getStagingDir(path: String, jobId: String): Path = { + try { + new Path(path, ".spark-staging-" + jobId) + } catch { + case _: Exception => null + } + } def newVersionExternalTempPath( path: Path, diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 27e155c5733c7..4c8dd50c146cc 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir import org.apache.spark.mapred.SparkHadoopMapRedUtil /** @@ -75,10 +76,10 @@ class HadoopMapReduceCommitProtocol( extends FileCommitProtocol with Serializable with Logging { def this(jobId: String, path: String) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId)) + this(jobId, path, getStagingDir(path, jobId)) def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite) + this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) import FileCommitProtocol._ diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 778f748f83950..26f92cb669b6b 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.internal.io +import org.apache.hadoop.fs.Path + import org.apache.spark.SparkFunSuite /** @@ -119,16 +121,17 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String) private class FullConstructorCommitProtocol( arg1: String, arg2: String, + arg3: Path, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) { - def this(arg1: String, arg2: String) = { - this(arg1, arg2, false, 2) + def this(arg1: String, arg2: String, arg3: Path) = { + this(arg1, arg2, arg3, false, 2) } - def this(arg1: String, arg2: String, b: Boolean) = { - this(arg1, arg2, false, 3) + def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = { + this(arg1, arg2, arg3, false, 3) } } From c8d0c33b25cf06409798bdc18417887e0f1d1d1c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 08:49:20 +0800 Subject: [PATCH 33/53] complicated --- .../internal/io/FileCommitProtocol.scala | 21 +++++++++++++------ ...FileCommitProtocolInstantiationSuite.scala | 13 +++++------- .../SQLHadoopMapReduceCommitProtocol.scala | 7 +++++++ .../sql/sources/PartitionedWriteSuite.scala | 11 +++------- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index ab6dbd97f11d5..7c67e3441b33a 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -249,12 +249,21 @@ object FileCommitProtocol extends Logging { dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => - logDebug("Falling back to (String, String) constructor") - require(!dynamicPartitionOverwrite, - "Dynamic Partition Overwrite is enabled but" + - s" the committer ${className} does not have the appropriate constructor") - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) + try { + logDebug("Falling back to (String, String, Boolean) constructor") + val ctor = clazz.getDeclaredConstructor( + classOf[String], classOf[String], classOf[Boolean]) + ctor.newInstance(jobId, outputPath, + dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + } catch { + case _: NoSuchMethodException => + logDebug("Falling back to (String, String) constructor") + require(!dynamicPartitionOverwrite, + "Dynamic Partition Overwrite is enabled but" + + s" the committer ${className} does not have the appropriate constructor") + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) + } } } diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 26f92cb669b6b..778f748f83950 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.internal.io -import org.apache.hadoop.fs.Path - import org.apache.spark.SparkFunSuite /** @@ -121,17 +119,16 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String) private class FullConstructorCommitProtocol( arg1: String, arg2: String, - arg3: Path, b: Boolean, val argCount: Int) - extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) { + extends HadoopMapReduceCommitProtocol(arg1, arg2, b) { - def this(arg1: String, arg2: String, arg3: Path) = { - this(arg1, arg2, arg3, false, 2) + def this(arg1: String, arg2: String) = { + this(arg1, arg2, false, 2) } - def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = { - this(arg1, arg2, arg3, false, 3) + def this(arg1: String, arg2: String, b: Boolean) = { + this(arg1, arg2, false, 3) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 34ce32e710c48..a6ecffdcaeb5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -22,6 +22,7 @@ import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf @@ -37,6 +38,12 @@ class SQLHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { + def this(jobId: String, path: String) = + this(jobId, path, getStagingDir(path, jobId), false) + + def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = + this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) + override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 4fffe8f560345..b9266429f81a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -34,12 +34,8 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils -private class OnlyDetectCustomPathFileCommitProtocol( - jobId: String, - path: String, - stagingDir: Path, - dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) +private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String) + extends SQLHadoopMapReduceCommitProtocol(jobId, path) with Serializable with Logging { override def newTaskTempFileAbsPath( @@ -202,9 +198,8 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { private class PartitionFileExistCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { override def setupJob(jobContext: JobContext): Unit = { super.setupJob(jobContext) val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") From eafb8dd5b81c587280adddcd9183ee26b2e08c75 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 10:49:26 +0800 Subject: [PATCH 34/53] update --- .../internal/io/FileCommitProtocol.scala | 20 ++++++++++--------- .../io/HadoopMapReduceCommitProtocol.scala | 18 ++++------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 7c67e3441b33a..b32a3979a8d5f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -21,6 +21,8 @@ import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} +import scala.util.Try + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ @@ -71,7 +73,12 @@ abstract class FileCommitProtocol extends Logging { def getWorkPath(): Path = null /** - * Predicate: is there an output path? + * Checks whether there are files to be committed to a valid output location. + * + * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, + * it is necessary to check whether a valid output path is specified. + * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for + * committers not writing to distributed file systems. */ def hasOutputPath(): Boolean = { getOutputPath() != null; @@ -268,11 +275,7 @@ object FileCommitProtocol extends Logging { } def getStagingDir(path: String, jobId: String): Path = { - try { - new Path(path, ".spark-staging-" + jobId) - } catch { - case _: Exception => null - } + Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null) } def newVersionExternalTempPath( @@ -290,15 +293,14 @@ object FileCommitProtocol extends Logging { } } - private def getExtTmpPathRelTo( path: Path, hadoopConf: Configuration, stagingDir: String, engineType: String, jobId: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), - "-ext-10000") // Hive uses 10000 + // Hive uses 10000 + new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), "-ext-10000") } private def getExternalScratchDir( diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 4c8dd50c146cc..fff87da244c0c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -86,16 +86,6 @@ class HadoopMapReduceCommitProtocol( /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @transient private var committer: OutputCommitter = _ - /** - * Checks whether there are files to be committed to a valid output location. - * - * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, - * it is necessary to check whether a valid output path is specified. - * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for - * committers not writing to distributed file systems. - */ - private val hasValidPath = Try { new Path(path) }.isSuccess - /** * Tracks files staged by this task for absolute output paths. These outputs are not managed by * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. @@ -121,13 +111,13 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } - override def getOutputPath(): Path = new Path(path) + override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null) override def getWorkPath: Path = { if (dynamicPartitionOverwrite) { stagingDir } else { - new Path(path) + getOutputPath() } } @@ -206,7 +196,7 @@ class HadoopMapReduceCommitProtocol( override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - if (hasValidPath) { + if (hasOutputPath()) { val (allAbsPathFiles, allPartitionPaths) = taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip val fs = stagingDir.getFileSystem(jobContext.getConfiguration) @@ -270,7 +260,7 @@ class HadoopMapReduceCommitProtocol( logWarning(s"Exception while aborting ${jobContext.getJobID}", e) } try { - if (hasValidPath) { + if (hasOutputPath()) { val fs = stagingDir.getFileSystem(jobContext.getConfiguration) fs.delete(stagingDir, true) } From ff2bfb89ea40c2626b430aa6c06614e6a9228503 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 11:30:10 +0800 Subject: [PATCH 35/53] revert API change --- .../internal/io/FileCommitProtocol.scala | 27 ------------------- .../io/HadoopMapReduceCommitProtocol.scala | 24 ++++++++--------- .../InsertIntoHadoopFsRelationCommand.scala | 10 ++++++- .../ManifestFileCommitProtocol.scala | 4 --- .../execution/metric/SQLMetricsSuite.scala | 4 +-- 5 files changed, 22 insertions(+), 47 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index b32a3979a8d5f..f2f26297e02c7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -57,33 +57,6 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ - /** - * Get the final directory where work will be placed once the job - * is committed. This may be null, in which case, there is no output - * path to write data to. - */ - def getOutputPath(): Path = null - - /** - * Get the directory that the task should write results into. - * Warning: there's no guarantee that this work path is on the same - * FS as the final output, or that it's visible across machines. - * May be null. - */ - def getWorkPath(): Path = null - - /** - * Checks whether there are files to be committed to a valid output location. - * - * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, - * it is necessary to check whether a valid output path is specified. - * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for - * committers not writing to distributed file systems. - */ - def hasOutputPath(): Boolean = { - getOutputPath() != null; - } - /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index fff87da244c0c..35bad427253ed 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -86,6 +86,16 @@ class HadoopMapReduceCommitProtocol( /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @transient private var committer: OutputCommitter = _ + /** + * Checks whether there are files to be committed to a valid output location. + * + * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null, + * it is necessary to check whether a valid output path is specified. + * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for + * committers not writing to distributed file systems. + */ + private val hasValidPath = Try { new Path(path) }.isSuccess + /** * Tracks files staged by this task for absolute output paths. These outputs are not managed by * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. @@ -111,16 +121,6 @@ class HadoopMapReduceCommitProtocol( format.getOutputCommitter(context) } - override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null) - - override def getWorkPath: Path = { - if (dynamicPartitionOverwrite) { - stagingDir - } else { - getOutputPath() - } - } - override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { newTaskTempFile(taskContext, dir, FileNameSpec("", ext)) @@ -196,7 +196,7 @@ class HadoopMapReduceCommitProtocol( override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - if (hasOutputPath()) { + if (hasValidPath) { val (allAbsPathFiles, allPartitionPaths) = taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip val fs = stagingDir.getFileSystem(jobContext.getConfiguration) @@ -260,7 +260,7 @@ class HadoopMapReduceCommitProtocol( logWarning(s"Exception while aborting ${jobContext.getJobID}", e) } try { - if (hasOutputPath()) { + if (hasValidPath) { val fs = stagingDir.getFileSystem(jobContext.getConfiguration) fs.delete(stagingDir, true) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 6605bb6c1e870..e0461766097c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -167,6 +167,14 @@ case class InsertIntoHadoopFsRelationCommand( } } + // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files + // will be renamed from staging path to final output path during commit job + val committerOutputPath = if (dynamicPartitionOverwrite) { + tmpLocation + } else { + qualifiedOutputPath + } + val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -174,7 +182,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committer.getWorkPath.toString, customPartitionLocations, outputColumns), + committerOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 09681f882853f..46ce33687890d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -56,10 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String) this.batchId = batchId } - override def getOutputPath: Path = new Path(path) - - override def getWorkPath(): Path = new Path(path) - override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") pendingCommitFiles = new ArrayBuffer[Path] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 08c86e6edbb08..32428fbde0016 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -22,7 +22,6 @@ import java.io.File import scala.reflect.{classTag, ClassTag} import scala.util.Random -import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.internal.io.FileCommitProtocol @@ -834,9 +833,8 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils case class CustomFileCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) { + extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { override def commitTask( taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = { Thread.sleep(Random.nextInt(100)) From c5a1d16d55591edfc539490d193a77eb38a80ea4 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 11:36:11 +0800 Subject: [PATCH 36/53] update --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +- .../datasources/InsertIntoHadoopFsRelationCommand.scala | 3 +-- .../org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index f2f26297e02c7..c421c44414b7f 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -251,7 +251,7 @@ object FileCommitProtocol extends Logging { Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null) } - def newVersionExternalTempPath( + def externalTempPath( path: Path, hadoopConf: Configuration, stagingDir: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index e0461766097c6..45ba105c501fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -21,7 +21,6 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.FileCommitProtocol -import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec @@ -107,7 +106,7 @@ case class InsertIntoHadoopFsRelationCommand( } val jobId = java.util.UUID.randomUUID().toString - val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf, + val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf, hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 8c6c10fee7935..c3e9deacd7e80 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -134,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) { oldVersionExternalTempPath(path, hadoopConf, scratchDir) } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) { - val externalTempPath = FileCommitProtocol.newVersionExternalTempPath( + val externalTempPath = FileCommitProtocol.externalTempPath( path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString) createdTempDir = Some(externalTempPath.getParent) externalTempPath From ac468fa3b872e93b7124c265da6131f2206b7310 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 16:05:20 +0800 Subject: [PATCH 37/53] update --- ...SQLPathHadoopMapReduceCommitProtocol.scala | 25 ++-- .../datasources/SQLPathOutputCommitter.scala | 114 +++--------------- 2 files changed, 25 insertions(+), 114 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 930e242fd6f66..823cba3c07d7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -21,9 +21,7 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.internal.io.{FileNameSpec, HadoopMapReduceCommitProtocol} /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -32,32 +30,25 @@ import org.apache.spark.sql.internal.SQLConf class SQLPathHadoopMapReduceCommitProtocol( jobId: String, path: String, + stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { - private val committerStagingDir = SQLPathOutputCommitter.newVersionExternalTempPath( - jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf, - SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir")) - val sqlPathOutputCommitter: SQLPathOutputCommitter = committer.asInstanceOf[SQLPathOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { - // The specified output committer is a FileOutputCommitter. - // So, we will use the FileOutputCommitter-specified constructor. - val committerOutputPath = new Path(path) - val committer = - new SQLPathOutputCommitter(committerStagingDir, committerOutputPath, context) + val committer = new SQLPathOutputCommitter(stagingDir, new Path(path), context) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } override def newTaskTempFile( - taskContext: TaskAttemptContext, - dir: Option[String], - ext: String): String = { - val filename = getFilename(taskContext, ext) + taskContext: TaskAttemptContext, + dir: Option[String], + spec: FileNameSpec): String = { + val filename = getFilename(taskContext, spec) dir.map { d => new Path( new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala index f831594b49561..61c4e947e969b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala @@ -199,10 +199,13 @@ class SQLPathOutputCommitter( if (this.skipCleanup) { logInfo("Skip cleanup the _temporary folders under job's output directory in commitJob.") } else { - try this.cleanupJob(context) - catch { + try { + this.cleanupJob(context) + } catch { case var8: IOException => - if (!this.ignoreCleanupFailures) throw var8 + if (!this.ignoreCleanupFailures) { + throw var8 + } logError("Error in cleanup job, manually cleanup is needed.", var8) } } @@ -267,13 +270,13 @@ class SQLPathOutputCommitter( } finally { if (d != null) { if (var6 != null) { - try d.close() - catch { + try { + d.close() + } catch { case var20: Throwable => var6.addSuppressed(var20) } - } - else { + } else { d.close() } } @@ -296,8 +299,7 @@ class SQLPathOutputCommitter( if (!(fs.rename(from.getPath, to))) { throw new IOException("Failed to rename " + from + " to " + to) } - } - else { + } else { fs.mkdirs(to) val var5: Array[FileStatus] = fs.listStatus(from.getPath) val var6: Int = var5.length @@ -316,8 +318,9 @@ class SQLPathOutputCommitter( if (this.hasOutputPath) { val pendingJobAttemptsPath: Path = this.getPendingJobAttemptsPath val fs: FileSystem = pendingJobAttemptsPath.getFileSystem(context.getConfiguration) - try fs.delete(pendingJobAttemptsPath, true) - catch { + try { + fs.delete(pendingJobAttemptsPath, true) + } catch { case var5: FileNotFoundException => if (!(this.isCommitJobRepeatable(context))) { throw var5 @@ -466,12 +469,10 @@ class SQLPathOutputCommitter( throw new IOException( "Could not rename " + previousCommittedTaskPath + " to " + committedTaskPath) } - } - else { + } else { logWarning(attemptId + " had no output to recover.") } - } - else { + } else { try { val from: FileStatus = fs.getFileStatus(previousCommittedTaskPath) logInfo("Recovering task for upgrading scenario, moving files from " + @@ -483,8 +484,7 @@ class SQLPathOutputCommitter( } logInfo("Done recovering task " + attemptId) } - } - else { + } else { logWarning("Output Path is null in recoverTask()") } } @@ -503,83 +503,3 @@ class SQLPathOutputCommitter( sb.toString } } - -object SQLPathOutputCommitter extends Logging { - def newVersionExternalTempPath( - jobId: String, - path: Path, - hadoopConf: Configuration, - stagingDir: String): Path = { - val extURI = path.toUri - if (extURI.getScheme == "viewfs") { - getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, jobId) - } else { - new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, jobId), "-ext-10000") - } - } - - - private def getExtTmpPathRelTo( - path: Path, - hadoopConf: Configuration, - stagingDir: String, - jobId: String): Path = { - new Path(getStagingDir(path, hadoopConf, stagingDir, jobId), "-ext-10000") // Hive uses 10000 - } - - private def getExternalScratchDir( - extURI: URI, - hadoopConf: Configuration, - stagingDir: String, - jobId: String): Path = { - getStagingDir( - new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), - hadoopConf, - stagingDir, - jobId: String) - } - - private def getStagingDir( - inputPath: Path, - hadoopConf: Configuration, - stagingDir: String, - jobId: String): Path = { - val inputPathName: String = inputPath.toString - val fs: FileSystem = inputPath.getFileSystem(hadoopConf) - var stagingPathName: String = - if (inputPathName.indexOf(stagingDir) == -1) { - new Path(inputPathName, stagingDir).toString - } else { - inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length) - } - - // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the - // staging directory needs to avoid being deleted when users set hive.exec.stagingdir - // under the table directory. - if (isSubDir(new Path(stagingPathName), inputPath, fs) && - !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) { - logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " + - "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " + - "directory.") - stagingPathName = new Path(inputPathName, ".hive-staging").toString - } - - val dir = fs.makeQualified( - new Path(stagingPathName + "_" + executionId + "-" + jobId)) - logDebug("Created staging dir = " + dir + " for path = " + inputPath) - dir - } - - // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir(). - private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = { - val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR - val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR - path1.startsWith(path2) - } - - private def executionId: String = { - val rand: Random = new Random - val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) - "spark_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - } -} From 09f211b4d22533bd04f124104b3ab7965e3e017f Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 16:34:16 +0800 Subject: [PATCH 38/53] update --- .../datasources/FileFormatDataWriter.scala | 2 + ...SQLPathHadoopMapReduceCommitProtocol.scala | 3 +- .../datasources/SQLPathOutputCommitter.scala | 4 -- .../sql/sources/PartitionedWriteSuite.scala | 69 +------------------ .../sql/sources/StagingInsertSuite.scala | 68 ++++++++++++++++++ 5 files changed, 73 insertions(+), 73 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala index 0b1b616bd835c..e1865d68b9c0a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala @@ -292,6 +292,8 @@ abstract class BaseDynamicPartitionDataWriter( committer.newTaskTempFile(taskAttemptContext, partDir, fileNameSpec) } + println(s"write current path = ${currentPath}") + currentWriter = description.outputWriterFactory.newInstance( path = currentPath, dataSchema = description.dataColumns.toStructType, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 823cba3c07d7e..86ad89f920693 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -35,10 +35,11 @@ class SQLPathHadoopMapReduceCommitProtocol( extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) with Serializable with Logging { - val sqlPathOutputCommitter: SQLPathOutputCommitter = + def sqlPathOutputCommitter(): SQLPathOutputCommitter = committer.asInstanceOf[SQLPathOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { + println(s"output path = ${path}") val committer = new SQLPathOutputCommitter(stagingDir, new Path(path), context) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala index 61c4e947e969b..22cd72e353f96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala @@ -18,12 +18,8 @@ package org.apache.spark.sql.execution.datasources import java.io.{FileNotFoundException, IOException} -import java.net.URI -import java.text.SimpleDateFormat -import java.util.{Date, Locale, Random} import com.google.common.base.Preconditions -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter} import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext, TaskAttemptID} import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index ab5216b1439cf..b9266429f81a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.{SQLPathHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol} +import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -189,73 +189,6 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } - - test("SPARK-36571") { - withTempDir { stagingDir => - withSQLConf( - "spark.sql.source.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir", - "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2", - SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> - classOf[SQLPathHadoopMapReduceCommitProtocol].getName) { - withTempDir { d => - withTable("t") { - sql( - s""" - | create table t(c1 int, p1 int) using ORC - | location '${d.getAbsolutePath}' - """.stripMargin) - - val df = - Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) - .toDF("c1", "p1").repartition(1) - df.write - .mode("overwrite") - .format("orc") - .saveAsTable("t") - checkAnswer(sql("select * from t"), df) - } - } - withTempDir { d => - withTable("t") { - sql( - s""" - | create table t(c1 int, p1 int) using ORC PARTITIONED BY(p1) - | location '${d.getAbsolutePath}' - """.stripMargin) - - val df = - Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) - .toDF("c1", "p1").repartition(1) - - df.createOrReplaceTempView("view1") - sql("insert overwrite t partition(p1=2) select c1 from view1 ") - checkAnswer(sql("select * from t where p1=2"), df) - sql("desc formatted t partition(p1=2)").show(100, 2000) - } - } - withTempDir { d => - withTable("t") { - sql( - s""" - | create table t(c1 int, p1 int) using parquet partitioned by (p1) - | location '${d.getAbsolutePath}' - """.stripMargin) - - val df = Seq((1, 2)).toDF("c1", "p1") - df.write - .partitionBy("p1") - .mode("overwrite") - .saveAsTable("t") - checkAnswer(sql("select * from t"), df) - } - } - } - } - } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala index 162cef0cca3f8..131758cc1f84d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.sources import org.apache.spark.SparkConf import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.execution.datasources.SQLPathHadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils @@ -68,4 +69,71 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-36571: Add a new commit protocol that support writing data to staging " + + "then mv to output path") { + withTempDir { stagingDir => + withSQLConf( + "spark.exec.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir", + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2", + SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> + classOf[SQLPathHadoopMapReduceCommitProtocol].getName) { + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) using ORC + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = + Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) + .toDF("c1", "p1").repartition(1) + df.write + .mode("overwrite") + .format("orc") + .saveAsTable("t") + checkAnswer(sql("select * from t"), df) + } + } + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING ORC PARTITIONED BY(p1) + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = + Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) + .toDF("c1", "p1").repartition(1) + + df.createOrReplaceTempView("view1") + sql("INSERT OVERWRITE t PARTITION(p1=2) SELECT c1 FROM view1 ") + checkAnswer(sql("SELECT * FROM t WHERE p1=2"), df) + } + } + withTempDir { d => + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + | LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("select * from t"), df) + } + } + } + } + } } From f1f12c39c27ba85a8fd99967b618ce19e06ab39b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 16:43:28 +0800 Subject: [PATCH 39/53] update --- .../spark/sql/execution/datasources/FileFormatDataWriter.scala | 2 -- .../datasources/SQLPathHadoopMapReduceCommitProtocol.scala | 1 - 2 files changed, 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala index e1865d68b9c0a..0b1b616bd835c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala @@ -292,8 +292,6 @@ abstract class BaseDynamicPartitionDataWriter( committer.newTaskTempFile(taskAttemptContext, partDir, fileNameSpec) } - println(s"write current path = ${currentPath}") - currentWriter = description.outputWriterFactory.newInstance( path = currentPath, dataSchema = description.dataColumns.toStructType, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 86ad89f920693..4690417e1e2bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -39,7 +39,6 @@ class SQLPathHadoopMapReduceCommitProtocol( committer.asInstanceOf[SQLPathOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { - println(s"output path = ${path}") val committer = new SQLPathOutputCommitter(stagingDir, new Path(path), context) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer From 023787a7914ef6c06c36622ee7e843025e8b4c8c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 17:42:49 +0800 Subject: [PATCH 40/53] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index fc5d0a0b3a7f5..cbdc5c3438842 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -61,7 +61,7 @@ class PathOutputCommitProtocol( } /** The committer created. */ - @transient private var committer: PathOutputCommitter = _ + @transient override protected var committer: PathOutputCommitter = _ require(dest != null, "Null destination specified") From 5984fb2d767c9c951d0437fa432e0501d98220ee Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 14 Oct 2021 18:35:40 +0800 Subject: [PATCH 41/53] Update PathOutputCommitProtocol.scala --- .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala index cbdc5c3438842..467de4509e7fe 100644 --- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala +++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala @@ -60,9 +60,6 @@ class PathOutputCommitProtocol( throw new IOException(PathOutputCommitProtocol.UNSUPPORTED) } - /** The committer created. */ - @transient override protected var committer: PathOutputCommitter = _ - require(dest != null, "Null destination specified") private[cloud] val destination: String = dest @@ -115,7 +112,7 @@ class PathOutputCommitProtocol( logTrace(s"Committer $committer may not be tolerant of task commit failures") } } - committer + committer.asInstanceOf[PathOutputCommitter] } /** @@ -131,7 +128,7 @@ class PathOutputCommitProtocol( dir: Option[String], spec: FileNameSpec): String = { - val workDir = committer.getWorkPath + val workDir = committer.asInstanceOf[PathOutputCommitter].getWorkPath val parent = dir.map { d => new Path(workDir, d) }.getOrElse(workDir) From c2f606f0a6a3816f5d46ade0466912be0dfb4996 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Fri, 15 Oct 2021 15:26:33 +0800 Subject: [PATCH 42/53] trigger GA From b90caeb5014282daf520dc81a3902bf4e0399c58 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:34:28 +0800 Subject: [PATCH 43/53] Update FileCommitProtocol.scala --- .../internal/io/FileCommitProtocol.scala | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index c421c44414b7f..684aca08005d7 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -57,6 +57,22 @@ import org.apache.spark.util.Utils abstract class FileCommitProtocol extends Logging { import FileCommitProtocol._ + /** + * Get the final directory where the result data will be placed once the job + * is committed. This may be null, in which case, there is no output + * path to write data to and won't write any data. + */ + def getOutputPath: Path = null + + /** + * Get the directory that the task should write results into. + * Warning: there's no guarantee that this work path is on the same + * FS as the final output, or that it's visible across machines. + * May be null, in which case, there is no output path to write data to + * and won't write any data. + */ + def getWorkPath: Path = null + /** * Setups up a job. Must be called on the driver before any other methods can be invoked. */ @@ -221,34 +237,22 @@ object FileCommitProtocol extends Logging { // dynamicPartitionOverwrite: Boolean). // If that doesn't exist, try the one with (jobId: string, outputPath: String). try { - val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[Path], classOf[Boolean]) - logDebug("Using (String, String, Path, Boolean) constructor") - ctor.newInstance(jobId, outputPath, - stagingDir.getOrElse(getStagingDir(outputPath, jobId)), - dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) + logDebug("Using (String, String, Boolean) constructor") + ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) } catch { case _: NoSuchMethodException => - try { - logDebug("Falling back to (String, String, Boolean) constructor") - val ctor = clazz.getDeclaredConstructor( - classOf[String], classOf[String], classOf[Boolean]) - ctor.newInstance(jobId, outputPath, - dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) - } catch { - case _: NoSuchMethodException => - logDebug("Falling back to (String, String) constructor") - require(!dynamicPartitionOverwrite, - "Dynamic Partition Overwrite is enabled but" + - s" the committer ${className} does not have the appropriate constructor") - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) - } + logDebug("Falling back to (String, String) constructor") + require(!dynamicPartitionOverwrite, + "Dynamic Partition Overwrite is enabled but" + + s" the committer ${className} does not have the appropriate constructor") + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) } } def getStagingDir(path: String, jobId: String): Path = { - Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null) + new Path(path, ".spark-staging-" + jobId) } def externalTempPath( From 98f066c1f4f580cc429387f317c437d56c991717 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:35:03 +0800 Subject: [PATCH 44/53] Update FileCommitProtocol.scala --- .../org/apache/spark/internal/io/FileCommitProtocol.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 684aca08005d7..bdbef8d63bfbc 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -21,8 +21,6 @@ import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} -import scala.util.Try - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapreduce._ @@ -227,8 +225,7 @@ object FileCommitProtocol extends Logging { className: String, jobId: String, outputPath: String, - dynamicPartitionOverwrite: Boolean = false, - stagingDir: Option[Path] = None): FileCommitProtocol = { + dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" + s" dynamic=$dynamicPartitionOverwrite") From 2ea214c22b5481be6c10e35d40bb0b4b16b220d2 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:37:07 +0800 Subject: [PATCH 45/53] update --- .../io/HadoopMapReduceCommitProtocol.scala | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index e013d01e8e709..73399a33be88c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -71,16 +71,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil class HadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { - def this(jobId: String, path: String) = - this(jobId, path, getStagingDir(path, jobId)) - - def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) - import FileCommitProtocol._ /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @@ -111,6 +104,22 @@ class HadoopMapReduceCommitProtocol( */ @transient private var partitionPaths: mutable.Set[String] = null + /** + * The staging directory of this write job. Spark uses it to deal with files with absolute output + * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. + */ + protected def stagingDir = getStagingDir(path, jobId) + + override def getOutputPath: Path = { + if (dynamicPartitionOverwrite) { + stagingDir + } else { + new Path(path) + } + } + + override def getWorkPath: Path = getOutputPath + protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() // If OutputFormat is Configurable, we should set conf to it. From 201b0b53f84523d316d3c310ae2dc9cf264c4b09 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:37:46 +0800 Subject: [PATCH 46/53] Update HadoopMapReduceCommitProtocol.scala --- .../spark/internal/io/HadoopMapReduceCommitProtocol.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 73399a33be88c..2f356d5231730 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -30,7 +30,6 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir import org.apache.spark.mapred.SparkHadoopMapRedUtil /** @@ -41,9 +40,6 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop - * @param stagingDir The staging directory of this write job. Spark uses it to deal with - * files with absolute output path, or writing data into partitioned directory - * with dynamicPartitionOverwrite=true * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime * dynamically. Suppose final path is /path/to/outputPath, output * path of [[FileOutputCommitter]] is an intermediate path, e.g. From 772bf6b1e57baf66ea50c1fcb477c6c3445a3267 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:39:43 +0800 Subject: [PATCH 47/53] Update InsertIntoHadoopFsRelationCommand.scala --- .../InsertIntoHadoopFsRelationCommand.scala | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 45ba105c501fa..27da50ba21c53 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -106,14 +106,11 @@ case class InsertIntoHadoopFsRelationCommand( } val jobId = java.util.UUID.randomUUID().toString - val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf, - hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId) val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = jobId, outputPath = outputPath.toString, - dynamicPartitionOverwrite = dynamicPartitionOverwrite, - stagingDir = Some(tmpLocation)) + dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { true @@ -166,14 +163,6 @@ case class InsertIntoHadoopFsRelationCommand( } } - // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files - // will be renamed from staging path to final output path during commit job - val committerOutputPath = if (dynamicPartitionOverwrite) { - tmpLocation - } else { - qualifiedOutputPath - } - val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -181,7 +170,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - committerOutputPath.toString, customPartitionLocations, outputColumns), + committer.getOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, From 866681cc7f16d3badda73675c048fe432ab4588f Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:39:51 +0800 Subject: [PATCH 48/53] Update InsertIntoHadoopFsRelationCommand.scala --- .../datasources/InsertIntoHadoopFsRelationCommand.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 27da50ba21c53..f2a8c2372b551 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} From aaae0f2bd25a1f90783ff6ee4bd2c4a91d6a2e43 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:42:52 +0800 Subject: [PATCH 49/53] update --- .../apache/spark/internal/config/package.scala | 10 ---------- .../org/apache/spark/sql/internal/SQLConf.scala | 13 +++++++++++++ .../SQLHadoopMapReduceCommitProtocol.scala | 15 ++++++--------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 24b988e6734fc..9e6cf341c197b 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -2310,16 +2310,6 @@ package object config { .stringConf .createOptional - val EXEC_STAGING_DIR = - ConfigBuilder("spark.exec.stagingDir") - .doc("The staging directory of Spark job. Spark uses it to deal with files with " + - "absolute output path, or writing data into partitioned directory when " + - "dynamic partition overwrite mode.") - .version("3.3.0") - .internal() - .stringConf - .createWithDefault(".spark-staging") - private[spark] val EXECUTOR_STATE_SYNC_MAX_ATTEMPTS = ConfigBuilder("spark.worker.executorStateSync.maxAttempts") .internal() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 42979a68d8578..b946907d1b31c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1282,6 +1282,17 @@ object SQLConf { .createWithDefault( "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol") + val EXEC_STAGING_DIR = buildConf("spark.sql.exec.stagingDir") + .doc("The staging directory of Spark job. Spark uses it to deal with files with " + + "absolute output path, or writing data into partitioned directory " + + "when dynamic partition overwrite mode is on. " + + "Default value means staging directory is under table path.") + .version("3.3.0") + .internal() + .stringConf + .checkValue(!_.isEmpty, "Should not pass an empty string as staging diretory.") + .createWithDefault(".spark-staging") + val PARALLEL_PARTITION_DISCOVERY_THRESHOLD = buildConf("spark.sql.sources.parallelPartitionDiscovery.threshold") .doc("The maximum number of paths allowed for listing files at driver side. If the number " + @@ -3966,6 +3977,8 @@ class SQLConf extends Serializable with Logging { def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS) + def stagingDir: String = getConf(SQLConf.EXEC_STAGING_DIR) + def parallelPartitionDiscoveryThreshold: Int = getConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index a6ecffdcaeb5a..9e0b2a1a00552 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir -import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol +import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol} import org.apache.spark.sql.internal.SQLConf /** @@ -33,16 +33,13 @@ import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - def this(jobId: String, path: String) = - this(jobId, path, getStagingDir(path, jobId), false) - - def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) = - this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite) + override val stagingDir: Path = + FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, + SQLConf.get.stagingDir, "spark", jobId) override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) From c838dd376baf98437b637ab60e822347c073608f Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:45:44 +0800 Subject: [PATCH 50/53] Update SQLPathHadoopMapReduceCommitProtocol.scala --- .../SQLPathHadoopMapReduceCommitProtocol.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 4690417e1e2bf..50ee8a8fec3d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -20,8 +20,10 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.{FileNameSpec, HadoopMapReduceCommitProtocol} +import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec, HadoopMapReduceCommitProtocol} +import org.apache.spark.sql.internal.SQLConf /** * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual @@ -30,12 +32,16 @@ import org.apache.spark.internal.io.{FileNameSpec, HadoopMapReduceCommitProtocol class SQLPathHadoopMapReduceCommitProtocol( jobId: String, path: String, - stagingDir: Path, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { - def sqlPathOutputCommitter(): SQLPathOutputCommitter = + override val stagingDir: Path = + FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, + SQLConf.get.stagingDir, "spark", jobId) + + // This variable only can be used after setupCommitter. + private lazy val sqlPathOutputCommitter: SQLPathOutputCommitter = committer.asInstanceOf[SQLPathOutputCommitter] override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { @@ -48,7 +54,7 @@ class SQLPathHadoopMapReduceCommitProtocol( taskContext: TaskAttemptContext, dir: Option[String], spec: FileNameSpec): String = { - val filename = getFilename(taskContext, spec) + val filename = getFilename(taskContext, spce) dir.map { d => new Path( new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString From 1f2222c7a8e541bc5c0dc90ccffba6245a2eb2f8 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:47:23 +0800 Subject: [PATCH 51/53] Update StagingInsertSuite.scala --- .../sql/sources/StagingInsertSuite.scala | 143 ++++++++++-------- 1 file changed, 80 insertions(+), 63 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala index 131758cc1f84d..42b047229bb46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala @@ -17,121 +17,138 @@ package org.apache.spark.sql.sources -import org.apache.spark.SparkConf -import org.apache.spark.internal.config.EXEC_STAGING_DIR import org.apache.spark.sql.{QueryTest, Row} -import org.apache.spark.sql.execution.datasources.SQLPathHadoopMapReduceCommitProtocol +import org.apache.spark.sql.execution.datasources.{SQLHadoopMapReduceCommitProtocol, SQLPathHadoopMapReduceCommitProtocol} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils class StagingInsertSuite extends QueryTest with SharedSparkSession { + import testImplicits._ - val stagingDir = Utils.createTempDir() + val stagingParentDir = Utils.createTempDir() - override def sparkConf: SparkConf = - super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath) + val stagingDir = stagingParentDir.getCanonicalPath + "/.spark-stagingDir" override def beforeAll(): Unit = { super.beforeAll() - stagingDir.delete() + stagingParentDir.delete() } override def afterAll(): Unit = { try { - Utils.deleteRecursively(stagingDir) + Utils.deleteRecursively(stagingParentDir) } finally { super.afterAll() } } + test("SPDI-25701: Assert staging dir work") { + withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir) { + val commitProtocol = new SQLHadoopMapReduceCommitProtocol("job-id", "dummy-path", true) + assert(commitProtocol.stagingDir.toString.contains(stagingDir)) + } + } + test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") { - withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> - SQLConf.PartitionOverwriteMode.DYNAMIC.toString) { - withTempDir { d => + // Partition Insert + Seq("static", "dynamic").foreach { mode => + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> mode, + SQLConf.EXEC_STAGING_DIR.key -> stagingDir) { + withTempDir { d => + withTable("t") { + sql( + s""" + |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) + |LOCATION '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2), (3, 4), (5, 6)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) + } + } + } + } + } + + test("SPARK-36571: Add a new commit protocol - None-partitioned insert") { + Seq("1", "2").foreach { version => + withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir, + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> version, + SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> + classOf[SQLPathHadoopMapReduceCommitProtocol].getName) { withTable("t") { sql( s""" - |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) - |LOCATION '${d.getAbsolutePath}' - """.stripMargin) + | CREATE TABLE t(c1 int, p1 int) using PARQUET + """.stripMargin) - val df = Seq((1, 2), (3, 4)).toDF("c1", "p1") + val df = + Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) + .toDF("c1", "p1").repartition(1) df.write - .partitionBy("p1") .mode("overwrite") + .format("parquet") .saveAsTable("t") checkAnswer(sql("SELECT * FROM t"), df) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil) - checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) } } } } - test("SPARK-36571: Add a new commit protocol that support writing data to staging " + - "then mv to output path") { - withTempDir { stagingDir => - withSQLConf( - "spark.exec.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir", - "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2", + test("SPARK-36571: Add a new commit protocol - Static partition insert") { + Seq("1", "2").foreach { version => + withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir, + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> version, SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> classOf[SQLPathHadoopMapReduceCommitProtocol].getName) { withTempDir { d => withTable("t") { sql( s""" - | CREATE TABLE t(c1 int, p1 int) using ORC - | LOCATION '${d.getAbsolutePath}' - """.stripMargin) - - val df = - Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) - .toDF("c1", "p1").repartition(1) - df.write - .mode("overwrite") - .format("orc") - .saveAsTable("t") - checkAnswer(sql("select * from t"), df) - } - } - withTempDir { d => - withTable("t") { - sql( - s""" - | CREATE TABLE t(c1 int, p1 int) USING ORC PARTITIONED BY(p1) + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) | LOCATION '${d.getAbsolutePath}' """.stripMargin) val df = - Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), - (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) + Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2)) .toDF("c1", "p1").repartition(1) - df.createOrReplaceTempView("view1") - sql("INSERT OVERWRITE t PARTITION(p1=2) SELECT c1 FROM view1 ") + sql("INSERT OVERWRITE t PARTITION(p1=2) SELECT c1 FROM view1") checkAnswer(sql("SELECT * FROM t WHERE p1=2"), df) } } - withTempDir { d => - withTable("t") { - sql( - s""" - | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) - | LOCATION '${d.getAbsolutePath}' + } + } + } + + test("SPARK-36571: Add a new commit protocol - Dynamic partition insert") { + Seq("1", "2").foreach { version => + withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir, + "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> version, + SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> + classOf[SQLPathHadoopMapReduceCommitProtocol].getName) { + withTable("t") { + sql( + s""" + | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1) """.stripMargin) - val df = Seq((1, 2)).toDF("c1", "p1") - df.write - .partitionBy("p1") - .mode("overwrite") - .saveAsTable("t") - checkAnswer(sql("select * from t"), df) - } + val df = Seq((1, 2), (3, 4), (5, 6), (7, 8)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("SELECT * FROM t"), df) + checkAnswer(sql("SELECT * FROM t WHERE p1 > 5"), Row(5, 6) :: Row(7, 8) :: Nil) + checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil) } } } From c3472d9031e26ae148f6a036a419958afa9c9991 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 25 Jan 2022 19:51:23 +0800 Subject: [PATCH 52/53] Update SQLPathHadoopMapReduceCommitProtocol.scala --- .../datasources/SQLPathHadoopMapReduceCommitProtocol.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 50ee8a8fec3d9..845d4895d566f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -54,10 +54,10 @@ class SQLPathHadoopMapReduceCommitProtocol( taskContext: TaskAttemptContext, dir: Option[String], spec: FileNameSpec): String = { - val filename = getFilename(taskContext, spce) + val filename = getFilename(taskContext, spec) dir.map { d => - new Path( - new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString + new Path(new Path( + sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString }.getOrElse { new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), filename).toString } From 6b44163f2ae0186fc3bd415352afb3df8f5883e3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Wed, 26 Jan 2022 09:51:16 +0800 Subject: [PATCH 53/53] Update SQLPathHadoopMapReduceCommitProtocol.scala --- .../SQLPathHadoopMapReduceCommitProtocol.scala | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala index 845d4895d566f..51ad8ae5b0ac2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala @@ -20,25 +20,17 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec, HadoopMapReduceCommitProtocol} -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.internal.io.FileNameSpec /** - * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual + * A variant of [[SQLHadoopMapReduceCommitProtocol]] that allows specifying the actual * Hadoop output committer using an option specified in SQLConf. */ class SQLPathHadoopMapReduceCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) - extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) - with Serializable with Logging { - - override val stagingDir: Path = - FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf, - SQLConf.get.stagingDir, "spark", jobId) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { // This variable only can be used after setupCommitter. private lazy val sqlPathOutputCommitter: SQLPathOutputCommitter =