From 6331e56f1a75750eebd1c8217b48e4d77773bae0 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 16:14:26 +0800
Subject: [PATCH 01/53] [36562][SQL] Add new
 NewSQLHadoopMapReduceCommitProtocol resolve conflict

---
 .../io/HadoopMapReduceCommitProtocol.scala    |   2 +-
 .../datasources/NewFileOutputCommitter.scala  | 585 ++++++++++++++++++
 .../NewSQLHadoopMapReduceCommitProtocol.scala |  66 ++
 .../sql/sources/PartitionedWriteSuite.scala   |  51 +-
 4 files changed, 702 insertions(+), 2 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index c061d617fce4b..f279d0514da66 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -73,7 +73,7 @@ class HadoopMapReduceCommitProtocol(
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
-  @transient private var committer: OutputCommitter = _
+  @transient protected var committer: OutputCommitter = _
 
   /**
    * Checks whether there are files to be committed to a valid output location.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala
new file mode 100644
index 0000000000000..1a8760f8c6f6d
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala
@@ -0,0 +1,585 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{FileNotFoundException, IOException}
+import java.net.URI
+import java.text.SimpleDateFormat
+import java.util.{Date, Locale, Random}
+
+import com.google.common.base.Preconditions
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
+import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext, TaskAttemptID}
+import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter
+import org.apache.hadoop.util.{DurationInfo, Progressable}
+
+import org.apache.spark.internal.Logging
+
+class NewFileOutputCommitter(
+    stagingDir: Path,
+    outputPath: Path,
+    context: TaskAttemptContext)
+  extends PathOutputCommitter(outputPath, context) with Logging {
+
+  val PENDING_DIR_NAME = "_temporary"
+  val SUCCEEDED_FILE_NAME = "_SUCCESS"
+  val SUCCESSFUL_JOB_OUTPUT_DIR_MARKER = "mapreduce.fileoutputcommitter.marksuccessfuljobs"
+  val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION = "mapreduce.fileoutputcommitter.algorithm.version"
+  val FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT = 2
+  val FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED = "mapreduce.fileoutputcommitter.cleanup.skipped"
+  val FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT = false
+  val FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED =
+    "mapreduce.fileoutputcommitter.cleanup-failures.ignored"
+  val FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT = false
+  val FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS = "mapreduce.fileoutputcommitter.failures.attempts"
+  val FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS_DEFAULT = 1
+  val FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED =
+    "mapreduce.fileoutputcommitter.task.cleanup.enabled"
+  val FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED_DEFAULT = false
+  private val workPath: Path = if (getOutputPath != null) {
+    Preconditions.checkNotNull(getTaskAttemptPath(context, getOutputPath),
+      "Null task attempt path in %s and output path %s", context, outputPath)
+  } else {
+    null
+  }
+  private val algorithmVersion = context.getConfiguration.getInt(
+    FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT)
+  private val skipCleanup = context.getConfiguration.getBoolean(
+    FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED, FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED_DEFAULT)
+  private val ignoreCleanupFailures = context.getConfiguration.getBoolean(
+    FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED,
+    FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT)
+
+  private class CommittedTaskFilter extends PathFilter {
+    def accept(path: Path): Boolean = !(PENDING_DIR_NAME == path.getName)
+  }
+
+  override def getOutputPath: Path =
+    outputPath.getFileSystem(context.getConfiguration).makeQualified(outputPath)
+
+  def getPendingJobAttemptsPath(): Path = {
+    getPendingJobAttemptsPath(stagingDir)
+  }
+
+  def getPendingJobAttemptsPath(out: Path): Path = {
+    new Path(out, PENDING_DIR_NAME)
+  }
+
+  private def getAppAttemptId(context: JobContext) = {
+    context.getConfiguration.getInt("mapreduce.job.application.attempt.id", 0)
+  }
+
+  def getJobAttemptPath(context: JobContext): Path = {
+    getJobAttemptPath(context, this.stagingDir)
+  }
+
+  def getJobAttemptPath(context: JobContext, out: Path): Path = {
+    getJobAttemptPath(getAppAttemptId(context), out)
+  }
+
+  protected def getJobAttemptPath(appAttemptId: Int): Path = {
+    getJobAttemptPath(appAttemptId, this.stagingDir)
+  }
+
+  private def getJobAttemptPath(appAttemptId: Int, out: Path) = {
+    new Path(getPendingJobAttemptsPath(out), String.valueOf(appAttemptId))
+  }
+
+  private def getPendingTaskAttemptsPath(context: JobContext): Path = {
+    getPendingTaskAttemptsPath(context, this.stagingDir)
+  }
+
+  private def getPendingTaskAttemptsPath(context: JobContext, out: Path) = {
+    new Path(getJobAttemptPath(context, out), "_temporary")
+  }
+
+  def getTaskAttemptPath(context: TaskAttemptContext): Path = {
+    new Path(this.getPendingTaskAttemptsPath(context), String.valueOf(context.getTaskAttemptID))
+  }
+
+  def getTaskAttemptPath(context: TaskAttemptContext, out: Path): Path = {
+    new Path(getPendingTaskAttemptsPath(context, out), String.valueOf(context.getTaskAttemptID))
+  }
+
+  def getCommittedTaskPath(context: TaskAttemptContext): Path = {
+    this.getCommittedTaskPath(getAppAttemptId(context), context)
+  }
+
+  def getCommittedTaskPath(context: TaskAttemptContext, out: Path): Path = {
+    getCommittedTaskPath(getAppAttemptId(context), context, out)
+  }
+
+  protected def getCommittedTaskPath(appAttemptId: Int, context: TaskAttemptContext) = {
+    new Path(this.getJobAttemptPath(appAttemptId),
+      String.valueOf(context.getTaskAttemptID.getTaskID))
+  }
+
+  private def getCommittedTaskPath(
+      appAttemptId: Int,
+      context: TaskAttemptContext,
+      out: Path): Path = {
+    new Path(getJobAttemptPath(appAttemptId, out),
+      String.valueOf(context.getTaskAttemptID.getTaskID.toString))
+  }
+
+  @throws[IOException]
+  private def getAllCommittedTaskPaths(context: JobContext) = {
+    val jobAttemptPath = this.getJobAttemptPath(context)
+    val fs = jobAttemptPath.getFileSystem(context.getConfiguration)
+    fs.listStatus(jobAttemptPath, new CommittedTaskFilter)
+  }
+
+  @throws[IOException]
+  override def setupJob(context: JobContext): Unit = {
+    if (hasOutputPath) {
+      val jobAttemptPath = getJobAttemptPath(context)
+      val fs = jobAttemptPath.getFileSystem(context.getConfiguration)
+      if (!fs.mkdirs(jobAttemptPath)) {
+        logError("Mkdirs failed to create " + jobAttemptPath)
+      }
+    } else {
+      logWarning("Output Path is null in setupJob()")
+    }
+  }
+
+
+  @throws[IOException]
+  override def commitJob(context: JobContext): Unit = {
+    val maxAttemptsOnFailure = if (this.isCommitJobRepeatable(context)) {
+      context.getConfiguration.getInt(
+        FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS, FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS_DEFAULT)
+    } else {
+      1
+    }
+    var attempt = 0
+    var jobCommitNotFinished = true
+
+    while (jobCommitNotFinished) {
+      try {
+        this.commitJobInternal(context)
+        jobCommitNotFinished = false
+      } catch {
+        case var6: Exception =>
+          attempt += 1
+          if (attempt >= maxAttemptsOnFailure) throw var6
+          logWarning("Exception get thrown in job commit, retry (" + attempt + ") time.", var6)
+      }
+    }
+  }
+
+  @throws[IOException]
+  protected def commitJobInternal(context: JobContext): Unit = {
+    if (!this.hasOutputPath) {
+      logWarning("Output Path is null in commitJob()")
+    } else {
+      val finalOutput = this.getOutputPath
+      val fs = finalOutput.getFileSystem(context.getConfiguration)
+      if (this.algorithmVersion == 1) {
+        val committedTaskPaths = this.getAllCommittedTaskPaths(context)
+        committedTaskPaths.foreach { path =>
+          this.mergePaths(fs, path, finalOutput, context)
+        }
+      }
+      if (this.skipCleanup) {
+        logInfo("Skip cleanup the _temporary folders under job's output directory in commitJob.")
+      } else {
+        try this.cleanupJob(context)
+        catch {
+          case var8: IOException =>
+            if (!this.ignoreCleanupFailures) throw var8
+            logError("Error in cleanup job, manually cleanup is needed.", var8)
+        }
+      }
+      if (context.getConfiguration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
+        val markerPath = new Path(this.outputPath, "_SUCCESS")
+        if (this.isCommitJobRepeatable(context)) {
+          fs.create(markerPath, true).close()
+        } else {
+          fs.create(markerPath).close()
+        }
+      }
+    }
+  }
+
+  @throws[IOException]
+  private def mergePaths(fs: FileSystem, from: FileStatus, to: Path, context: JobContext): Unit = {
+    val d: DurationInfo =
+      new DurationInfo(log, false, "Merging data from %s to %s", Array[AnyRef](from, to))
+    var var6: Throwable = null
+    try {
+      this.reportProgress(context)
+      var toStat: FileStatus = null
+      try {
+        toStat = fs.getFileStatus(to)
+      } catch {
+        case _: FileNotFoundException =>
+          toStat = null
+      }
+      if (from.isFile) {
+        if (toStat != null && !fs.delete(to, true)) {
+          throw new IOException("Failed to delete " + to)
+        }
+        if (!fs.rename(from.getPath, to)) {
+          throw new IOException("Failed to rename " + from + " to " + to)
+        }
+      } else {
+        if (from.isDirectory) {
+          if (toStat != null) {
+            if (!toStat.isDirectory) {
+              if (!fs.delete(to, true)) {
+                throw new IOException("Failed to delete " + to)
+              }
+              this.renameOrMerge(fs, from, to, context)
+            } else {
+              val var8: Array[FileStatus] = fs.listStatus(from.getPath)
+              val var9: Int = var8.length
+              for (var10 <- 0 until var9) {
+                val subFrom: FileStatus = var8(var10)
+                val subTo: Path = new Path(to, subFrom.getPath.getName)
+                this.mergePaths(fs, subFrom, subTo, context)
+              }
+            }
+          } else {
+            this.renameOrMerge(fs, from, to, context)
+          }
+        }
+      }
+    } catch {
+      case var22: Throwable =>
+        var6 = var22
+        throw var22
+    } finally {
+      if (d != null) {
+        if (var6 != null) {
+          try d.close()
+          catch {
+            case var20: Throwable =>
+              var6.addSuppressed(var20)
+          }
+        }
+        else {
+          d.close()
+        }
+      }
+    }
+  }
+
+  private def reportProgress(context: JobContext): Unit = {
+    if (context.isInstanceOf[Progressable]) {
+      (context.asInstanceOf[Progressable]).progress()
+    }
+  }
+
+  @throws[IOException]
+  private def renameOrMerge(
+      fs: FileSystem,
+      from: FileStatus,
+      to: Path,
+      context: JobContext): Unit = {
+    if (this.algorithmVersion == 1) {
+      if (!(fs.rename(from.getPath, to))) {
+        throw new IOException("Failed to rename " + from + " to " + to)
+      }
+    }
+    else {
+      fs.mkdirs(to)
+      val var5: Array[FileStatus] = fs.listStatus(from.getPath)
+      val var6: Int = var5.length
+      for (var7 <- 0 until var6) {
+        val subFrom: FileStatus = var5(var7)
+        val subTo: Path = new Path(to, subFrom.getPath.getName)
+        this.mergePaths(fs, subFrom, subTo, context)
+      }
+    }
+  }
+
+  /** @deprecated */
+  @deprecated
+  @throws[IOException]
+  override def cleanupJob(context: JobContext): Unit = {
+    if (this.hasOutputPath) {
+      val pendingJobAttemptsPath: Path = this.getPendingJobAttemptsPath
+      val fs: FileSystem = pendingJobAttemptsPath.getFileSystem(context.getConfiguration)
+      try fs.delete(pendingJobAttemptsPath, true)
+      catch {
+        case var5: FileNotFoundException =>
+          if (!(this.isCommitJobRepeatable(context))) {
+            throw var5
+          }
+      }
+    } else {
+      logWarning("Output Path is null in cleanupJob()")
+    }
+  }
+
+  @throws[IOException]
+  override def abortJob(context: JobContext, state: JobStatus.State): Unit = {
+    this.cleanupJob(context)
+  }
+
+  @throws[IOException]
+  override def setupTask(context: TaskAttemptContext): Unit = {
+  }
+
+  @throws[IOException]
+  override def commitTask(context: TaskAttemptContext): Unit = {
+    this.commitTask(context, null.asInstanceOf[Path])
+  }
+
+  @throws[IOException]
+  def commitTask(context: TaskAttemptContext, taskAttemptPath: Path): Unit = {
+    val attemptId: TaskAttemptID = context.getTaskAttemptID
+    var currentTaskAttemptPath = taskAttemptPath
+    if (this.hasOutputPath) {
+      context.progress()
+      if (currentTaskAttemptPath == null) {
+        currentTaskAttemptPath = this.getTaskAttemptPath(context)
+      }
+      val fs: FileSystem = currentTaskAttemptPath.getFileSystem(context.getConfiguration)
+      var taskAttemptDirStatus: FileStatus = null
+      try {
+        taskAttemptDirStatus = fs.getFileStatus(currentTaskAttemptPath)
+      } catch {
+        case _: FileNotFoundException =>
+          taskAttemptDirStatus = null
+      }
+      if (taskAttemptDirStatus != null) {
+        if (this.algorithmVersion == 1) {
+          val committedTaskPath = this.getCommittedTaskPath(context)
+          if (fs.exists(committedTaskPath) && !fs.delete(committedTaskPath, true)) {
+            throw new IOException("Could not delete " + committedTaskPath)
+          }
+          if (!fs.rename(currentTaskAttemptPath, committedTaskPath)) {
+            throw new IOException(
+              "Could not rename " + currentTaskAttemptPath + " to " + committedTaskPath)
+          }
+          logInfo("Saved output of task '" + attemptId + "' to " + committedTaskPath)
+        } else {
+          this.mergePaths(fs, taskAttemptDirStatus, this.outputPath, context)
+          logInfo("Saved output of task '" + attemptId + "' to " + this.outputPath)
+          if (context.getConfiguration.getBoolean(
+            "mapreduce.fileoutputcommitter.task.cleanup.enabled", false)) {
+            logDebug(String.format(
+              "Deleting the temporary directory of '%s': '%s'", attemptId, currentTaskAttemptPath))
+            if (!fs.delete(currentTaskAttemptPath, true)) {
+              logWarning("Could not delete " + currentTaskAttemptPath)
+            }
+          }
+        }
+      } else {
+        logWarning("No Output found for " + attemptId)
+      }
+    } else {
+      logWarning("Output Path is null in commitTask()")
+    }
+  }
+
+  @throws[IOException]
+  override def abortTask(context: TaskAttemptContext): Unit = {
+    this.abortTask(context, null.asInstanceOf[Path])
+  }
+
+  @throws[IOException]
+  def abortTask(context: TaskAttemptContext, taskAttemptPath: Path): Unit = {
+    var currentTaskAttemptPath = taskAttemptPath
+    if (this.hasOutputPath) {
+      context.progress()
+      if (currentTaskAttemptPath == null) {
+        currentTaskAttemptPath = this.getTaskAttemptPath(context)
+      }
+      val fs: FileSystem = currentTaskAttemptPath.getFileSystem(context.getConfiguration)
+      if (!fs.delete(currentTaskAttemptPath, true)) {
+        logWarning("Could not delete " + currentTaskAttemptPath)
+      }
+    } else {
+      logWarning("Output Path is null in abortTask()")
+    }
+  }
+
+  @throws[IOException]
+  override def needsTaskCommit(context: TaskAttemptContext): Boolean = {
+    this.needsTaskCommit(context, null.asInstanceOf[Path])
+  }
+
+  @throws[IOException]
+  def needsTaskCommit(context: TaskAttemptContext, taskAttemptPath: Path): Boolean = {
+    var currentTaskAttemptPath = taskAttemptPath
+    if (this.hasOutputPath) {
+      if (currentTaskAttemptPath == null) {
+        currentTaskAttemptPath = this.getTaskAttemptPath(context)
+      }
+      val fs: FileSystem = currentTaskAttemptPath.getFileSystem(context.getConfiguration)
+      fs.exists(currentTaskAttemptPath)
+    } else {
+      false
+    }
+  }
+
+  @deprecated override def isRecoverySupported: Boolean = {
+    true
+  }
+
+  @throws[IOException]
+  override def isCommitJobRepeatable(context: JobContext): Boolean = {
+    this.algorithmVersion == 2
+  }
+
+  @throws[IOException]
+  override def recoverTask(context: TaskAttemptContext): Unit = {
+    if (this.hasOutputPath) {
+      context.progress()
+      val attemptId: TaskAttemptID = context.getTaskAttemptID
+      val previousAttempt: Int = getAppAttemptId(context) - 1
+      if (previousAttempt < 0) {
+        throw new IOException("Cannot recover task output for first attempt...")
+      }
+      val previousCommittedTaskPath: Path = this.getCommittedTaskPath(previousAttempt, context)
+      val fs: FileSystem = previousCommittedTaskPath.getFileSystem(context.getConfiguration)
+      if (log.isDebugEnabled) {
+        logDebug("Trying to recover task from " + previousCommittedTaskPath)
+      }
+      if (this.algorithmVersion == 1) {
+        if (fs.exists(previousCommittedTaskPath)) {
+          val committedTaskPath: Path = this.getCommittedTaskPath(context)
+          if (!fs.delete(committedTaskPath, true) && fs.exists(committedTaskPath)) {
+            throw new IOException("Could not delete " + committedTaskPath)
+          }
+          val committedParent: Path = committedTaskPath.getParent
+          fs.mkdirs(committedParent)
+          if (!fs.rename(previousCommittedTaskPath, committedTaskPath)) {
+            throw new IOException(
+              "Could not rename " + previousCommittedTaskPath + " to " + committedTaskPath)
+          }
+        }
+        else {
+          logWarning(attemptId + " had no output to recover.")
+        }
+      }
+      else {
+        try {
+          val from: FileStatus = fs.getFileStatus(previousCommittedTaskPath)
+          logInfo("Recovering task for upgrading scenario, moving files from " +
+            previousCommittedTaskPath + " to " + this.outputPath)
+          this.mergePaths(fs, from, this.outputPath, context)
+        } catch {
+          case var8: FileNotFoundException =>
+
+        }
+        logInfo("Done recovering task " + attemptId)
+      }
+    }
+    else {
+      logWarning("Output Path is null in recoverTask()")
+    }
+  }
+
+  override def getWorkPath: Path = workPath
+
+  override def toString: String = {
+    val sb: StringBuilder = new StringBuilder("FileOutputCommitter{")
+    sb.append(super.toString).append("; ")
+    sb.append("outputPath=").append(this.outputPath)
+    sb.append(", workPath=").append(this.workPath)
+    sb.append(", algorithmVersion=").append(this.algorithmVersion)
+    sb.append(", skipCleanup=").append(this.skipCleanup)
+    sb.append(", ignoreCleanupFailures=").append(this.ignoreCleanupFailures)
+    sb.append('}')
+    sb.toString
+  }
+}
+
+object NewFileOutputCommitter extends Logging {
+  def newVersionExternalTempPath(
+      jobId: String,
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String): Path = {
+    val extURI = path.toUri
+    if (extURI.getScheme == "viewfs") {
+      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, jobId)
+    } else {
+      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, jobId), "-ext-10000")
+    }
+  }
+
+
+  private def getExtTmpPathRelTo(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      jobId: String): Path = {
+    new Path(getStagingDir(path, hadoopConf, stagingDir, jobId), "-ext-10000") // Hive uses 10000
+  }
+
+  private def getExternalScratchDir(
+      extURI: URI,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      jobId: String): Path = {
+      getStagingDir(
+      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
+      hadoopConf,
+      stagingDir,
+      jobId: String)
+  }
+
+  private def getStagingDir(
+      inputPath: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      jobId: String): Path = {
+    val inputPathName: String = inputPath.toString
+    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
+    var stagingPathName: String =
+      if (inputPathName.indexOf(stagingDir) == -1) {
+        new Path(inputPathName, stagingDir).toString
+      } else {
+        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
+      }
+
+    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
+    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
+    // under the table directory.
+    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
+      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
+      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
+        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
+        "directory.")
+      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+    }
+
+    val dir = fs.makeQualified(
+      new Path(stagingPathName + "_" + executionId + "-" + jobId))
+    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
+    dir
+  }
+
+  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
+  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
+    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
+    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
+    path1.startsWith(path2)
+  }
+
+  private def executionId: String = {
+    val rand: Random = new Random
+    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
+    "spark_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
new file mode 100644
index 0000000000000..9416bcd728d73
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
+ * Hadoop output committer using an option specified in SQLConf.
+ */
+class NewSQLHadoopMapReduceCommitProtocol(
+    jobId: String,
+    path: String,
+    dynamicPartitionOverwrite: Boolean = false)
+  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
+    with Serializable with Logging {
+
+  private val committerStagingDir = NewFileOutputCommitter.newVersionExternalTempPath(
+    jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf,
+    SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir"))
+
+  def currentCommitter: NewFileOutputCommitter =
+    committer.asInstanceOf[NewFileOutputCommitter]
+
+  override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
+    // The specified output committer is a FileOutputCommitter.
+    // So, we will use the FileOutputCommitter-specified constructor.
+    val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path)
+    val committer =
+      new NewFileOutputCommitter(committerStagingDir, committerOutputPath, context)
+    logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
+    committer
+  }
+
+  override def newTaskTempFile(
+    taskContext: TaskAttemptContext,
+    dir: Option[String],
+    ext: String): String = {
+    val filename = getFilename(taskContext, ext)
+    dir.map { d =>
+      new Path(new Path(currentCommitter.getTaskAttemptPath(taskContext), d), filename).toString
+    }.getOrElse {
+      new Path(currentCommitter.getTaskAttemptPath(taskContext), filename).toString
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index b9266429f81a5..862e14e3996af 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol
+import org.apache.spark.sql.execution.datasources.{NewSQLHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -189,6 +189,55 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-36571") {
+    withTempDir { stagingDir =>
+      withSQLConf(
+        "spark.sql.source.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir",
+        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2",
+        SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+          classOf[NewSQLHadoopMapReduceCommitProtocol].getName) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | create table t(c1 int, p1 int) using ORC
+                 | location '${d.getAbsolutePath}'
+            """.stripMargin)
+
+            val df =
+              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
+                .toDF("c1", "p1").repartition(1)
+            df.write
+              .mode("overwrite")
+              .format("orc")
+              .saveAsTable("t")
+            checkAnswer(sql("select * from t"), df)
+          }
+
+          withTable("t") {
+            sql(
+              s"""
+                 | create table t(c1 int, p1 int) using ORC PARTITIONED BY(p1)
+                 | location '${d.getAbsolutePath}'
+            """.stripMargin)
+
+            val df =
+              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
+                .toDF("c1", "p1").repartition(1)
+
+            df.createOrReplaceTempView("view1")
+            sql("insert overwrite t partition(p1=2) select c1 from view1 ")
+            checkAnswer(sql("select * from t"), df)
+          }
+        }
+      }
+    }
+  }
 }
 
 /**

From 4d80c24253fcc8257b97bc44ff6517a003efa816 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 16:21:29 +0800
Subject: [PATCH 02/53] Update PartitionedWriteSuite.scala

---
 .../org/apache/spark/sql/sources/PartitionedWriteSuite.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 862e14e3996af..af94c1a01bd9d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -216,7 +216,8 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
               .saveAsTable("t")
             checkAnswer(sql("select * from t"), df)
           }
-
+        }
+        withTempDir { d =>
           withTable("t") {
             sql(
               s"""
@@ -232,7 +233,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 
             df.createOrReplaceTempView("view1")
             sql("insert overwrite t partition(p1=2) select c1 from view1 ")
-            checkAnswer(sql("select * from t"), df)
+            checkAnswer(sql("select * from t where p1=2"), df)
           }
         }
       }

From b692e0f2681c3e1e184a46e6dd3f188231488a52 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 16:28:51 +0800
Subject: [PATCH 03/53] Update PartitionedWriteSuite.scala

---
 .../sql/sources/PartitionedWriteSuite.scala     | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index af94c1a01bd9d..186b0e355a700 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -234,6 +234,23 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
             df.createOrReplaceTempView("view1")
             sql("insert overwrite t partition(p1=2) select c1 from view1 ")
             checkAnswer(sql("select * from t where p1=2"), df)
+            sql("desc formatted t partition(p1=2)").show(100, 2000)
+          }
+        }
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | create table t(c1 int, p1 int) using parquet partitioned by (p1)
+                 | location '${d.getAbsolutePath}'
+            """.stripMargin)
+
+            val df = Seq((1, 2)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("select * from t"), df)
           }
         }
       }

From e6dbae1b9f59b9fe3f29554522fb84d8f20afd84 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 16:45:49 +0800
Subject: [PATCH 04/53] Update NewSQLHadoopMapReduceCommitProtocol.scala

---
 .../datasources/NewSQLHadoopMapReduceCommitProtocol.scala       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
index 9416bcd728d73..1e71c21ece04c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
@@ -45,7 +45,7 @@ class NewSQLHadoopMapReduceCommitProtocol(
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     // The specified output committer is a FileOutputCommitter.
     // So, we will use the FileOutputCommitter-specified constructor.
-    val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path)
+    val committerOutputPath = new Path(path)
     val committer =
       new NewFileOutputCommitter(committerStagingDir, committerOutputPath, context)
     logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")

From 6419573557fb65501af60bd4c8e038afbfd4d549 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 17:34:50 +0800
Subject: [PATCH 05/53] Update NewSQLHadoopMapReduceCommitProtocol.scala

---
 .../datasources/NewSQLHadoopMapReduceCommitProtocol.scala        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
index 1e71c21ece04c..45159e7b85c09 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 import org.apache.spark.sql.SparkSession

From 44d1d8f8bae102ce647bf2ecb885d33c218b0e04 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 18:00:52 +0800
Subject: [PATCH 06/53] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index 2ca50878485c0..afeb116649f85 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -60,7 +60,7 @@ class PathOutputCommitProtocol(
   }
 
   /** The committer created. */
-  @transient private var committer: PathOutputCommitter = _
+  @transient protected var committer: PathOutputCommitter = _
 
   require(dest != null, "Null destination specified")
 

From e2c5318ecd5fc9d17b86bbaaf230a2dbe858bdaf Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 18:28:07 +0800
Subject: [PATCH 07/53] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index afeb116649f85..0d181c42a1d8e 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -60,7 +60,7 @@ class PathOutputCommitProtocol(
   }
 
   /** The committer created. */
-  @transient protected var committer: PathOutputCommitter = _
+  @transient override protected var committer: PathOutputCommitter = _
 
   require(dest != null, "Null destination specified")
 

From 9106d18615957f23ef4d16f6af6f9cf33598dcb3 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 24 Aug 2021 19:23:34 +0800
Subject: [PATCH 08/53] update

---
 .../spark/internal/io/HadoopMapReduceCommitProtocol.scala     | 4 +++-
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala    | 2 +-
 .../datasources/NewSQLHadoopMapReduceCommitProtocol.scala     | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index f279d0514da66..a7baf74e51818 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -73,7 +73,7 @@ class HadoopMapReduceCommitProtocol(
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
-  @transient protected var committer: OutputCommitter = _
+  @transient private var committer: OutputCommitter = _
 
   /**
    * Checks whether there are files to be committed to a valid output location.
@@ -106,6 +106,8 @@ class HadoopMapReduceCommitProtocol(
    */
   protected def stagingDir = getStagingDir(path, jobId)
 
+  protected def getCommitter = committer
+
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
     // If OutputFormat is Configurable, we should set conf to it.
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index 0d181c42a1d8e..2ca50878485c0 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -60,7 +60,7 @@ class PathOutputCommitProtocol(
   }
 
   /** The committer created. */
-  @transient override protected var committer: PathOutputCommitter = _
+  @transient private var committer: PathOutputCommitter = _
 
   require(dest != null, "Null destination specified")
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
index 45159e7b85c09..acb4415604488 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
@@ -41,7 +41,7 @@ class NewSQLHadoopMapReduceCommitProtocol(
     SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir"))
 
   def currentCommitter: NewFileOutputCommitter =
-    committer.asInstanceOf[NewFileOutputCommitter]
+    getCommitter.asInstanceOf[NewFileOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     // The specified output committer is a FileOutputCommitter.

From 2031f5bea5c83800b28190bee40895da7c1d25a0 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 11:12:01 +0800
Subject: [PATCH 09/53] [SPARK-36579][SQL] Make spark source stagingDir can use
 user defined

---
 .../apache/spark/sql/internal/SQLConf.scala   |  12 ++
 .../SQLHadoopMapReduceCommitProtocol.scala    |   7 ++
 .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ++++++++++++++++++
 .../sql/sources/PartitionedWriteSuite.scala   |  27 ++++
 .../sql/hive/execution/SaveAsHiveFile.scala   |  94 +-------------
 5 files changed, 166 insertions(+), 89 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index c6d9558d53eda..d8dbc36a4f9cc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1216,6 +1216,16 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
+  val FILE_COMMIT_STAGING_DIR =
+    buildConf("spark.sql.source.stagingDir")
+      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
+        "absolute output path, or writing data into partitioned directory when " +
+        "dynamic partition overwrite mode.")
+      .version("3.3.0")
+      .internal()
+      .stringConf
+      .createWithDefault(".spark-staging")
+
   // The output committer class used by data sources. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
   val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass")
@@ -3824,6 +3834,8 @@ class SQLConf extends Serializable with Logging {
   def partitionColumnTypeInferenceEnabled: Boolean =
     getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE)
 
+  def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR)
+
   def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS)
 
   def parallelPartitionDiscoveryThreshold: Int =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 144be2316f091..37a401d842abd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -23,7 +23,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -36,6 +38,11 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
+  override val stagingDir: Path =
+    SQLFileCommitProtocolUtils.newVersionExternalTempPath(
+      new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(),
+      SQLConf.get.fileCommitStagingDir, "spark", jobId)
+
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
new file mode 100644
index 0000000000000..1b2bfb42f0e35
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.spark.sql.util
+
+import java.net.URI
+import java.text.SimpleDateFormat
+import java.util.{Date, Locale, Random}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+import org.apache.spark.internal.Logging
+
+object SQLFileCommitProtocolUtils extends Logging {
+
+  def newVersionExternalTempPath(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val extURI = path.toUri
+    if (extURI.getScheme == "viewfs") {
+      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
+    } else {
+      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
+        "-ext-10000")
+    }
+  }
+
+
+  private def getExtTmpPathRelTo(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
+      "-ext-10000") // Hive uses 10000
+  }
+
+  private def getExternalScratchDir(
+      extURI: URI,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    getStagingDir(
+      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
+      hadoopConf,
+      stagingDir,
+      engineType,
+      jobId)
+  }
+
+  private def getStagingDir(
+      inputPath: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val inputPathName: String = inputPath.toString
+    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
+    var stagingPathName: String =
+      if (inputPathName.indexOf(stagingDir) == -1) {
+        new Path(inputPathName, stagingDir).toString
+      } else {
+        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
+      }
+
+    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
+    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
+    // under the table directory.
+    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
+      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
+      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
+        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
+        "directory.")
+      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+    }
+
+    val dir = fs.makeQualified(
+      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
+    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
+    dir
+  }
+
+  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
+    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
+    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
+    path1.startsWith(path2)
+  }
+
+  def executionId(engineType: String): String = {
+    val rand: Random = new Random
+    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
+    s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index b9266429f81a5..4751767bfd239 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -189,6 +189,33 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
+    withTempDir { stagingDir =>
+      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
+        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
+        SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+                 | LOCATION '${d.getAbsolutePath}'
+               """.stripMargin)
+
+            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("SELECT * FROM t"), df)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
+          }
+        }
+      }
+    }
+  }
 }
 
 /**
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index ec189344f4fa5..581c97ebcbfdd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -19,8 +19,7 @@ package org.apache.spark.sql.hive.execution
 
 import java.io.IOException
 import java.net.URI
-import java.text.SimpleDateFormat
-import java.util.{Date, Locale, Random}
+import java.util.Locale
 
 import scala.util.control.NonFatal
 
@@ -40,6 +39,7 @@ import org.apache.spark.sql.execution.datasources.FileFormatWriter
 import org.apache.spark.sql.hive.HiveExternalCatalog
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive.client.HiveVersion
+import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._
 
 // Base trait from which all hive insert statement physical execution extends.
 private[hive] trait SaveAsHiveFile extends DataWritingCommand {
@@ -129,7 +129,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      newVersionExternalTempPath(path, hadoopConf, stagingDir)
+      newVersionExternalTempPath(
+        path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
     } else {
       throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion)
     }
@@ -159,7 +160,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
       hadoopConf: Configuration,
       scratchDir: String): Path = {
     val extURI: URI = path.toUri
-    val scratchPath = new Path(scratchDir, executionId)
+    val scratchPath = new Path(scratchDir, executionId("hive"))
     var dirPath = new Path(
       extURI.getScheme,
       extURI.getAuthority,
@@ -180,90 +181,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     }
     dirPath
   }
-
-  // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2
-  private def newVersionExternalTempPath(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val extURI: URI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000")
-    }
-  }
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir)
-  }
-
-  private[hive] def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir: Path =
-      fs.makeQualified(
-        new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    try {
-      if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) {
-        throw new IllegalStateException("Cannot create staging directory  '" + dir.toString + "'")
-      }
-      createdTempDir = Some(dir)
-      fs.deleteOnExit(dir)
-    } catch {
-      case e: IOException =>
-        throw QueryExecutionErrors.cannotCreateStagingDirError(
-          s"'${dir.toString}': ${e.getMessage}", e)
-    }
-    dir
-  }
-
-  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  private def executionId: String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
 }
 

From c29f55ed00c7b50aa86a7d86b499739d7d377c96 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 14:27:08 +0800
Subject: [PATCH 10/53] Update SQLHadoopMapReduceCommitProtocol.scala

---
 .../datasources/SQLHadoopMapReduceCommitProtocol.scala        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 37a401d842abd..a6f86256de2c3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
+import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
-import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 
@@ -40,7 +40,7 @@ class SQLHadoopMapReduceCommitProtocol(
 
   override val stagingDir: Path =
     SQLFileCommitProtocolUtils.newVersionExternalTempPath(
-      new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(),
+      new Path(path), SparkContext.getActive.get.hadoopConfiguration,
       SQLConf.get.fileCommitStagingDir, "spark", jobId)
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {

From 2604c9f2c95f50e90f77c300442344816baea3d3 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 15:07:15 +0800
Subject: [PATCH 11/53] Update

---
 .../spark/internal/io/FileCommitProtocol.scala      |  5 +++++
 .../internal/io/HadoopMapReduceCommitProtocol.scala |  8 ++++++++
 .../InsertIntoHadoopFsRelationCommand.scala         | 13 ++-----------
 .../streaming/ManifestFileCommitProtocol.scala      |  2 ++
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 5cd7397ea358f..b968a51d53d34 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -50,6 +50,11 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
+  /**
+   * The output path of this committer.
+   */
+  def outputPath: Path
+
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index c061d617fce4b..2434ffa91d7a3 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -160,6 +160,14 @@ class HadoopMapReduceCommitProtocol(
     f"part-$split%05d-$jobId$ext"
   }
 
+  override def outputPath: Path = {
+    if (dynamicPartitionOverwrite) {
+      stagingDir
+    } else {
+      new Path(path)
+    }
+  }
+
   override def setupJob(jobContext: JobContext): Unit = {
     // Setup IDs
     val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 267b360b474ca..7edb78471e0ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand(
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
-      outputPath = outputPath.toString,
+      outputPath = qualifiedOutputPath.toString,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
@@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
-      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
-      // will be renamed from staging path to final output path during commit job
-      val committerOutputPath = if (dynamicPartitionOverwrite) {
-        FileCommitProtocol.getStagingDir(outputPath.toString, jobId)
-          .makeQualified(fs.getUri, fs.getWorkingDirectory)
-      } else {
-        qualifiedOutputPath
-      }
-
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committerOutputPath.toString, customPartitionLocations, outputColumns),
+            committer.outputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 46ce33687890d..30709266f1806 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
+  override def outputPath: Path = new Path(path)
+
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]

From 71f6b17c91b92a85913f816085290a0212e46939 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 19:27:56 +0800
Subject: [PATCH 12/53] fix ut

---
 .../sql/util/SQLFileCommitProtocolUtils.scala |  2 +-
 .../spark/sql/sources/InsertSuite.scala       |  2 +-
 .../sql/hive/execution/SaveAsHiveFile.scala   |  4 +++-
 .../apache/spark/sql/hive/InsertSuite.scala   | 21 ++++++++++++-------
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
index 1b2bfb42f0e35..0d5455a411c2d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
@@ -69,7 +69,7 @@ object SQLFileCommitProtocolUtils extends Logging {
       jobId)
   }
 
-  private def getStagingDir(
+  def getStagingDir(
       inputPath: Path,
       hadoopConf: Configuration,
       stagingDir: String,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index d3c21032d0b32..ddcbfe44fbef8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -1073,6 +1073,6 @@ class RenameFromSparkStagingToFinalDirAlwaysTurnsFalseFilesystem extends RawLoca
   }
 
   private def isSparkStagingDir(path: Path): Boolean = {
-    path.toString.contains(".spark-staging-")
+    path.toString.contains(".spark-staging_")
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 581c97ebcbfdd..12a9ed3d4d311 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -129,8 +129,10 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      newVersionExternalTempPath(
+      val externalTempPath = newVersionExternalTempPath(
         path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
+      createdTempDir = Some(externalTempPath.getParent)
+      externalTempPath
     } else {
       throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion)
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
index b715f484fa02a..307d30d88296d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -21,16 +21,17 @@ import java.io.File
 
 import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.ql.exec.TaskRunner
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
-import org.apache.spark.sql.hive.execution.InsertIntoHiveTable
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 import org.apache.spark.util.Utils
 
 case class TestData(key: Int, value: String)
@@ -556,25 +557,29 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
     val conf = spark.sessionState.newHadoopConf()
     val inputPath = new Path("/tmp/b/c")
     var stagingDir = "tmp/b"
-    val saveHiveFile = InsertIntoHiveTable(null, Map.empty, null, false, false, null)
-    val getStagingDir = PrivateMethod[Path](Symbol("getStagingDir"))
-    var path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    val id = TaskRunner.getTaskRunnerID
+    var path = SQLFileCommitProtocolUtils.
+      getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b_hive_") != -1)
 
     stagingDir = "tmp/b/c"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = "d/e"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = ".d/e"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1)
 
     stagingDir = "/tmp/c/"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/c_hive_") != -1)
   }
 

From 30113d28dc4a0a619e4bd732b360c84542eb25b3 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 13:38:01 +0800
Subject: [PATCH 13/53] Update SaveAsHiveFile.scala

---
 .../sql/hive/execution/SaveAsHiveFile.scala   | 85 -------------------
 1 file changed, 85 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 298fadcb01fac..e669f9cfcab2f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -189,90 +189,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     }
     dirPath
   }
-
-  // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2
-  private def newVersionExternalTempPath(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val extURI: URI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path, hadoopConf, stagingDir)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000")
-    }
-  }
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir)
-  }
-
-  private[hive] def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir: Path =
-      fs.makeQualified(
-        new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    try {
-      if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) {
-        throw new IllegalStateException("Cannot create staging directory  '" + dir.toString + "'")
-      }
-      createdTempDir = Some(dir)
-      fs.deleteOnExit(dir)
-    } catch {
-      case e: IOException =>
-        throw QueryExecutionErrors.cannotCreateStagingDirError(
-          s"'${dir.toString}': ${e.getMessage}", e)
-    }
-    dir
-  }
-
-  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  private def executionId: String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
 }
 

From 6f405dc89d507d36da03d6a97d9aa268926f7bfc Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 14:15:46 +0800
Subject: [PATCH 14/53] update

---
 .../spark/internal/config/package.scala       |  10 ++
 .../internal/io/FileCommitProtocol.scala      | 103 ++++++++++++++--
 .../io/HadoopMapReduceCommitProtocol.scala    |  11 +-
 .../apache/spark/sql/internal/SQLConf.scala   |  12 --
 .../InsertIntoHadoopFsRelationCommand.scala   |  13 +-
 .../SQLHadoopMapReduceCommitProtocol.scala    |   7 --
 .../ManifestFileCommitProtocol.scala          |   2 -
 .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ------------------
 .../sql/sources/PartitionedWriteSuite.scala   |   3 +-
 .../sql/hive/execution/SaveAsHiveFile.scala   |   5 +-
 .../apache/spark/sql/hive/InsertSuite.scala   |  12 +-
 11 files changed, 129 insertions(+), 164 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index 4b7ba9b19f4de..8e24105d7742b 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -2258,4 +2258,14 @@ package object config {
       .version("3.2.0")
       .stringConf
       .createOptional
+
+  val EXEC_STAGING_DIR =
+    ConfigBuilder("spark.exec.stagingDir")
+      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
+        "absolute output path, or writing data into partitioned directory when " +
+        "dynamic partition overwrite mode.")
+      .version("3.3.0")
+      .internal()
+      .stringConf
+      .createWithDefault(".spark-staging")
 }
diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index b968a51d53d34..958a9fa8aa034 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -17,11 +17,18 @@
 
 package org.apache.spark.internal.io
 
+import java.net.URI
+import java.text.SimpleDateFormat
+import java.util.{Date, Locale, Random}
+
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
 
+import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Unstable
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.util.Utils
 
 
@@ -50,11 +57,6 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
-  /**
-   * The output path of this committer.
-   */
-  def outputPath: Path
-
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
@@ -232,8 +234,95 @@ object FileCommitProtocol extends Logging {
     }
   }
 
-  def getStagingDir(path: String, jobId: String): Path = {
-    new Path(path, ".spark-staging-" + jobId)
+  def getStagingDir(path: String, jobId: String, conf: Configuration): Path = {
+    newVersionExternalTempPath(
+      new Path(path), conf,
+      SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId)
+  }
+
+
+  def newVersionExternalTempPath(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val extURI = path.toUri
+    if (extURI.getScheme == "viewfs") {
+      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
+    } else {
+      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
+        "-ext-10000")
+    }
+  }
+
+
+  private def getExtTmpPathRelTo(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
+      "-ext-10000") // Hive uses 10000
+  }
+
+  private def getExternalScratchDir(
+      extURI: URI,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    getStagingDir(
+      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
+      hadoopConf,
+      stagingDir,
+      engineType,
+      jobId)
+  }
+
+  def getStagingDir(
+      inputPath: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val inputPathName: String = inputPath.toString
+    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
+    var stagingPathName: String =
+      if (inputPathName.indexOf(stagingDir) == -1) {
+        new Path(inputPathName, stagingDir).toString
+      } else {
+        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
+      }
+
+    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
+    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
+    // under the table directory.
+    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
+      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
+      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
+        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
+        "directory.")
+      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+    }
+
+    val dir = fs.makeQualified(
+      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
+    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
+    dir
+  }
+
+  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
+    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
+    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
+    path1.startsWith(path2)
+  }
+
+  def executionId(engineType: String): String = {
+    val rand: Random = new Random
+    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
+    s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
   }
 }
 
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index e1c0367e46b98..9796d805be35f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -104,7 +104,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected def stagingDir = getStagingDir(path, jobId)
+  protected var stagingDir: Path = _
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
@@ -113,6 +113,7 @@ class HadoopMapReduceCommitProtocol(
       case c: Configurable => c.setConf(context.getConfiguration)
       case _ => ()
     }
+    stagingDir = getStagingDir(path, jobId, context.getConfiguration)
     format.getOutputCommitter(context)
   }
 
@@ -170,14 +171,6 @@ class HadoopMapReduceCommitProtocol(
     f"${spec.prefix}part-$split%05d-$jobId${spec.suffix}"
   }
 
-  override def outputPath: Path = {
-    if (dynamicPartitionOverwrite) {
-      stagingDir
-    } else {
-      new Path(path)
-    }
-  }
-
   override def setupJob(jobContext: JobContext): Unit = {
     // Setup IDs
     val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 0defec5becd7e..98aad1c9c83f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1219,16 +1219,6 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
-  val FILE_COMMIT_STAGING_DIR =
-    buildConf("spark.sql.source.stagingDir")
-      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
-        "absolute output path, or writing data into partitioned directory when " +
-        "dynamic partition overwrite mode.")
-      .version("3.3.0")
-      .internal()
-      .stringConf
-      .createWithDefault(".spark-staging")
-
   // The output committer class used by data sources. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
   val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass")
@@ -3842,8 +3832,6 @@ class SQLConf extends Serializable with Logging {
   def partitionColumnTypeInferenceEnabled: Boolean =
     getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE)
 
-  def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR)
-
   def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS)
 
   def parallelPartitionDiscoveryThreshold: Int =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 7edb78471e0ff..f1a0e0dedbddd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand(
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
-      outputPath = qualifiedOutputPath.toString,
+      outputPath = outputPath.toString,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
@@ -162,6 +162,15 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
+      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
+      // will be renamed from staging path to final output path during commit job
+      val committerOutputPath = if (dynamicPartitionOverwrite) {
+        FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf)
+          .makeQualified(fs.getUri, fs.getWorkingDirectory)
+      } else {
+        qualifiedOutputPath
+      }
+
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -169,7 +178,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committer.outputPath.toString, customPartitionLocations, outputColumns),
+            committerOutputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index a6f86256de2c3..144be2316f091 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -21,11 +21,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
-import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -38,11 +36,6 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  override val stagingDir: Path =
-    SQLFileCommitProtocolUtils.newVersionExternalTempPath(
-      new Path(path), SparkContext.getActive.get.hadoopConfiguration,
-      SQLConf.get.fileCommitStagingDir, "spark", jobId)
-
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 30709266f1806..46ce33687890d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,8 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
-  override def outputPath: Path = new Path(path)
-
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
deleted file mode 100644
index 0d5455a411c2d..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.apache.spark.sql.util
-
-import java.net.URI
-import java.text.SimpleDateFormat
-import java.util.{Date, Locale, Random}
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-
-import org.apache.spark.internal.Logging
-
-object SQLFileCommitProtocolUtils extends Logging {
-
-  def newVersionExternalTempPath(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    val extURI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
-        "-ext-10000")
-    }
-  }
-
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
-      "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir,
-      engineType,
-      jobId)
-  }
-
-  def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir = fs.makeQualified(
-      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    dir
-  }
-
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  def executionId(engineType: String): String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 4751767bfd239..86894945591da 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
 import org.apache.spark.TestUtils
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
@@ -194,7 +195,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
     withTempDir { stagingDir =>
       withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
         SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
-        SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
+        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
         withTempDir { d =>
           withTable("t") {
             sql(
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index e669f9cfcab2f..8c6c10fee7935 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -40,7 +40,6 @@ import org.apache.spark.sql.execution.datasources.{BucketingUtils, FileFormatWri
 import org.apache.spark.sql.hive.HiveExternalCatalog
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive.client.HiveVersion
-import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._
 
 // Base trait from which all hive insert statement physical execution extends.
 private[hive] trait SaveAsHiveFile extends DataWritingCommand {
@@ -135,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      val externalTempPath = newVersionExternalTempPath(
+      val externalTempPath = FileCommitProtocol.newVersionExternalTempPath(
         path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
       createdTempDir = Some(externalTempPath.getParent)
       externalTempPath
@@ -168,7 +167,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
       hadoopConf: Configuration,
       scratchDir: String): Path = {
     val extURI: URI = path.toUri
-    val scratchPath = new Path(scratchDir, executionId("hive"))
+    val scratchPath = new Path(scratchDir, FileCommitProtocol.executionId("hive"))
     var dirPath = new Path(
       extURI.getScheme,
       extURI.getAuthority,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
index f0355eaf18b68..1dfeedcd9a18b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -26,13 +26,13 @@ import org.apache.hadoop.hive.ql.exec.TaskRunner
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
 import org.apache.spark.SparkException
+import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 import org.apache.spark.util.Utils
 
 case class TestData(key: Int, value: String)
@@ -543,27 +543,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
     val inputPath = new Path("/tmp/b/c")
     var stagingDir = "tmp/b"
     val id = TaskRunner.getTaskRunnerID
-    var path = SQLFileCommitProtocolUtils.
+    var path = FileCommitProtocol.
       getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b_hive_") != -1)
 
     stagingDir = "tmp/b/c"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = "d/e"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = ".d/e"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1)
 
     stagingDir = "/tmp/c/"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/c_hive_") != -1)
   }

From 361263b9b11e5668286268fc306b399b8760c48a Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 14:43:35 +0800
Subject: [PATCH 15/53] update

---
 .../spark/internal/io/FileCommitProtocol.scala      |  9 ++++++---
 .../internal/io/HadoopMapReduceCommitProtocol.scala | 13 +++++++++++--
 .../InsertIntoHadoopFsRelationCommand.scala         | 11 +----------
 .../streaming/ManifestFileCommitProtocol.scala      |  2 ++
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 958a9fa8aa034..73b7fd62a25b6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
 
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Unstable
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.EXEC_STAGING_DIR
@@ -57,6 +56,11 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
+  /**
+   * The output path of this committer.
+   */
+  def outputPath: Path
+
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
@@ -236,8 +240,7 @@ object FileCommitProtocol extends Logging {
 
   def getStagingDir(path: String, jobId: String, conf: Configuration): Path = {
     newVersionExternalTempPath(
-      new Path(path), conf,
-      SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId)
+      new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
   }
 
 
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 9796d805be35f..86d087cdb5cc2 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
+import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
@@ -104,7 +105,8 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected var stagingDir: Path = _
+  protected var stagingDir: Path =
+    getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
@@ -113,10 +115,17 @@ class HadoopMapReduceCommitProtocol(
       case c: Configurable => c.setConf(context.getConfiguration)
       case _ => ()
     }
-    stagingDir = getStagingDir(path, jobId, context.getConfiguration)
     format.getOutputCommitter(context)
   }
 
+  override def outputPath: Path = {
+    if (dynamicPartitionOverwrite) {
+      stagingDir
+    } else {
+      new Path(path)
+    }
+  }
+
   override def newTaskTempFile(
       taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
     newTaskTempFile(taskContext, dir, FileNameSpec("", ext))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index f1a0e0dedbddd..382c1e3c16384 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
-      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
-      // will be renamed from staging path to final output path during commit job
-      val committerOutputPath = if (dynamicPartitionOverwrite) {
-        FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf)
-          .makeQualified(fs.getUri, fs.getWorkingDirectory)
-      } else {
-        qualifiedOutputPath
-      }
-
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committerOutputPath.toString, customPartitionLocations, outputColumns),
+            committer.outputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 46ce33687890d..30709266f1806 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
+  override def outputPath: Path = new Path(path)
+
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]

From 9ee6ee545c1c6490afaec2febbaf68cb3888586e Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 14:55:26 +0800
Subject: [PATCH 16/53] update

---
 .../sql/sources/PartitionedWriteSuite.scala   | 28 -------
 .../sql/sources/StagingInsertSuite.scala      | 74 +++++++++++++++++++
 2 files changed, 74 insertions(+), 28 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 86894945591da..b9266429f81a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -25,7 +25,6 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
 import org.apache.spark.TestUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
@@ -190,33 +189,6 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
-
-  test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
-    withTempDir { stagingDir =>
-      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
-        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
-        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
-                 | LOCATION '${d.getAbsolutePath}'
-               """.stripMargin)
-
-            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
-            df.write
-              .partitionBy("p1")
-              .mode("overwrite")
-              .saveAsTable("t")
-            checkAnswer(sql("SELECT * FROM t"), df)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
-          }
-        }
-      }
-    }
-  }
 }
 
 /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
new file mode 100644
index 0000000000000..72c6534dc8751
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
+import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.util.Utils
+
+class StagingInsertSuite extends QueryTest with SharedSparkSession {
+  import testImplicits._
+
+  val stagingDir = Utils.createTempDir()
+
+  override def sparkConf: SparkConf =
+    super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath)
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    stagingDir.delete()
+  }
+
+  override def afterAll(): Unit = {
+    try {
+      Utils.deleteRecursively(stagingDir)
+    } finally {
+      super.afterAll()
+    }
+  }
+
+  test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
+    withTempDir { stagingDir =>
+      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
+        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
+        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+                 | LOCATION '${d.getAbsolutePath}'
+               """.stripMargin)
+
+            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("SELECT * FROM t"), df)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
+          }
+        }
+      }
+    }
+  }
+}

From 7773fb24bc5071b76b97b5ebbe8befc7c80d1e71 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 15:24:13 +0800
Subject: [PATCH 17/53] update

---
 .../internal/io/FileCommitProtocol.scala      |  9 +++--
 .../io/HadoopMapReduceCommitProtocol.scala    |  6 +--
 ...FileCommitProtocolInstantiationSuite.scala | 12 +++---
 .../InsertIntoHadoopFsRelationCommand.scala   |  1 +
 .../SQLHadoopMapReduceCommitProtocol.scala    |  4 +-
 .../execution/datasources/v2/FileWrite.scala  |  3 +-
 .../execution/streaming/FileStreamSink.scala  |  3 +-
 .../execution/metric/SQLMetricsSuite.scala    |  4 +-
 .../sql/sources/PartitionedWriteSuite.scala   |  6 ++-
 .../sql/sources/StagingInsertSuite.scala      | 37 +++++++++----------
 .../sql/hive/execution/SaveAsHiveFile.scala   |  3 +-
 11 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 73b7fd62a25b6..d6f7370c9a582 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -215,6 +215,7 @@ object FileCommitProtocol extends Logging {
       className: String,
       jobId: String,
       outputPath: String,
+      conf: Configuration = new Configuration(),
       dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
@@ -224,9 +225,11 @@ object FileCommitProtocol extends Logging {
     // dynamicPartitionOverwrite: Boolean).
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
-      val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean])
-      logDebug("Using (String, String, Boolean) constructor")
-      ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+      val ctor = clazz.getDeclaredConstructor(
+        classOf[String], classOf[String], classOf[Configuration], classOf[Boolean])
+      logDebug("Using (String, String, Configuration,  Boolean) constructor")
+      ctor.newInstance(
+        jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
         logDebug("Falling back to (String, String) constructor")
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 86d087cdb5cc2..9cdfe851c89d7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -24,12 +24,12 @@ import scala.collection.mutable
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configurable
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
-import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
@@ -68,6 +68,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
+    conf: Configuration = new Configuration(),
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
@@ -105,8 +106,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected var stagingDir: Path =
-    getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration)
+  protected var stagingDir: Path = getStagingDir(path, jobId, conf)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 778f748f83950..1535ca4c6624e 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.conf.Configuration
+
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -58,8 +60,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[Other].getCanonicalName,
         "job",
-        "path",
-        false)
+        "path")
     }
   }
 
@@ -68,8 +69,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[NoMatchingArgs].getCanonicalName,
         "job",
-        "path",
-        false)
+        "path")
     }
   }
 
@@ -83,6 +83,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[ClassicConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
+      new Configuration(),
       dynamic).asInstanceOf[ClassicConstructorCommitProtocol]
   }
 
@@ -97,6 +98,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[FullConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
+      new Configuration(),
       dynamic).asInstanceOf[FullConstructorCommitProtocol]
   }
 
@@ -121,7 +123,7 @@ private class FullConstructorCommitProtocol(
   arg2: String,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) {
 
   def this(arg1: String, arg2: String) = {
     this(arg1, arg2, false, 2)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 382c1e3c16384..fff93a579a628 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -109,6 +109,7 @@ case class InsertIntoHadoopFsRelationCommand(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
       outputPath = outputPath.toString,
+      hadoopConf,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 144be2316f091..dde68006402d7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
@@ -32,8 +33,9 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
+    conf: Configuration,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
index ccc467aae1f15..d822b1bb6ef12 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
@@ -61,7 +61,8 @@ trait FileWrite extends Write {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = paths.head)
+      outputPath = paths.head,
+      hadoopConf)
     lazy val description =
       createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
index 5058a1dfc3baf..8db2ce027fdfd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
@@ -150,7 +150,8 @@ class FileStreamSink(
       val committer = FileCommitProtocol.instantiate(
         className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
         jobId = batchId.toString,
-        outputPath = path)
+        outputPath = path,
+        sparkSession.sessionState.newHadoopConf())
 
       committer match {
         case manifestCommitter: ManifestFileCommitProtocol =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 32428fbde0016..9638924b394db 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,6 +22,7 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -834,7 +835,8 @@ case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(
+    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index b9266429f81a5..df41b93f45fad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.sources
 import java.io.File
 import java.sql.Timestamp
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
@@ -35,7 +36,7 @@ import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
 private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration())
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -199,7 +200,8 @@ private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(
+    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
index 72c6534dc8751..162cef0cca3f8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -46,27 +46,24 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession {
   }
 
   test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
-    withTempDir { stagingDir =>
-      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
-        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
-        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
-                 | LOCATION '${d.getAbsolutePath}'
-               """.stripMargin)
+    withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
+      SQLConf.PartitionOverwriteMode.DYNAMIC.toString) {
+      withTempDir { d =>
+        withTable("t") {
+          sql(
+            s"""
+               |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+               |LOCATION '${d.getAbsolutePath}'
+             """.stripMargin)
 
-            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
-            df.write
-              .partitionBy("p1")
-              .mode("overwrite")
-              .saveAsTable("t")
-            checkAnswer(sql("SELECT * FROM t"), df)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
-          }
+          val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+          df.write
+            .partitionBy("p1")
+            .mode("overwrite")
+            .saveAsTable("t")
+          checkAnswer(sql("SELECT * FROM t"), df)
+          checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+          checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
         }
       }
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 8c6c10fee7935..734a0aa64df2a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -83,7 +83,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = outputLocation)
+      outputPath = outputLocation,
+      hadoopConf)
 
     val options = bucketSpec
       .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true"))

From a3b3c51f326fcf285eb870a2874c774a1498d145 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 15:42:17 +0800
Subject: [PATCH 18/53] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index fc5d0a0b3a7f5..d8b783c460f58 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -19,10 +19,10 @@ package org.apache.spark.internal.io.cloud
 
 import java.io.IOException
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
-
 import org.apache.spark.internal.io.FileNameSpec
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 
@@ -51,7 +51,7 @@ class PathOutputCommitProtocol(
     jobId: String,
     dest: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable {
+  extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable {
 
   if (dynamicPartitionOverwrite) {
     // until there's explicit extensions to the PathOutputCommitProtocols

From b4d60e4b431ebfcc843f826086b1d09135c823c5 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 22:29:45 +0800
Subject: [PATCH 19/53] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index d8b783c460f58..c6497fc47d16f 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
+
 import org.apache.spark.internal.io.FileNameSpec
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 

From 2c4180886d06e61304774900c45db3d45c18c39c Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 10:22:05 +0800
Subject: [PATCH 20/53] Update CommitterBindingSuite.scala

---
 .../spark/internal/io/cloud/CommitterBindingSuite.scala       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
index 546f54229ea59..13317b61276dc 100644
--- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
+++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
@@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite {
   test("local filesystem instantiation") {
     val instance = FileCommitProtocol.instantiate(
       pathCommitProtocolClassname,
-      jobId, "file:///tmp", false)
+      jobId, "file:///tmp", new Configuration(), false)
 
     val protocol = instance.asInstanceOf[PathOutputCommitProtocol]
     assert("file:///tmp" === protocol.destination)
@@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite {
     val cause = intercept[InvocationTargetException] {
       FileCommitProtocol.instantiate(
         pathCommitProtocolClassname,
-        jobId, "file:///tmp", true)
+        jobId, "file:///tmp", new Configuration(), true)
     }.getCause
     if (cause == null || !cause.isInstanceOf[IOException]
         || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) {

From 454118d3e7037b99775c53d3d473984e19ed92cf Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 14:02:22 +0800
Subject: [PATCH 21/53] update

---
 ...cala => SQLPathHadoopMapReduceCommitProtocol.scala} | 10 +++++-----
 ...putCommitter.scala => SQLPathOutputCommitter.scala} |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)
 rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{NewSQLHadoopMapReduceCommitProtocol.scala => SQLPathHadoopMapReduceCommitProtocol.scala} (90%)
 rename sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/{NewFileOutputCommitter.scala => SQLPathOutputCommitter.scala} (99%)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
similarity index 90%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index acb4415604488..2135b1ff82b8f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewSQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -29,26 +29,26 @@ import org.apache.spark.sql.internal.SQLConf
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
  * Hadoop output committer using an option specified in SQLConf.
  */
-class NewSQLHadoopMapReduceCommitProtocol(
+class SQLPathHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  private val committerStagingDir = NewFileOutputCommitter.newVersionExternalTempPath(
+  private val committerStagingDir = SQLPathOutputCommitter.newVersionExternalTempPath(
     jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf,
     SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir"))
 
-  def currentCommitter: NewFileOutputCommitter =
-    getCommitter.asInstanceOf[NewFileOutputCommitter]
+  def currentCommitter: SQLPathOutputCommitter =
+    getCommitter.asInstanceOf[SQLPathOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     // The specified output committer is a FileOutputCommitter.
     // So, we will use the FileOutputCommitter-specified constructor.
     val committerOutputPath = new Path(path)
     val committer =
-      new NewFileOutputCommitter(committerStagingDir, committerOutputPath, context)
+      new SQLPathOutputCommitter(committerStagingDir, committerOutputPath, context)
     logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
     committer
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
similarity index 99%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
index 1a8760f8c6f6d..f831594b49561 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/NewFileOutputCommitter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
@@ -31,7 +31,7 @@ import org.apache.hadoop.util.{DurationInfo, Progressable}
 
 import org.apache.spark.internal.Logging
 
-class NewFileOutputCommitter(
+class SQLPathOutputCommitter(
     stagingDir: Path,
     outputPath: Path,
     context: TaskAttemptContext)
@@ -504,7 +504,7 @@ class NewFileOutputCommitter(
   }
 }
 
-object NewFileOutputCommitter extends Logging {
+object SQLPathOutputCommitter extends Logging {
   def newVersionExternalTempPath(
       jobId: String,
       path: Path,

From 11d6d15d1fa19dd67d5ed3b0335d797e7c8fa02b Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 14:04:39 +0800
Subject: [PATCH 22/53] Update PartitionedWriteSuite.scala

---
 .../org/apache/spark/sql/sources/PartitionedWriteSuite.scala  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 186b0e355a700..ab5216b1439cf 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.execution.datasources.{NewSQLHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol}
+import org.apache.spark.sql.execution.datasources.{SQLPathHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -196,7 +196,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
         "spark.sql.source.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir",
         "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2",
         SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
-          classOf[NewSQLHadoopMapReduceCommitProtocol].getName) {
+          classOf[SQLPathHadoopMapReduceCommitProtocol].getName) {
         withTempDir { d =>
           withTable("t") {
             sql(

From 9c9826cf9988150792368a817e7b258a692e638d Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 14:20:30 +0800
Subject: [PATCH 23/53] update

---
 .../internal/io/HadoopMapReduceCommitProtocol.scala      | 4 +---
 .../SQLPathHadoopMapReduceCommitProtocol.scala           | 9 +++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index a7baf74e51818..f279d0514da66 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -73,7 +73,7 @@ class HadoopMapReduceCommitProtocol(
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
-  @transient private var committer: OutputCommitter = _
+  @transient protected var committer: OutputCommitter = _
 
   /**
    * Checks whether there are files to be committed to a valid output location.
@@ -106,8 +106,6 @@ class HadoopMapReduceCommitProtocol(
    */
   protected def stagingDir = getStagingDir(path, jobId)
 
-  protected def getCommitter = committer
-
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
     // If OutputFormat is Configurable, we should set conf to it.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 2135b1ff82b8f..930e242fd6f66 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -40,8 +40,8 @@ class SQLPathHadoopMapReduceCommitProtocol(
     jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf,
     SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir"))
 
-  def currentCommitter: SQLPathOutputCommitter =
-    getCommitter.asInstanceOf[SQLPathOutputCommitter]
+  val sqlPathOutputCommitter: SQLPathOutputCommitter =
+    committer.asInstanceOf[SQLPathOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     // The specified output committer is a FileOutputCommitter.
@@ -59,9 +59,10 @@ class SQLPathHadoopMapReduceCommitProtocol(
     ext: String): String = {
     val filename = getFilename(taskContext, ext)
     dir.map { d =>
-      new Path(new Path(currentCommitter.getTaskAttemptPath(taskContext), d), filename).toString
+      new Path(
+        new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString
     }.getOrElse {
-      new Path(currentCommitter.getTaskAttemptPath(taskContext), filename).toString
+      new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), filename).toString
     }
   }
 }

From 592682235822113a73befe3247f2b7e1bebbd81f Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:10:21 +0800
Subject: [PATCH 24/53] update

---
 .../internal/io/FileCommitProtocol.scala      | 21 +++++++++++++++++--
 .../io/HadoopMapReduceCommitProtocol.scala    |  4 +++-
 .../InsertIntoHadoopFsRelationCommand.scala   |  2 +-
 .../ManifestFileCommitProtocol.scala          |  4 +++-
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index d6f7370c9a582..73e6abcba867e 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -57,9 +57,26 @@ abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
   /**
-   * The output path of this committer.
+   * Get the final directory where work will be placed once the job
+   * is committed. This may be null, in which case, there is no output
+   * path to write data to.
    */
-  def outputPath: Path
+  def getOutputPath(): Path = null
+
+  /**
+   * Get the directory that the task should write results into.
+   * Warning: there's no guarantee that this work path is on the same
+   * FS as the final output, or that it's visible across machines.
+   * May be null.
+   */
+  def getWorkPath(): Path = null
+
+  /**
+   * Predicate: is there an output path?
+   */
+  def hasOutputPath(): Boolean = {
+    getOutputPath() != null;
+  }
 
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 9cdfe851c89d7..aa511d385b78a 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -118,7 +118,9 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
-  override def outputPath: Path = {
+  override def getOutputPath(): Path = new Path(path)
+
+  override def getWorkPath: Path = {
     if (dynamicPartitionOverwrite) {
       stagingDir
     } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index fff93a579a628..1a46b60fe111e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -170,7 +170,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committer.outputPath.toString, customPartitionLocations, outputColumns),
+            committer.getWorkPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 30709266f1806..09681f882853f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,7 +56,9 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
-  override def outputPath: Path = new Path(path)
+  override def getOutputPath: Path = new Path(path)
+
+  override def getWorkPath(): Path = new Path(path)
 
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")

From 6cdee5868655c972f3aebf9294ae0e1680501447 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:38:22 +0800
Subject: [PATCH 25/53] update

---
 .../apache/spark/internal/io/FileCommitProtocol.scala |  9 +++++----
 .../internal/io/HadoopMapRedCommitProtocol.scala      |  2 +-
 .../internal/io/HadoopMapReduceCommitProtocol.scala   |  5 ++---
 .../io/FileCommitProtocolInstantiationSuite.scala     |  6 +-----
 .../internal/io/cloud/CommitterBindingSuite.scala     |  4 ++--
 .../InsertIntoHadoopFsRelationCommand.scala           |  8 ++++++--
 .../SQLHadoopMapReduceCommitProtocol.scala            |  5 ++---
 .../sql/execution/datasources/v2/FileWrite.scala      |  3 +--
 .../sql/execution/streaming/FileStreamSink.scala      |  3 +--
 .../spark/sql/execution/metric/SQLMetricsSuite.scala  |  4 ++--
 .../spark/sql/sources/PartitionedWriteSuite.scala     | 11 +++++++----
 .../spark/sql/hive/execution/SaveAsHiveFile.scala     |  3 +--
 12 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 73e6abcba867e..7029f6f3eb1f1 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -232,8 +232,8 @@ object FileCommitProtocol extends Logging {
       className: String,
       jobId: String,
       outputPath: String,
-      conf: Configuration = new Configuration(),
-      dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = {
+      dynamicPartitionOverwrite: Boolean = false,
+      stagingDir: Option[String] = None): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
       s" dynamic=$dynamicPartitionOverwrite")
@@ -245,8 +245,9 @@ object FileCommitProtocol extends Logging {
       val ctor = clazz.getDeclaredConstructor(
         classOf[String], classOf[String], classOf[Configuration], classOf[Boolean])
       logDebug("Using (String, String, Configuration,  Boolean) constructor")
-      ctor.newInstance(
-        jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+      ctor.newInstance(jobId, outputPath,
+        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
+        dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
         logDebug("Falling back to (String, String) constructor")
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
index af0aa41518766..30c85e1570d63 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
@@ -27,7 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
  * Unlike Hadoop's OutputCommitter, this implementation is serializable.
  */
 class HadoopMapRedCommitProtocol(jobId: String, path: String)
-  extends HadoopMapReduceCommitProtocol(jobId, path) {
+  extends HadoopMapReduceCommitProtocol(jobId, path, "") {
 
   override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = {
     val config = context.getConfiguration.asInstanceOf[JobConf]
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index aa511d385b78a..20b2be17a305d 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -24,7 +24,6 @@ import scala.collection.mutable
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configurable
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
@@ -68,7 +67,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    conf: Configuration = new Configuration(),
+    stagingPath: String = "",
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
@@ -106,7 +105,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected var stagingDir: Path = getStagingDir(path, jobId, conf)
+  protected var stagingDir: Path = new Path(stagingPath)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 1535ca4c6624e..87ffe0b69a469 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.internal.io
 
-import org.apache.hadoop.conf.Configuration
-
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -83,7 +81,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[ClassicConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
-      new Configuration(),
       dynamic).asInstanceOf[ClassicConstructorCommitProtocol]
   }
 
@@ -98,7 +95,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[FullConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
-      new Configuration(),
       dynamic).asInstanceOf[FullConstructorCommitProtocol]
   }
 
@@ -123,7 +119,7 @@ private class FullConstructorCommitProtocol(
   arg2: String,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) {
 
   def this(arg1: String, arg2: String) = {
     this(arg1, arg2, false, 2)
diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
index 13317b61276dc..546f54229ea59 100644
--- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
+++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
@@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite {
   test("local filesystem instantiation") {
     val instance = FileCommitProtocol.instantiate(
       pathCommitProtocolClassname,
-      jobId, "file:///tmp", new Configuration(), false)
+      jobId, "file:///tmp", false)
 
     val protocol = instance.asInstanceOf[PathOutputCommitProtocol]
     assert("file:///tmp" === protocol.destination)
@@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite {
     val cause = intercept[InvocationTargetException] {
       FileCommitProtocol.instantiate(
         pathCommitProtocolClassname,
-        jobId, "file:///tmp", new Configuration(), true)
+        jobId, "file:///tmp", true)
     }.getCause
     if (cause == null || !cause.isInstanceOf[IOException]
         || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 1a46b60fe111e..dc6ad7eb8b95f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.hadoop.fs.{FileSystem, Path}
 
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.FileCommitProtocol
+import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
@@ -105,12 +107,14 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val jobId = java.util.UUID.randomUUID().toString
+    val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf,
+      hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
       outputPath = outputPath.toString,
-      hadoopConf,
-      dynamicPartitionOverwrite = dynamicPartitionOverwrite)
+      dynamicPartitionOverwrite = dynamicPartitionOverwrite,
+      stagingDir = Some(tmpLocation.toString))
 
     val doInsertion = if (mode == SaveMode.Append) {
       true
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index dde68006402d7..680c6d63d38ed 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
@@ -33,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    conf: Configuration,
+    stagingPath: String = "",
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
index d822b1bb6ef12..ccc467aae1f15 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
@@ -61,8 +61,7 @@ trait FileWrite extends Write {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = paths.head,
-      hadoopConf)
+      outputPath = paths.head)
     lazy val description =
       createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
index 8db2ce027fdfd..5058a1dfc3baf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
@@ -150,8 +150,7 @@ class FileStreamSink(
       val committer = FileCommitProtocol.instantiate(
         className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
         jobId = batchId.toString,
-        outputPath = path,
-        sparkSession.sessionState.newHadoopConf())
+        outputPath = path)
 
       committer match {
         case manifestCommitter: ManifestFileCommitProtocol =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 9638924b394db..46d44e3b1a95f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,7 +22,6 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -834,9 +833,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
 case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
+    jobId, path, stagingPath, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index df41b93f45fad..96ab46df1f50c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql.sources
 import java.io.File
 import java.sql.Timestamp
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
@@ -35,8 +34,11 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
-private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration())
+private class OnlyDetectCustomPathFileCommitProtocol(
+    jobId: String,
+    path: String,
+    stagingPath: String)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -199,9 +201,10 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean)
   extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
+    jobId, path, stagingPath, dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 734a0aa64df2a..8c6c10fee7935 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -83,8 +83,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = outputLocation,
-      hadoopConf)
+      outputPath = outputLocation)
 
     val options = bucketSpec
       .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true"))

From 824ec0474c5df84fa589394e0f82f7d57e65f224 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:39:50 +0800
Subject: [PATCH 26/53] Update FileCommitProtocol.scala

---
 .../scala/org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 7029f6f3eb1f1..4050edfc61481 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -243,7 +243,7 @@ object FileCommitProtocol extends Logging {
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
       val ctor = clazz.getDeclaredConstructor(
-        classOf[String], classOf[String], classOf[Configuration], classOf[Boolean])
+        classOf[String], classOf[String], classOf[String], classOf[Boolean])
       logDebug("Using (String, String, Configuration,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
         stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),

From 8c8a174814c9e38cdb790ca77975ce3cd7d64264 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:48:25 +0800
Subject: [PATCH 27/53] update

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala   | 6 ------
 .../apache/spark/sql/sources/PartitionedWriteSuite.scala    | 5 +++--
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 4050edfc61481..23f640b40af87 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -27,7 +27,6 @@ import org.apache.hadoop.mapreduce._
 
 import org.apache.spark.annotation.Unstable
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.util.Utils
 
 
@@ -259,11 +258,6 @@ object FileCommitProtocol extends Logging {
     }
   }
 
-  def getStagingDir(path: String, jobId: String, conf: Configuration): Path = {
-    newVersionExternalTempPath(
-      new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
-  }
-
 
   def newVersionExternalTempPath(
       path: Path,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 96ab46df1f50c..0fb0d85ced2b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -37,8 +37,9 @@ import org.apache.spark.util.Utils
 private class OnlyDetectCustomPathFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath)
+    stagingPath: String,
+    dynamicPartitionOverwrite: Boolean)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(

From 8d7ce6e58800791042a4a3f12c16327abbbe903b Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:51:54 +0800
Subject: [PATCH 28/53] update

---
 .../apache/spark/internal/io/HadoopMapRedCommitProtocol.scala | 4 +++-
 .../datasources/SQLHadoopMapReduceCommitProtocol.scala        | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
index 30c85e1570d63..d250601349429 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred._
 import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
 
@@ -27,7 +28,8 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
  * Unlike Hadoop's OutputCommitter, this implementation is serializable.
  */
 class HadoopMapRedCommitProtocol(jobId: String, path: String)
-  extends HadoopMapReduceCommitProtocol(jobId, path, "") {
+  extends HadoopMapReduceCommitProtocol(
+    jobId, path, new Path(path, ".spark-staging-" + jobId).toString) {
 
   override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = {
     val config = context.getConfiguration.asInstanceOf[JobConf]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 680c6d63d38ed..7ed76643a4eb2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String = "",
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
     with Serializable with Logging {

From da6a0b9ccbd06bc1f08558e74d00facf2bf487f8 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:58:41 +0800
Subject: [PATCH 29/53] update

---
 .../spark/internal/io/HadoopMapRedCommitProtocol.scala   | 4 +---
 .../internal/io/HadoopMapReduceCommitProtocol.scala      | 9 ++++++++-
 .../io/FileCommitProtocolInstantiationSuite.scala        | 2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
index d250601349429..af0aa41518766 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.internal.io
 
-import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred._
 import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
 
@@ -28,8 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
  * Unlike Hadoop's OutputCommitter, this implementation is serializable.
  */
 class HadoopMapRedCommitProtocol(jobId: String, path: String)
-  extends HadoopMapReduceCommitProtocol(
-    jobId, path, new Path(path, ".spark-staging-" + jobId).toString) {
+  extends HadoopMapReduceCommitProtocol(jobId, path) {
 
   override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = {
     val config = context.getConfiguration.asInstanceOf[JobConf]
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 20b2be17a305d..6f2b73d34c0c6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -67,10 +67,17 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String = "",
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
+  def this(jobId: String, path: String) =
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString)
+
+  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString,
+      dynamicPartitionOverwrite)
+
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 87ffe0b69a469..77f42577517e8 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -119,7 +119,7 @@ private class FullConstructorCommitProtocol(
   arg2: String,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
 
   def this(arg1: String, arg2: String) = {
     this(arg1, arg2, false, 2)

From 63e466c5ade9078489e0990acdd5c5f1dd97ac10 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 18:07:59 +0800
Subject: [PATCH 30/53] update

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala   | 2 +-
 .../internal/io/FileCommitProtocolInstantiationSuite.scala  | 6 ++++--
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala  | 3 +--
 .../apache/spark/sql/execution/metric/SQLMetricsSuite.scala | 3 +--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 23f640b40af87..daa6addc0a9f6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging {
         classOf[String], classOf[String], classOf[String], classOf[Boolean])
       logDebug("Using (String, String, Configuration,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
+        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString),
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 77f42577517e8..778f748f83950 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -58,7 +58,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[Other].getCanonicalName,
         "job",
-        "path")
+        "path",
+        false)
     }
   }
 
@@ -67,7 +68,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[NoMatchingArgs].getCanonicalName,
         "job",
-        "path")
+        "path",
+        false)
     }
   }
 
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index c6497fc47d16f..fc5d0a0b3a7f5 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -19,7 +19,6 @@ package org.apache.spark.internal.io.cloud
 
 import java.io.IOException
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
@@ -52,7 +51,7 @@ class PathOutputCommitProtocol(
     jobId: String,
     dest: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable {
+  extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable {
 
   if (dynamicPartitionOverwrite) {
     // until there's explicit extensions to the PathOutputCommitProtocols
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 46d44e3b1a95f..f90c5064b0fcc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -835,8 +835,7 @@ case class CustomFileCommitProtocol(
     path: String,
     stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, stagingPath, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))

From fd2ac5f62e28959ce4c6bd5f92718a5eba19c213 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 18:18:37 +0800
Subject: [PATCH 31/53] update

---
 .../spark/internal/io/FileCommitProtocol.scala   |  8 ++++----
 .../io/HadoopMapReduceCommitProtocol.scala       | 16 ++++++----------
 .../InsertIntoHadoopFsRelationCommand.scala      |  2 +-
 .../SQLHadoopMapReduceCommitProtocol.scala       |  4 ++--
 .../sql/execution/metric/SQLMetricsSuite.scala   |  5 +++--
 .../sql/sources/PartitionedWriteSuite.scala      |  9 ++++-----
 6 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index daa6addc0a9f6..151fcdbec17a5 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -232,7 +232,7 @@ object FileCommitProtocol extends Logging {
       jobId: String,
       outputPath: String,
       dynamicPartitionOverwrite: Boolean = false,
-      stagingDir: Option[String] = None): FileCommitProtocol = {
+      stagingDir: Option[Path] = None): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
       s" dynamic=$dynamicPartitionOverwrite")
@@ -242,10 +242,10 @@ object FileCommitProtocol extends Logging {
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
       val ctor = clazz.getDeclaredConstructor(
-        classOf[String], classOf[String], classOf[String], classOf[Boolean])
-      logDebug("Using (String, String, Configuration,  Boolean) constructor")
+        classOf[String], classOf[String], classOf[Path], classOf[Boolean])
+      logDebug("Using (String, String, Path,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString),
+        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 6f2b73d34c0c6..27e155c5733c7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -40,6 +40,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
  *
  * @param jobId the job's or stage's id
  * @param path the job's output path, or null if committer acts as a noop
+ * @param stagingDir The staging directory of this write job. Spark uses it to deal with
+ *                   files with absolute output path, or writing data into partitioned directory
+ *                   with dynamicPartitionOverwrite=true
  * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime
  *                                  dynamically. Suppose final path is /path/to/outputPath, output
  *                                  path of [[FileOutputCommitter]] is an intermediate path, e.g.
@@ -67,16 +70,15 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
   def this(jobId: String, path: String) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString)
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId))
 
   def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString,
-      dynamicPartitionOverwrite)
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite)
 
   import FileCommitProtocol._
 
@@ -108,12 +110,6 @@ class HadoopMapReduceCommitProtocol(
    */
   @transient private var partitionPaths: mutable.Set[String] = null
 
-  /**
-   * The staging directory of this write job. Spark uses it to deal with files with absolute output
-   * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
-   */
-  protected var stagingDir: Path = new Path(stagingPath)
-
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
     // If OutputFormat is Configurable, we should set conf to it.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index dc6ad7eb8b95f..6605bb6c1e870 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -114,7 +114,7 @@ case class InsertIntoHadoopFsRelationCommand(
       jobId = jobId,
       outputPath = outputPath.toString,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite,
-      stagingDir = Some(tmpLocation.toString))
+      stagingDir = Some(tmpLocation))
 
     val doInsertion = if (mode == SaveMode.Append) {
       true
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 7ed76643a4eb2..34ce32e710c48 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -32,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index f90c5064b0fcc..08c86e6edbb08 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,6 +22,7 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -833,9 +834,9 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
 case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 0fb0d85ced2b9..4fffe8f560345 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -37,9 +37,9 @@ import org.apache.spark.util.Utils
 private class OnlyDetectCustomPathFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -202,10 +202,9 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, stagingPath, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")

From e2951f128b91574e8019bc9e8f3a1bf666f73008 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 08:33:31 +0800
Subject: [PATCH 32/53] fix  UT

---
 .../spark/internal/io/FileCommitProtocol.scala      |  9 ++++++++-
 .../internal/io/HadoopMapReduceCommitProtocol.scala |  5 +++--
 .../io/FileCommitProtocolInstantiationSuite.scala   | 13 ++++++++-----
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 151fcdbec17a5..ab6dbd97f11d5 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging {
         classOf[String], classOf[String], classOf[Path], classOf[Boolean])
       logDebug("Using (String, String, Path,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
+        stagingDir.getOrElse(getStagingDir(outputPath, jobId)),
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
@@ -258,6 +258,13 @@ object FileCommitProtocol extends Logging {
     }
   }
 
+  def getStagingDir(path: String, jobId: String): Path = {
+    try {
+      new Path(path, ".spark-staging-" + jobId)
+    } catch {
+      case _: Exception => null
+    }
+  }
 
   def newVersionExternalTempPath(
       path: Path,
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 27e155c5733c7..4c8dd50c146cc 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
 /**
@@ -75,10 +76,10 @@ class HadoopMapReduceCommitProtocol(
   extends FileCommitProtocol with Serializable with Logging {
 
   def this(jobId: String, path: String) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId))
+    this(jobId, path, getStagingDir(path, jobId))
 
   def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite)
+    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
 
   import FileCommitProtocol._
 
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 778f748f83950..26f92cb669b6b 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.fs.Path
+
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -119,16 +121,17 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String)
 private class FullConstructorCommitProtocol(
   arg1: String,
   arg2: String,
+  arg3: Path,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) {
 
-  def this(arg1: String, arg2: String) = {
-    this(arg1, arg2, false, 2)
+  def this(arg1: String, arg2: String, arg3: Path) = {
+    this(arg1, arg2, arg3, false, 2)
   }
 
-  def this(arg1: String, arg2: String, b: Boolean) = {
-    this(arg1, arg2, false, 3)
+  def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = {
+    this(arg1, arg2, arg3, false, 3)
   }
 }
 

From c8d0c33b25cf06409798bdc18417887e0f1d1d1c Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 08:49:20 +0800
Subject: [PATCH 33/53] complicated

---
 .../internal/io/FileCommitProtocol.scala      | 21 +++++++++++++------
 ...FileCommitProtocolInstantiationSuite.scala | 13 +++++-------
 .../SQLHadoopMapReduceCommitProtocol.scala    |  7 +++++++
 .../sql/sources/PartitionedWriteSuite.scala   | 11 +++-------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index ab6dbd97f11d5..7c67e3441b33a 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -249,12 +249,21 @@ object FileCommitProtocol extends Logging {
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
-        logDebug("Falling back to (String, String) constructor")
-        require(!dynamicPartitionOverwrite,
-          "Dynamic Partition Overwrite is enabled but" +
-            s" the committer ${className} does not have the appropriate constructor")
-        val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
-        ctor.newInstance(jobId, outputPath)
+        try {
+          logDebug("Falling back to (String, String, Boolean) constructor")
+          val ctor = clazz.getDeclaredConstructor(
+            classOf[String], classOf[String], classOf[Boolean])
+          ctor.newInstance(jobId, outputPath,
+            dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+        } catch {
+          case _: NoSuchMethodException =>
+            logDebug("Falling back to (String, String) constructor")
+            require(!dynamicPartitionOverwrite,
+              "Dynamic Partition Overwrite is enabled but" +
+                s" the committer ${className} does not have the appropriate constructor")
+            val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
+            ctor.newInstance(jobId, outputPath)
+        }
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 26f92cb669b6b..778f748f83950 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.internal.io
 
-import org.apache.hadoop.fs.Path
-
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -121,17 +119,16 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String)
 private class FullConstructorCommitProtocol(
   arg1: String,
   arg2: String,
-  arg3: Path,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
 
-  def this(arg1: String, arg2: String, arg3: Path) = {
-    this(arg1, arg2, arg3, false, 2)
+  def this(arg1: String, arg2: String) = {
+    this(arg1, arg2, false, 2)
   }
 
-  def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = {
-    this(arg1, arg2, arg3, false, 3)
+  def this(arg1: String, arg2: String, b: Boolean) = {
+    this(arg1, arg2, false, 3)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 34ce32e710c48..a6ecffdcaeb5a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -22,6 +22,7 @@ import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 import org.apache.spark.sql.internal.SQLConf
 
@@ -37,6 +38,12 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
+  def this(jobId: String, path: String) =
+    this(jobId, path, getStagingDir(path, jobId), false)
+
+  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
+    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
+
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 4fffe8f560345..b9266429f81a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -34,12 +34,8 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
-private class OnlyDetectCustomPathFileCommitProtocol(
-    jobId: String,
-    path: String,
-    stagingDir: Path,
-    dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
+private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -202,9 +198,8 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")

From eafb8dd5b81c587280adddcd9183ee26b2e08c75 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 10:49:26 +0800
Subject: [PATCH 34/53] update

---
 .../internal/io/FileCommitProtocol.scala      | 20 ++++++++++---------
 .../io/HadoopMapReduceCommitProtocol.scala    | 18 ++++-------------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 7c67e3441b33a..b32a3979a8d5f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -21,6 +21,8 @@ import java.net.URI
 import java.text.SimpleDateFormat
 import java.util.{Date, Locale, Random}
 
+import scala.util.Try
+
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
@@ -71,7 +73,12 @@ abstract class FileCommitProtocol extends Logging {
   def getWorkPath(): Path = null
 
   /**
-   * Predicate: is there an output path?
+   * Checks whether there are files to be committed to a valid output location.
+   *
+   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
+   * it is necessary to check whether a valid output path is specified.
+   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
+   * committers not writing to distributed file systems.
    */
   def hasOutputPath(): Boolean = {
     getOutputPath() != null;
@@ -268,11 +275,7 @@ object FileCommitProtocol extends Logging {
   }
 
   def getStagingDir(path: String, jobId: String): Path = {
-    try {
-      new Path(path, ".spark-staging-" + jobId)
-    } catch {
-      case _: Exception => null
-    }
+    Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null)
   }
 
   def newVersionExternalTempPath(
@@ -290,15 +293,14 @@ object FileCommitProtocol extends Logging {
     }
   }
 
-
   private def getExtTmpPathRelTo(
       path: Path,
       hadoopConf: Configuration,
       stagingDir: String,
       engineType: String,
       jobId: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
-      "-ext-10000") // Hive uses 10000
+    // Hive uses 10000
+    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), "-ext-10000")
   }
 
   private def getExternalScratchDir(
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 4c8dd50c146cc..fff87da244c0c 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -86,16 +86,6 @@ class HadoopMapReduceCommitProtocol(
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
   @transient private var committer: OutputCommitter = _
 
-  /**
-   * Checks whether there are files to be committed to a valid output location.
-   *
-   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
-   * it is necessary to check whether a valid output path is specified.
-   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
-   * committers not writing to distributed file systems.
-   */
-  private val hasValidPath = Try { new Path(path) }.isSuccess
-
   /**
    * Tracks files staged by this task for absolute output paths. These outputs are not managed by
    * the Hadoop OutputCommitter, so we must move these to their final locations on job commit.
@@ -121,13 +111,13 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
-  override def getOutputPath(): Path = new Path(path)
+  override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null)
 
   override def getWorkPath: Path = {
     if (dynamicPartitionOverwrite) {
       stagingDir
     } else {
-      new Path(path)
+      getOutputPath()
     }
   }
 
@@ -206,7 +196,7 @@ class HadoopMapReduceCommitProtocol(
   override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
     committer.commitJob(jobContext)
 
-    if (hasValidPath) {
+    if (hasOutputPath()) {
       val (allAbsPathFiles, allPartitionPaths) =
         taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip
       val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
@@ -270,7 +260,7 @@ class HadoopMapReduceCommitProtocol(
         logWarning(s"Exception while aborting ${jobContext.getJobID}", e)
     }
     try {
-      if (hasValidPath) {
+      if (hasOutputPath()) {
         val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
         fs.delete(stagingDir, true)
       }

From ff2bfb89ea40c2626b430aa6c06614e6a9228503 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 11:30:10 +0800
Subject: [PATCH 35/53] revert API change

---
 .../internal/io/FileCommitProtocol.scala      | 27 -------------------
 .../io/HadoopMapReduceCommitProtocol.scala    | 24 ++++++++---------
 .../InsertIntoHadoopFsRelationCommand.scala   | 10 ++++++-
 .../ManifestFileCommitProtocol.scala          |  4 ---
 .../execution/metric/SQLMetricsSuite.scala    |  4 +--
 5 files changed, 22 insertions(+), 47 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index b32a3979a8d5f..f2f26297e02c7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -57,33 +57,6 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
-  /**
-   * Get the final directory where work will be placed once the job
-   * is committed. This may be null, in which case, there is no output
-   * path to write data to.
-   */
-  def getOutputPath(): Path = null
-
-  /**
-   * Get the directory that the task should write results into.
-   * Warning: there's no guarantee that this work path is on the same
-   * FS as the final output, or that it's visible across machines.
-   * May be null.
-   */
-  def getWorkPath(): Path = null
-
-  /**
-   * Checks whether there are files to be committed to a valid output location.
-   *
-   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
-   * it is necessary to check whether a valid output path is specified.
-   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
-   * committers not writing to distributed file systems.
-   */
-  def hasOutputPath(): Boolean = {
-    getOutputPath() != null;
-  }
-
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index fff87da244c0c..35bad427253ed 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -86,6 +86,16 @@ class HadoopMapReduceCommitProtocol(
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
   @transient private var committer: OutputCommitter = _
 
+  /**
+   * Checks whether there are files to be committed to a valid output location.
+   *
+   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
+   * it is necessary to check whether a valid output path is specified.
+   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
+   * committers not writing to distributed file systems.
+   */
+  private val hasValidPath = Try { new Path(path) }.isSuccess
+
   /**
    * Tracks files staged by this task for absolute output paths. These outputs are not managed by
    * the Hadoop OutputCommitter, so we must move these to their final locations on job commit.
@@ -111,16 +121,6 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
-  override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null)
-
-  override def getWorkPath: Path = {
-    if (dynamicPartitionOverwrite) {
-      stagingDir
-    } else {
-      getOutputPath()
-    }
-  }
-
   override def newTaskTempFile(
       taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
     newTaskTempFile(taskContext, dir, FileNameSpec("", ext))
@@ -196,7 +196,7 @@ class HadoopMapReduceCommitProtocol(
   override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
     committer.commitJob(jobContext)
 
-    if (hasOutputPath()) {
+    if (hasValidPath) {
       val (allAbsPathFiles, allPartitionPaths) =
         taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip
       val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
@@ -260,7 +260,7 @@ class HadoopMapReduceCommitProtocol(
         logWarning(s"Exception while aborting ${jobContext.getJobID}", e)
     }
     try {
-      if (hasOutputPath()) {
+      if (hasValidPath) {
         val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
         fs.delete(stagingDir, true)
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 6605bb6c1e870..e0461766097c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -167,6 +167,14 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
+      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
+      // will be renamed from staging path to final output path during commit job
+      val committerOutputPath = if (dynamicPartitionOverwrite) {
+        tmpLocation
+      } else {
+        qualifiedOutputPath
+      }
+
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -174,7 +182,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committer.getWorkPath.toString, customPartitionLocations, outputColumns),
+            committerOutputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 09681f882853f..46ce33687890d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,10 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
-  override def getOutputPath: Path = new Path(path)
-
-  override def getWorkPath(): Path = new Path(path)
-
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 08c86e6edbb08..32428fbde0016 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,7 +22,6 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
-import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -834,9 +833,8 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
 case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))

From c5a1d16d55591edfc539490d193a77eb38a80ea4 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 11:36:11 +0800
Subject: [PATCH 36/53] update

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala      | 2 +-
 .../datasources/InsertIntoHadoopFsRelationCommand.scala        | 3 +--
 .../org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala   | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index f2f26297e02c7..c421c44414b7f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -251,7 +251,7 @@ object FileCommitProtocol extends Logging {
     Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null)
   }
 
-  def newVersionExternalTempPath(
+  def externalTempPath(
       path: Path,
       hadoopConf: Configuration,
       stagingDir: String,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index e0461766097c6..45ba105c501fa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -21,7 +21,6 @@ import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.FileCommitProtocol
-import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
@@ -107,7 +106,7 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val jobId = java.util.UUID.randomUUID().toString
-    val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf,
+    val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf,
       hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 8c6c10fee7935..c3e9deacd7e80 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -134,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      val externalTempPath = FileCommitProtocol.newVersionExternalTempPath(
+      val externalTempPath = FileCommitProtocol.externalTempPath(
         path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
       createdTempDir = Some(externalTempPath.getParent)
       externalTempPath

From ac468fa3b872e93b7124c265da6131f2206b7310 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 16:05:20 +0800
Subject: [PATCH 37/53] update

---
 ...SQLPathHadoopMapReduceCommitProtocol.scala |  25 ++--
 .../datasources/SQLPathOutputCommitter.scala  | 114 +++---------------
 2 files changed, 25 insertions(+), 114 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 930e242fd6f66..823cba3c07d7e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -21,9 +21,7 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.internal.io.{FileNameSpec, HadoopMapReduceCommitProtocol}
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -32,32 +30,25 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLPathHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  private val committerStagingDir = SQLPathOutputCommitter.newVersionExternalTempPath(
-    jobId, new Path(path), SparkSession.getActiveSession.get.sharedState.hadoopConf,
-    SQLConf.get.getConfString("spark.sql.source.stagingDir", ".stagingDir"))
-
   val sqlPathOutputCommitter: SQLPathOutputCommitter =
     committer.asInstanceOf[SQLPathOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
-    // The specified output committer is a FileOutputCommitter.
-    // So, we will use the FileOutputCommitter-specified constructor.
-    val committerOutputPath = new Path(path)
-    val committer =
-      new SQLPathOutputCommitter(committerStagingDir, committerOutputPath, context)
+    val committer = new SQLPathOutputCommitter(stagingDir, new Path(path), context)
     logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
     committer
   }
 
   override def newTaskTempFile(
-    taskContext: TaskAttemptContext,
-    dir: Option[String],
-    ext: String): String = {
-    val filename = getFilename(taskContext, ext)
+      taskContext: TaskAttemptContext,
+      dir: Option[String],
+      spec: FileNameSpec): String = {
+    val filename = getFilename(taskContext, spec)
     dir.map { d =>
       new Path(
         new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
index f831594b49561..61c4e947e969b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
@@ -199,10 +199,13 @@ class SQLPathOutputCommitter(
       if (this.skipCleanup) {
         logInfo("Skip cleanup the _temporary folders under job's output directory in commitJob.")
       } else {
-        try this.cleanupJob(context)
-        catch {
+        try {
+          this.cleanupJob(context)
+        } catch {
           case var8: IOException =>
-            if (!this.ignoreCleanupFailures) throw var8
+            if (!this.ignoreCleanupFailures) {
+              throw var8
+            }
             logError("Error in cleanup job, manually cleanup is needed.", var8)
         }
       }
@@ -267,13 +270,13 @@ class SQLPathOutputCommitter(
     } finally {
       if (d != null) {
         if (var6 != null) {
-          try d.close()
-          catch {
+          try {
+            d.close()
+          } catch {
             case var20: Throwable =>
               var6.addSuppressed(var20)
           }
-        }
-        else {
+        } else {
           d.close()
         }
       }
@@ -296,8 +299,7 @@ class SQLPathOutputCommitter(
       if (!(fs.rename(from.getPath, to))) {
         throw new IOException("Failed to rename " + from + " to " + to)
       }
-    }
-    else {
+    } else {
       fs.mkdirs(to)
       val var5: Array[FileStatus] = fs.listStatus(from.getPath)
       val var6: Int = var5.length
@@ -316,8 +318,9 @@ class SQLPathOutputCommitter(
     if (this.hasOutputPath) {
       val pendingJobAttemptsPath: Path = this.getPendingJobAttemptsPath
       val fs: FileSystem = pendingJobAttemptsPath.getFileSystem(context.getConfiguration)
-      try fs.delete(pendingJobAttemptsPath, true)
-      catch {
+      try {
+        fs.delete(pendingJobAttemptsPath, true)
+      } catch {
         case var5: FileNotFoundException =>
           if (!(this.isCommitJobRepeatable(context))) {
             throw var5
@@ -466,12 +469,10 @@ class SQLPathOutputCommitter(
             throw new IOException(
               "Could not rename " + previousCommittedTaskPath + " to " + committedTaskPath)
           }
-        }
-        else {
+        } else {
           logWarning(attemptId + " had no output to recover.")
         }
-      }
-      else {
+      } else {
         try {
           val from: FileStatus = fs.getFileStatus(previousCommittedTaskPath)
           logInfo("Recovering task for upgrading scenario, moving files from " +
@@ -483,8 +484,7 @@ class SQLPathOutputCommitter(
         }
         logInfo("Done recovering task " + attemptId)
       }
-    }
-    else {
+    } else {
       logWarning("Output Path is null in recoverTask()")
     }
   }
@@ -503,83 +503,3 @@ class SQLPathOutputCommitter(
     sb.toString
   }
 }
-
-object SQLPathOutputCommitter extends Logging {
-  def newVersionExternalTempPath(
-      jobId: String,
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val extURI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, jobId)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, jobId), "-ext-10000")
-    }
-  }
-
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      jobId: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir, jobId), "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      jobId: String): Path = {
-      getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir,
-      jobId: String)
-  }
-
-  private def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      jobId: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir = fs.makeQualified(
-      new Path(stagingPathName + "_" + executionId + "-" + jobId))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    dir
-  }
-
-  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  private def executionId: String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    "spark_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
-}

From 09f211b4d22533bd04f124104b3ab7965e3e017f Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 16:34:16 +0800
Subject: [PATCH 38/53] update

---
 .../datasources/FileFormatDataWriter.scala    |  2 +
 ...SQLPathHadoopMapReduceCommitProtocol.scala |  3 +-
 .../datasources/SQLPathOutputCommitter.scala  |  4 --
 .../sql/sources/PartitionedWriteSuite.scala   | 69 +------------------
 .../sql/sources/StagingInsertSuite.scala      | 68 ++++++++++++++++++
 5 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
index 0b1b616bd835c..e1865d68b9c0a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
@@ -292,6 +292,8 @@ abstract class BaseDynamicPartitionDataWriter(
       committer.newTaskTempFile(taskAttemptContext, partDir, fileNameSpec)
     }
 
+    println(s"write current path = ${currentPath}")
+
     currentWriter = description.outputWriterFactory.newInstance(
       path = currentPath,
       dataSchema = description.dataColumns.toStructType,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 823cba3c07d7e..86ad89f920693 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -35,10 +35,11 @@ class SQLPathHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  val sqlPathOutputCommitter: SQLPathOutputCommitter =
+  def sqlPathOutputCommitter(): SQLPathOutputCommitter =
     committer.asInstanceOf[SQLPathOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
+    println(s"output path = ${path}")
     val committer = new SQLPathOutputCommitter(stagingDir, new Path(path), context)
     logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
     committer
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
index 61c4e947e969b..22cd72e353f96 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathOutputCommitter.scala
@@ -18,12 +18,8 @@
 package org.apache.spark.sql.execution.datasources
 
 import java.io.{FileNotFoundException, IOException}
-import java.net.URI
-import java.text.SimpleDateFormat
-import java.util.{Date, Locale, Random}
 
 import com.google.common.base.Preconditions
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
 import org.apache.hadoop.mapreduce.{JobContext, JobStatus, TaskAttemptContext, TaskAttemptID}
 import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index ab5216b1439cf..b9266429f81a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.execution.datasources.{SQLPathHadoopMapReduceCommitProtocol, SQLHadoopMapReduceCommitProtocol}
+import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -189,73 +189,6 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
-
-  test("SPARK-36571") {
-    withTempDir { stagingDir =>
-      withSQLConf(
-        "spark.sql.source.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir",
-        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2",
-        SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
-          classOf[SQLPathHadoopMapReduceCommitProtocol].getName) {
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | create table t(c1 int, p1 int) using ORC
-                 | location '${d.getAbsolutePath}'
-            """.stripMargin)
-
-            val df =
-              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
-                .toDF("c1", "p1").repartition(1)
-            df.write
-              .mode("overwrite")
-              .format("orc")
-              .saveAsTable("t")
-            checkAnswer(sql("select * from t"), df)
-          }
-        }
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | create table t(c1 int, p1 int) using ORC PARTITIONED BY(p1)
-                 | location '${d.getAbsolutePath}'
-            """.stripMargin)
-
-            val df =
-              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
-                .toDF("c1", "p1").repartition(1)
-
-            df.createOrReplaceTempView("view1")
-            sql("insert overwrite t partition(p1=2) select c1 from view1 ")
-            checkAnswer(sql("select * from t where p1=2"), df)
-            sql("desc formatted t partition(p1=2)").show(100, 2000)
-          }
-        }
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | create table t(c1 int, p1 int) using parquet partitioned by (p1)
-                 | location '${d.getAbsolutePath}'
-            """.stripMargin)
-
-            val df = Seq((1, 2)).toDF("c1", "p1")
-            df.write
-              .partitionBy("p1")
-              .mode("overwrite")
-              .saveAsTable("t")
-            checkAnswer(sql("select * from t"), df)
-          }
-        }
-      }
-    }
-  }
 }
 
 /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
index 162cef0cca3f8..131758cc1f84d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.sources
 import org.apache.spark.SparkConf
 import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.sql.execution.datasources.SQLPathHadoopMapReduceCommitProtocol
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
@@ -68,4 +69,71 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-36571: Add a new commit protocol that support writing data to staging " +
+    "then mv to output path") {
+    withTempDir { stagingDir =>
+      withSQLConf(
+        "spark.exec.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir",
+        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2",
+        SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+          classOf[SQLPathHadoopMapReduceCommitProtocol].getName) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) using ORC
+                 | LOCATION '${d.getAbsolutePath}'
+            """.stripMargin)
+
+            val df =
+              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
+                .toDF("c1", "p1").repartition(1)
+            df.write
+              .mode("overwrite")
+              .format("orc")
+              .saveAsTable("t")
+            checkAnswer(sql("select * from t"), df)
+          }
+        }
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) USING ORC PARTITIONED BY(p1)
+                 | LOCATION '${d.getAbsolutePath}'
+            """.stripMargin)
+
+            val df =
+              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
+                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
+                .toDF("c1", "p1").repartition(1)
+
+            df.createOrReplaceTempView("view1")
+            sql("INSERT OVERWRITE t PARTITION(p1=2) SELECT c1 FROM view1 ")
+            checkAnswer(sql("SELECT * FROM t WHERE p1=2"), df)
+          }
+        }
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+                 | LOCATION '${d.getAbsolutePath}'
+            """.stripMargin)
+
+            val df = Seq((1, 2)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("select * from t"), df)
+          }
+        }
+      }
+    }
+  }
 }

From f1f12c39c27ba85a8fd99967b618ce19e06ab39b Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 16:43:28 +0800
Subject: [PATCH 39/53] update

---
 .../spark/sql/execution/datasources/FileFormatDataWriter.scala  | 2 --
 .../datasources/SQLPathHadoopMapReduceCommitProtocol.scala      | 1 -
 2 files changed, 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
index e1865d68b9c0a..0b1b616bd835c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala
@@ -292,8 +292,6 @@ abstract class BaseDynamicPartitionDataWriter(
       committer.newTaskTempFile(taskAttemptContext, partDir, fileNameSpec)
     }
 
-    println(s"write current path = ${currentPath}")
-
     currentWriter = description.outputWriterFactory.newInstance(
       path = currentPath,
       dataSchema = description.dataColumns.toStructType,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 86ad89f920693..4690417e1e2bf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -39,7 +39,6 @@ class SQLPathHadoopMapReduceCommitProtocol(
     committer.asInstanceOf[SQLPathOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
-    println(s"output path = ${path}")
     val committer = new SQLPathOutputCommitter(stagingDir, new Path(path), context)
     logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}")
     committer

From 023787a7914ef6c06c36622ee7e843025e8b4c8c Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 17:42:49 +0800
Subject: [PATCH 40/53] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index fc5d0a0b3a7f5..cbdc5c3438842 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -61,7 +61,7 @@ class PathOutputCommitProtocol(
   }
 
   /** The committer created. */
-  @transient private var committer: PathOutputCommitter = _
+  @transient override protected var committer: PathOutputCommitter = _
 
   require(dest != null, "Null destination specified")
 

From 5984fb2d767c9c951d0437fa432e0501d98220ee Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 18:35:40 +0800
Subject: [PATCH 41/53] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index cbdc5c3438842..467de4509e7fe 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -60,9 +60,6 @@ class PathOutputCommitProtocol(
     throw new IOException(PathOutputCommitProtocol.UNSUPPORTED)
   }
 
-  /** The committer created. */
-  @transient override protected var committer: PathOutputCommitter = _
-
   require(dest != null, "Null destination specified")
 
   private[cloud] val destination: String = dest
@@ -115,7 +112,7 @@ class PathOutputCommitProtocol(
         logTrace(s"Committer $committer may not be tolerant of task commit failures")
       }
     }
-    committer
+    committer.asInstanceOf[PathOutputCommitter]
   }
 
   /**
@@ -131,7 +128,7 @@ class PathOutputCommitProtocol(
       dir: Option[String],
       spec: FileNameSpec): String = {
 
-    val workDir = committer.getWorkPath
+    val workDir = committer.asInstanceOf[PathOutputCommitter].getWorkPath
     val parent = dir.map {
       d => new Path(workDir, d)
     }.getOrElse(workDir)

From c2f606f0a6a3816f5d46ade0466912be0dfb4996 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Fri, 15 Oct 2021 15:26:33 +0800
Subject: [PATCH 42/53] trigger GA


From b90caeb5014282daf520dc81a3902bf4e0399c58 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:34:28 +0800
Subject: [PATCH 43/53] Update FileCommitProtocol.scala

---
 .../internal/io/FileCommitProtocol.scala      | 48 ++++++++++---------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index c421c44414b7f..684aca08005d7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -57,6 +57,22 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
+  /**
+   * Get the final directory where the result data will be placed once the job
+   * is committed. This may be null, in which case, there is no output
+   * path to write data to and won't write any data.
+   */
+  def getOutputPath: Path = null
+
+  /**
+   * Get the directory that the task should write results into.
+   * Warning: there's no guarantee that this work path is on the same
+   * FS as the final output, or that it's visible across machines.
+   * May be null, in which case, there is no output path to write data to
+   * and won't write any data.
+   */
+  def getWorkPath: Path = null
+
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
@@ -221,34 +237,22 @@ object FileCommitProtocol extends Logging {
     // dynamicPartitionOverwrite: Boolean).
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
-      val ctor = clazz.getDeclaredConstructor(
-        classOf[String], classOf[String], classOf[Path], classOf[Boolean])
-      logDebug("Using (String, String, Path,  Boolean) constructor")
-      ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(getStagingDir(outputPath, jobId)),
-        dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+      val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean])
+      logDebug("Using (String, String, Boolean) constructor")
+      ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
-        try {
-          logDebug("Falling back to (String, String, Boolean) constructor")
-          val ctor = clazz.getDeclaredConstructor(
-            classOf[String], classOf[String], classOf[Boolean])
-          ctor.newInstance(jobId, outputPath,
-            dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
-        } catch {
-          case _: NoSuchMethodException =>
-            logDebug("Falling back to (String, String) constructor")
-            require(!dynamicPartitionOverwrite,
-              "Dynamic Partition Overwrite is enabled but" +
-                s" the committer ${className} does not have the appropriate constructor")
-            val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
-            ctor.newInstance(jobId, outputPath)
-        }
+        logDebug("Falling back to (String, String) constructor")
+        require(!dynamicPartitionOverwrite,
+          "Dynamic Partition Overwrite is enabled but" +
+            s" the committer ${className} does not have the appropriate constructor")
+        val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
+        ctor.newInstance(jobId, outputPath)
     }
   }
 
   def getStagingDir(path: String, jobId: String): Path = {
-    Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null)
+    new Path(path, ".spark-staging-" + jobId)
   }
 
   def externalTempPath(

From 98f066c1f4f580cc429387f317c437d56c991717 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:35:03 +0800
Subject: [PATCH 44/53] Update FileCommitProtocol.scala

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala    | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 684aca08005d7..bdbef8d63bfbc 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -21,8 +21,6 @@ import java.net.URI
 import java.text.SimpleDateFormat
 import java.util.{Date, Locale, Random}
 
-import scala.util.Try
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
@@ -227,8 +225,7 @@ object FileCommitProtocol extends Logging {
       className: String,
       jobId: String,
       outputPath: String,
-      dynamicPartitionOverwrite: Boolean = false,
-      stagingDir: Option[Path] = None): FileCommitProtocol = {
+      dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
       s" dynamic=$dynamicPartitionOverwrite")

From 2ea214c22b5481be6c10e35d40bb0b4b16b220d2 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:37:07 +0800
Subject: [PATCH 45/53] update

---
 .../io/HadoopMapReduceCommitProtocol.scala    | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index e013d01e8e709..73399a33be88c 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -71,16 +71,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
-  def this(jobId: String, path: String) =
-    this(jobId, path, getStagingDir(path, jobId))
-
-  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
-
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
@@ -111,6 +104,22 @@ class HadoopMapReduceCommitProtocol(
    */
   @transient private var partitionPaths: mutable.Set[String] = null
 
+  /**
+   * The staging directory of this write job. Spark uses it to deal with files with absolute output
+   * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
+   */
+  protected def stagingDir = getStagingDir(path, jobId)
+
+  override def getOutputPath: Path = {
+    if (dynamicPartitionOverwrite) {
+      stagingDir
+    } else {
+      new Path(path)
+    }
+  }
+
+  override def getWorkPath: Path = getOutputPath
+
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
     // If OutputFormat is Configurable, we should set conf to it.

From 201b0b53f84523d316d3c310ae2dc9cf264c4b09 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:37:46 +0800
Subject: [PATCH 46/53] Update HadoopMapReduceCommitProtocol.scala

---
 .../spark/internal/io/HadoopMapReduceCommitProtocol.scala     | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 73399a33be88c..2f356d5231730 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -30,7 +30,6 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
 /**
@@ -41,9 +40,6 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
  *
  * @param jobId the job's or stage's id
  * @param path the job's output path, or null if committer acts as a noop
- * @param stagingDir The staging directory of this write job. Spark uses it to deal with
- *                   files with absolute output path, or writing data into partitioned directory
- *                   with dynamicPartitionOverwrite=true
  * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime
  *                                  dynamically. Suppose final path is /path/to/outputPath, output
  *                                  path of [[FileOutputCommitter]] is an intermediate path, e.g.

From 772bf6b1e57baf66ea50c1fcb477c6c3445a3267 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:39:43 +0800
Subject: [PATCH 47/53] Update InsertIntoHadoopFsRelationCommand.scala

---
 .../InsertIntoHadoopFsRelationCommand.scala       | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 45ba105c501fa..27da50ba21c53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -106,14 +106,11 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val jobId = java.util.UUID.randomUUID().toString
-    val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf,
-      hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
       outputPath = outputPath.toString,
-      dynamicPartitionOverwrite = dynamicPartitionOverwrite,
-      stagingDir = Some(tmpLocation))
+      dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
       true
@@ -166,14 +163,6 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
-      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
-      // will be renamed from staging path to final output path during commit job
-      val committerOutputPath = if (dynamicPartitionOverwrite) {
-        tmpLocation
-      } else {
-        qualifiedOutputPath
-      }
-
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -181,7 +170,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committerOutputPath.toString, customPartitionLocations, outputColumns),
+            committer.getOutputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,

From 866681cc7f16d3badda73675c048fe432ab4588f Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:39:51 +0800
Subject: [PATCH 48/53] Update InsertIntoHadoopFsRelationCommand.scala

---
 .../datasources/InsertIntoHadoopFsRelationCommand.scala          | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 27da50ba21c53..f2a8c2372b551 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.hadoop.fs.{FileSystem, Path}
 
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition}

From aaae0f2bd25a1f90783ff6ee4bd2c4a91d6a2e43 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:42:52 +0800
Subject: [PATCH 49/53] update

---
 .../apache/spark/internal/config/package.scala    | 10 ----------
 .../org/apache/spark/sql/internal/SQLConf.scala   | 13 +++++++++++++
 .../SQLHadoopMapReduceCommitProtocol.scala        | 15 ++++++---------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index 24b988e6734fc..9e6cf341c197b 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -2310,16 +2310,6 @@ package object config {
       .stringConf
       .createOptional
 
-  val EXEC_STAGING_DIR =
-    ConfigBuilder("spark.exec.stagingDir")
-      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
-        "absolute output path, or writing data into partitioned directory when " +
-        "dynamic partition overwrite mode.")
-      .version("3.3.0")
-      .internal()
-      .stringConf
-      .createWithDefault(".spark-staging")
-
   private[spark] val EXECUTOR_STATE_SYNC_MAX_ATTEMPTS =
     ConfigBuilder("spark.worker.executorStateSync.maxAttempts")
       .internal()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 42979a68d8578..b946907d1b31c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1282,6 +1282,17 @@ object SQLConf {
       .createWithDefault(
         "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol")
 
+  val EXEC_STAGING_DIR = buildConf("spark.sql.exec.stagingDir")
+    .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
+      "absolute output path, or writing data into partitioned directory " +
+      "when dynamic partition overwrite mode is on. " +
+      "Default value means staging directory is under table path.")
+    .version("3.3.0")
+    .internal()
+    .stringConf
+    .checkValue(!_.isEmpty, "Should not pass an empty string as staging diretory.")
+    .createWithDefault(".spark-staging")
+
   val PARALLEL_PARTITION_DISCOVERY_THRESHOLD =
     buildConf("spark.sql.sources.parallelPartitionDiscovery.threshold")
       .doc("The maximum number of paths allowed for listing files at driver side. If the number " +
@@ -3966,6 +3977,8 @@ class SQLConf extends Serializable with Logging {
 
   def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS)
 
+  def stagingDir: String = getConf(SQLConf.EXEC_STAGING_DIR)
+
   def parallelPartitionDiscoveryThreshold: Int =
     getConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index a6ecffdcaeb5a..9e0b2a1a00552 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
-import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
+import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.internal.SQLConf
 
 /**
@@ -33,16 +33,13 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  def this(jobId: String, path: String) =
-    this(jobId, path, getStagingDir(path, jobId), false)
-
-  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
+  override val stagingDir: Path =
+    FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
+      SQLConf.get.stagingDir, "spark", jobId)
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)

From c838dd376baf98437b637ab60e822347c073608f Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:45:44 +0800
Subject: [PATCH 50/53] Update SQLPathHadoopMapReduceCommitProtocol.scala

---
 .../SQLPathHadoopMapReduceCommitProtocol.scala   | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 4690417e1e2bf..50ee8a8fec3d9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -20,8 +20,10 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.{FileNameSpec, HadoopMapReduceCommitProtocol}
+import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec, HadoopMapReduceCommitProtocol}
+import org.apache.spark.sql.internal.SQLConf
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -30,12 +32,16 @@ import org.apache.spark.internal.io.{FileNameSpec, HadoopMapReduceCommitProtocol
 class SQLPathHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  def sqlPathOutputCommitter(): SQLPathOutputCommitter =
+  override val stagingDir: Path =
+    FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
+      SQLConf.get.stagingDir, "spark", jobId)
+
+  // This variable only can be used after setupCommitter.
+  private lazy val sqlPathOutputCommitter: SQLPathOutputCommitter =
     committer.asInstanceOf[SQLPathOutputCommitter]
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
@@ -48,7 +54,7 @@ class SQLPathHadoopMapReduceCommitProtocol(
       taskContext: TaskAttemptContext,
       dir: Option[String],
       spec: FileNameSpec): String = {
-    val filename = getFilename(taskContext, spec)
+    val filename = getFilename(taskContext, spce)
     dir.map { d =>
       new Path(
         new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString

From 1f2222c7a8e541bc5c0dc90ccffba6245a2eb2f8 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:47:23 +0800
Subject: [PATCH 51/53] Update StagingInsertSuite.scala

---
 .../sql/sources/StagingInsertSuite.scala      | 143 ++++++++++--------
 1 file changed, 80 insertions(+), 63 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
index 131758cc1f84d..42b047229bb46 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -17,121 +17,138 @@
 
 package org.apache.spark.sql.sources
 
-import org.apache.spark.SparkConf
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{QueryTest, Row}
-import org.apache.spark.sql.execution.datasources.SQLPathHadoopMapReduceCommitProtocol
+import org.apache.spark.sql.execution.datasources.{SQLHadoopMapReduceCommitProtocol, SQLPathHadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
 class StagingInsertSuite extends QueryTest with SharedSparkSession {
+
   import testImplicits._
 
-  val stagingDir = Utils.createTempDir()
+  val stagingParentDir = Utils.createTempDir()
 
-  override def sparkConf: SparkConf =
-    super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath)
+  val stagingDir = stagingParentDir.getCanonicalPath + "/.spark-stagingDir"
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    stagingDir.delete()
+    stagingParentDir.delete()
   }
 
   override def afterAll(): Unit = {
     try {
-      Utils.deleteRecursively(stagingDir)
+      Utils.deleteRecursively(stagingParentDir)
     } finally {
       super.afterAll()
     }
   }
 
+  test("SPDI-25701: Assert staging dir work") {
+    withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir) {
+      val commitProtocol = new SQLHadoopMapReduceCommitProtocol("job-id", "dummy-path", true)
+      assert(commitProtocol.stagingDir.toString.contains(stagingDir))
+    }
+  }
+
   test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
-    withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
-      SQLConf.PartitionOverwriteMode.DYNAMIC.toString) {
-      withTempDir { d =>
+    // Partition Insert
+    Seq("static", "dynamic").foreach { mode =>
+      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> mode,
+        SQLConf.EXEC_STAGING_DIR.key -> stagingDir) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+                 |LOCATION '${d.getAbsolutePath}'
+             """.stripMargin)
+
+            val df = Seq((1, 2), (3, 4), (5, 6)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("SELECT * FROM t"), df)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
+          }
+        }
+      }
+    }
+  }
+
+  test("SPARK-36571: Add a new commit protocol - None-partitioned insert") {
+    Seq("1", "2").foreach { version =>
+      withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir,
+        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> version,
+        SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+          classOf[SQLPathHadoopMapReduceCommitProtocol].getName) {
         withTable("t") {
           sql(
             s"""
-               |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
-               |LOCATION '${d.getAbsolutePath}'
-             """.stripMargin)
+               | CREATE TABLE t(c1 int, p1 int) using PARQUET
+            """.stripMargin)
 
-          val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+          val df =
+            Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
+              .toDF("c1", "p1").repartition(1)
           df.write
-            .partitionBy("p1")
             .mode("overwrite")
+            .format("parquet")
             .saveAsTable("t")
           checkAnswer(sql("SELECT * FROM t"), df)
-          checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
-          checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
         }
       }
     }
   }
 
-  test("SPARK-36571: Add a new commit protocol that support writing data to staging " +
-    "then mv to output path") {
-    withTempDir { stagingDir =>
-      withSQLConf(
-        "spark.exec.stagingDir" -> s"${stagingDir.getAbsolutePath}/.spark-stagingDir",
-        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> "2",
+  test("SPARK-36571: Add a new commit protocol - Static partition insert") {
+    Seq("1", "2").foreach { version =>
+      withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir,
+        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> version,
         SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
           classOf[SQLPathHadoopMapReduceCommitProtocol].getName) {
         withTempDir { d =>
           withTable("t") {
             sql(
               s"""
-                 | CREATE TABLE t(c1 int, p1 int) using ORC
-                 | LOCATION '${d.getAbsolutePath}'
-            """.stripMargin)
-
-            val df =
-              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
-                .toDF("c1", "p1").repartition(1)
-            df.write
-              .mode("overwrite")
-              .format("orc")
-              .saveAsTable("t")
-            checkAnswer(sql("select * from t"), df)
-          }
-        }
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | CREATE TABLE t(c1 int, p1 int) USING ORC PARTITIONED BY(p1)
+                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
                  | LOCATION '${d.getAbsolutePath}'
             """.stripMargin)
 
             val df =
-              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2),
-                (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
+              Seq((1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (1, 2))
                 .toDF("c1", "p1").repartition(1)
-
             df.createOrReplaceTempView("view1")
-            sql("INSERT OVERWRITE t PARTITION(p1=2) SELECT c1 FROM view1 ")
+            sql("INSERT OVERWRITE t PARTITION(p1=2) SELECT c1 FROM view1")
             checkAnswer(sql("SELECT * FROM t WHERE p1=2"), df)
           }
         }
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
-                 | LOCATION '${d.getAbsolutePath}'
+      }
+    }
+  }
+
+  test("SPARK-36571: Add a new commit protocol - Dynamic partition insert") {
+    Seq("1", "2").foreach { version =>
+      withSQLConf(SQLConf.EXEC_STAGING_DIR.key -> stagingDir,
+        "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version" -> version,
+        SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key ->
+          classOf[SQLPathHadoopMapReduceCommitProtocol].getName) {
+        withTable("t") {
+          sql(
+            s"""
+               | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
             """.stripMargin)
 
-            val df = Seq((1, 2)).toDF("c1", "p1")
-            df.write
-              .partitionBy("p1")
-              .mode("overwrite")
-              .saveAsTable("t")
-            checkAnswer(sql("select * from t"), df)
-          }
+          val df = Seq((1, 2), (3, 4), (5, 6), (7, 8)).toDF("c1", "p1")
+          df.write
+            .partitionBy("p1")
+            .mode("overwrite")
+            .saveAsTable("t")
+          checkAnswer(sql("SELECT * FROM t"), df)
+          checkAnswer(sql("SELECT * FROM t WHERE p1 > 5"), Row(5, 6) :: Row(7, 8) :: Nil)
+          checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
         }
       }
     }

From c3472d9031e26ae148f6a036a419958afa9c9991 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 25 Jan 2022 19:51:23 +0800
Subject: [PATCH 52/53] Update SQLPathHadoopMapReduceCommitProtocol.scala

---
 .../datasources/SQLPathHadoopMapReduceCommitProtocol.scala  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 50ee8a8fec3d9..845d4895d566f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -54,10 +54,10 @@ class SQLPathHadoopMapReduceCommitProtocol(
       taskContext: TaskAttemptContext,
       dir: Option[String],
       spec: FileNameSpec): String = {
-    val filename = getFilename(taskContext, spce)
+    val filename = getFilename(taskContext, spec)
     dir.map { d =>
-      new Path(
-        new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString
+      new Path(new Path(
+        sqlPathOutputCommitter.getTaskAttemptPath(taskContext), d), filename).toString
     }.getOrElse {
       new Path(sqlPathOutputCommitter.getTaskAttemptPath(taskContext), filename).toString
     }

From 6b44163f2ae0186fc3bd415352afb3df8f5883e3 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 26 Jan 2022 09:51:16 +0800
Subject: [PATCH 53/53] Update SQLPathHadoopMapReduceCommitProtocol.scala

---
 .../SQLPathHadoopMapReduceCommitProtocol.scala     | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
index 845d4895d566f..51ad8ae5b0ac2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLPathHadoopMapReduceCommitProtocol.scala
@@ -20,25 +20,17 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec, HadoopMapReduceCommitProtocol}
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.internal.io.FileNameSpec
 
 /**
- * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
+ * A variant of [[SQLHadoopMapReduceCommitProtocol]] that allows specifying the actual
  * Hadoop output committer using an option specified in SQLConf.
  */
 class SQLPathHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
-    with Serializable with Logging {
-
-  override val stagingDir: Path =
-    FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
-      SQLConf.get.stagingDir, "spark", jobId)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
 
   // This variable only can be used after setupCommitter.
   private lazy val sqlPathOutputCommitter: SQLPathOutputCommitter =