From 2031f5bea5c83800b28190bee40895da7c1d25a0 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 11:12:01 +0800
Subject: [PATCH 01/42] [SPARK-36579][SQL] Make spark source stagingDir can use
 user defined

---
 .../apache/spark/sql/internal/SQLConf.scala   |  12 ++
 .../SQLHadoopMapReduceCommitProtocol.scala    |   7 ++
 .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ++++++++++++++++++
 .../sql/sources/PartitionedWriteSuite.scala   |  27 ++++
 .../sql/hive/execution/SaveAsHiveFile.scala   |  94 +-------------
 5 files changed, 166 insertions(+), 89 deletions(-)
 create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index c6d9558d53eda..d8dbc36a4f9cc 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1216,6 +1216,16 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
+  val FILE_COMMIT_STAGING_DIR =
+    buildConf("spark.sql.source.stagingDir")
+      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
+        "absolute output path, or writing data into partitioned directory when " +
+        "dynamic partition overwrite mode.")
+      .version("3.3.0")
+      .internal()
+      .stringConf
+      .createWithDefault(".spark-staging")
+
   // The output committer class used by data sources. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
   val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass")
@@ -3824,6 +3834,8 @@ class SQLConf extends Serializable with Logging {
   def partitionColumnTypeInferenceEnabled: Boolean =
     getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE)
 
+  def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR)
+
   def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS)
 
   def parallelPartitionDiscoveryThreshold: Int =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 144be2316f091..37a401d842abd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -23,7 +23,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -36,6 +38,11 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
+  override val stagingDir: Path =
+    SQLFileCommitProtocolUtils.newVersionExternalTempPath(
+      new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(),
+      SQLConf.get.fileCommitStagingDir, "spark", jobId)
+
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
new file mode 100644
index 0000000000000..1b2bfb42f0e35
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.spark.sql.util
+
+import java.net.URI
+import java.text.SimpleDateFormat
+import java.util.{Date, Locale, Random}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+
+import org.apache.spark.internal.Logging
+
+object SQLFileCommitProtocolUtils extends Logging {
+
+  def newVersionExternalTempPath(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val extURI = path.toUri
+    if (extURI.getScheme == "viewfs") {
+      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
+    } else {
+      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
+        "-ext-10000")
+    }
+  }
+
+
+  private def getExtTmpPathRelTo(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
+      "-ext-10000") // Hive uses 10000
+  }
+
+  private def getExternalScratchDir(
+      extURI: URI,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    getStagingDir(
+      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
+      hadoopConf,
+      stagingDir,
+      engineType,
+      jobId)
+  }
+
+  private def getStagingDir(
+      inputPath: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val inputPathName: String = inputPath.toString
+    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
+    var stagingPathName: String =
+      if (inputPathName.indexOf(stagingDir) == -1) {
+        new Path(inputPathName, stagingDir).toString
+      } else {
+        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
+      }
+
+    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
+    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
+    // under the table directory.
+    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
+      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
+      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
+        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
+        "directory.")
+      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+    }
+
+    val dir = fs.makeQualified(
+      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
+    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
+    dir
+  }
+
+  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
+    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
+    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
+    path1.startsWith(path2)
+  }
+
+  def executionId(engineType: String): String = {
+    val rand: Random = new Random
+    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
+    s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index b9266429f81a5..4751767bfd239 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -189,6 +189,33 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
+    withTempDir { stagingDir =>
+      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
+        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
+        SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+                 | LOCATION '${d.getAbsolutePath}'
+               """.stripMargin)
+
+            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("SELECT * FROM t"), df)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
+          }
+        }
+      }
+    }
+  }
 }
 
 /**
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index ec189344f4fa5..581c97ebcbfdd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -19,8 +19,7 @@ package org.apache.spark.sql.hive.execution
 
 import java.io.IOException
 import java.net.URI
-import java.text.SimpleDateFormat
-import java.util.{Date, Locale, Random}
+import java.util.Locale
 
 import scala.util.control.NonFatal
 
@@ -40,6 +39,7 @@ import org.apache.spark.sql.execution.datasources.FileFormatWriter
 import org.apache.spark.sql.hive.HiveExternalCatalog
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive.client.HiveVersion
+import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._
 
 // Base trait from which all hive insert statement physical execution extends.
 private[hive] trait SaveAsHiveFile extends DataWritingCommand {
@@ -129,7 +129,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      newVersionExternalTempPath(path, hadoopConf, stagingDir)
+      newVersionExternalTempPath(
+        path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
     } else {
       throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion)
     }
@@ -159,7 +160,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
       hadoopConf: Configuration,
       scratchDir: String): Path = {
     val extURI: URI = path.toUri
-    val scratchPath = new Path(scratchDir, executionId)
+    val scratchPath = new Path(scratchDir, executionId("hive"))
     var dirPath = new Path(
       extURI.getScheme,
       extURI.getAuthority,
@@ -180,90 +181,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     }
     dirPath
   }
-
-  // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2
-  private def newVersionExternalTempPath(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val extURI: URI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000")
-    }
-  }
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir)
-  }
-
-  private[hive] def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir: Path =
-      fs.makeQualified(
-        new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    try {
-      if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) {
-        throw new IllegalStateException("Cannot create staging directory  '" + dir.toString + "'")
-      }
-      createdTempDir = Some(dir)
-      fs.deleteOnExit(dir)
-    } catch {
-      case e: IOException =>
-        throw QueryExecutionErrors.cannotCreateStagingDirError(
-          s"'${dir.toString}': ${e.getMessage}", e)
-    }
-    dir
-  }
-
-  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  private def executionId: String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
 }
 

From c29f55ed00c7b50aa86a7d86b499739d7d377c96 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 14:27:08 +0800
Subject: [PATCH 02/42] Update SQLHadoopMapReduceCommitProtocol.scala

---
 .../datasources/SQLHadoopMapReduceCommitProtocol.scala        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 37a401d842abd..a6f86256de2c3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -21,9 +21,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
+import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
-import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 
@@ -40,7 +40,7 @@ class SQLHadoopMapReduceCommitProtocol(
 
   override val stagingDir: Path =
     SQLFileCommitProtocolUtils.newVersionExternalTempPath(
-      new Path(path), SparkSession.getActiveSession.get.sessionState.newHadoopConf(),
+      new Path(path), SparkContext.getActive.get.hadoopConfiguration,
       SQLConf.get.fileCommitStagingDir, "spark", jobId)
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {

From 2604c9f2c95f50e90f77c300442344816baea3d3 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 15:07:15 +0800
Subject: [PATCH 03/42] Update

---
 .../spark/internal/io/FileCommitProtocol.scala      |  5 +++++
 .../internal/io/HadoopMapReduceCommitProtocol.scala |  8 ++++++++
 .../InsertIntoHadoopFsRelationCommand.scala         | 13 ++-----------
 .../streaming/ManifestFileCommitProtocol.scala      |  2 ++
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 5cd7397ea358f..b968a51d53d34 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -50,6 +50,11 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
+  /**
+   * The output path of this committer.
+   */
+  def outputPath: Path
+
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index c061d617fce4b..2434ffa91d7a3 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -160,6 +160,14 @@ class HadoopMapReduceCommitProtocol(
     f"part-$split%05d-$jobId$ext"
   }
 
+  override def outputPath: Path = {
+    if (dynamicPartitionOverwrite) {
+      stagingDir
+    } else {
+      new Path(path)
+    }
+  }
+
   override def setupJob(jobContext: JobContext): Unit = {
     // Setup IDs
     val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 267b360b474ca..7edb78471e0ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand(
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
-      outputPath = outputPath.toString,
+      outputPath = qualifiedOutputPath.toString,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
@@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
-      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
-      // will be renamed from staging path to final output path during commit job
-      val committerOutputPath = if (dynamicPartitionOverwrite) {
-        FileCommitProtocol.getStagingDir(outputPath.toString, jobId)
-          .makeQualified(fs.getUri, fs.getWorkingDirectory)
-      } else {
-        qualifiedOutputPath
-      }
-
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committerOutputPath.toString, customPartitionLocations, outputColumns),
+            committer.outputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 46ce33687890d..30709266f1806 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
+  override def outputPath: Path = new Path(path)
+
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]

From 71f6b17c91b92a85913f816085290a0212e46939 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 25 Aug 2021 19:27:56 +0800
Subject: [PATCH 04/42] fix ut

---
 .../sql/util/SQLFileCommitProtocolUtils.scala |  2 +-
 .../spark/sql/sources/InsertSuite.scala       |  2 +-
 .../sql/hive/execution/SaveAsHiveFile.scala   |  4 +++-
 .../apache/spark/sql/hive/InsertSuite.scala   | 21 ++++++++++++-------
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
index 1b2bfb42f0e35..0d5455a411c2d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
@@ -69,7 +69,7 @@ object SQLFileCommitProtocolUtils extends Logging {
       jobId)
   }
 
-  private def getStagingDir(
+  def getStagingDir(
       inputPath: Path,
       hadoopConf: Configuration,
       stagingDir: String,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index d3c21032d0b32..ddcbfe44fbef8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -1073,6 +1073,6 @@ class RenameFromSparkStagingToFinalDirAlwaysTurnsFalseFilesystem extends RawLoca
   }
 
   private def isSparkStagingDir(path: Path): Boolean = {
-    path.toString.contains(".spark-staging-")
+    path.toString.contains(".spark-staging_")
   }
 }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 581c97ebcbfdd..12a9ed3d4d311 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -129,8 +129,10 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      newVersionExternalTempPath(
+      val externalTempPath = newVersionExternalTempPath(
         path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
+      createdTempDir = Some(externalTempPath.getParent)
+      externalTempPath
     } else {
       throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion)
     }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
index b715f484fa02a..307d30d88296d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -21,16 +21,17 @@ import java.io.File
 
 import com.google.common.io.Files
 import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.ql.exec.TaskRunner
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
-import org.apache.spark.sql.hive.execution.InsertIntoHiveTable
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 import org.apache.spark.util.Utils
 
 case class TestData(key: Int, value: String)
@@ -556,25 +557,29 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
     val conf = spark.sessionState.newHadoopConf()
     val inputPath = new Path("/tmp/b/c")
     var stagingDir = "tmp/b"
-    val saveHiveFile = InsertIntoHiveTable(null, Map.empty, null, false, false, null)
-    val getStagingDir = PrivateMethod[Path](Symbol("getStagingDir"))
-    var path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    val id = TaskRunner.getTaskRunnerID
+    var path = SQLFileCommitProtocolUtils.
+      getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b_hive_") != -1)
 
     stagingDir = "tmp/b/c"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = "d/e"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = ".d/e"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1)
 
     stagingDir = "/tmp/c/"
-    path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir)
+    path = SQLFileCommitProtocolUtils.getStagingDir(
+      inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/c_hive_") != -1)
   }
 

From 30113d28dc4a0a619e4bd732b360c84542eb25b3 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 13:38:01 +0800
Subject: [PATCH 05/42] Update SaveAsHiveFile.scala

---
 .../sql/hive/execution/SaveAsHiveFile.scala   | 85 -------------------
 1 file changed, 85 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 298fadcb01fac..e669f9cfcab2f 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -189,90 +189,5 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     }
     dirPath
   }
-
-  // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2
-  private def newVersionExternalTempPath(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val extURI: URI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path, hadoopConf, stagingDir)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir), "-ext-10000")
-    }
-  }
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir), "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir)
-  }
-
-  private[hive] def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir: Path =
-      fs.makeQualified(
-        new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    try {
-      if (!FileUtils.mkdir(fs, dir, true, hadoopConf)) {
-        throw new IllegalStateException("Cannot create staging directory  '" + dir.toString + "'")
-      }
-      createdTempDir = Some(dir)
-      fs.deleteOnExit(dir)
-    } catch {
-      case e: IOException =>
-        throw QueryExecutionErrors.cannotCreateStagingDirError(
-          s"'${dir.toString}': ${e.getMessage}", e)
-    }
-    dir
-  }
-
-  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  private def executionId: String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
 }
 

From 6f405dc89d507d36da03d6a97d9aa268926f7bfc Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 14:15:46 +0800
Subject: [PATCH 06/42] update

---
 .../spark/internal/config/package.scala       |  10 ++
 .../internal/io/FileCommitProtocol.scala      | 103 ++++++++++++++--
 .../io/HadoopMapReduceCommitProtocol.scala    |  11 +-
 .../apache/spark/sql/internal/SQLConf.scala   |  12 --
 .../InsertIntoHadoopFsRelationCommand.scala   |  13 +-
 .../SQLHadoopMapReduceCommitProtocol.scala    |   7 --
 .../ManifestFileCommitProtocol.scala          |   2 -
 .../sql/util/SQLFileCommitProtocolUtils.scala | 115 ------------------
 .../sql/sources/PartitionedWriteSuite.scala   |   3 +-
 .../sql/hive/execution/SaveAsHiveFile.scala   |   5 +-
 .../apache/spark/sql/hive/InsertSuite.scala   |  12 +-
 11 files changed, 129 insertions(+), 164 deletions(-)
 delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index 4b7ba9b19f4de..8e24105d7742b 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -2258,4 +2258,14 @@ package object config {
       .version("3.2.0")
       .stringConf
       .createOptional
+
+  val EXEC_STAGING_DIR =
+    ConfigBuilder("spark.exec.stagingDir")
+      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
+        "absolute output path, or writing data into partitioned directory when " +
+        "dynamic partition overwrite mode.")
+      .version("3.3.0")
+      .internal()
+      .stringConf
+      .createWithDefault(".spark-staging")
 }
diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index b968a51d53d34..958a9fa8aa034 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -17,11 +17,18 @@
 
 package org.apache.spark.internal.io
 
+import java.net.URI
+import java.text.SimpleDateFormat
+import java.util.{Date, Locale, Random}
+
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
 
+import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Unstable
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.util.Utils
 
 
@@ -50,11 +57,6 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
-  /**
-   * The output path of this committer.
-   */
-  def outputPath: Path
-
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
@@ -232,8 +234,95 @@ object FileCommitProtocol extends Logging {
     }
   }
 
-  def getStagingDir(path: String, jobId: String): Path = {
-    new Path(path, ".spark-staging-" + jobId)
+  def getStagingDir(path: String, jobId: String, conf: Configuration): Path = {
+    newVersionExternalTempPath(
+      new Path(path), conf,
+      SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId)
+  }
+
+
+  def newVersionExternalTempPath(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val extURI = path.toUri
+    if (extURI.getScheme == "viewfs") {
+      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
+    } else {
+      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
+        "-ext-10000")
+    }
+  }
+
+
+  private def getExtTmpPathRelTo(
+      path: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
+      "-ext-10000") // Hive uses 10000
+  }
+
+  private def getExternalScratchDir(
+      extURI: URI,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    getStagingDir(
+      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
+      hadoopConf,
+      stagingDir,
+      engineType,
+      jobId)
+  }
+
+  def getStagingDir(
+      inputPath: Path,
+      hadoopConf: Configuration,
+      stagingDir: String,
+      engineType: String,
+      jobId: String): Path = {
+    val inputPathName: String = inputPath.toString
+    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
+    var stagingPathName: String =
+      if (inputPathName.indexOf(stagingDir) == -1) {
+        new Path(inputPathName, stagingDir).toString
+      } else {
+        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
+      }
+
+    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
+    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
+    // under the table directory.
+    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
+      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
+      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
+        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
+        "directory.")
+      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+    }
+
+    val dir = fs.makeQualified(
+      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
+    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
+    dir
+  }
+
+  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
+    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
+    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
+    path1.startsWith(path2)
+  }
+
+  def executionId(engineType: String): String = {
+    val rand: Random = new Random
+    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
+    s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
   }
 }
 
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index e1c0367e46b98..9796d805be35f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -104,7 +104,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected def stagingDir = getStagingDir(path, jobId)
+  protected var stagingDir: Path = _
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
@@ -113,6 +113,7 @@ class HadoopMapReduceCommitProtocol(
       case c: Configurable => c.setConf(context.getConfiguration)
       case _ => ()
     }
+    stagingDir = getStagingDir(path, jobId, context.getConfiguration)
     format.getOutputCommitter(context)
   }
 
@@ -170,14 +171,6 @@ class HadoopMapReduceCommitProtocol(
     f"${spec.prefix}part-$split%05d-$jobId${spec.suffix}"
   }
 
-  override def outputPath: Path = {
-    if (dynamicPartitionOverwrite) {
-      stagingDir
-    } else {
-      new Path(path)
-    }
-  }
-
   override def setupJob(jobContext: JobContext): Unit = {
     // Setup IDs
     val jobId = SparkHadoopWriterUtils.createJobID(new Date, 0)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 0defec5becd7e..98aad1c9c83f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1219,16 +1219,6 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
-  val FILE_COMMIT_STAGING_DIR =
-    buildConf("spark.sql.source.stagingDir")
-      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
-        "absolute output path, or writing data into partitioned directory when " +
-        "dynamic partition overwrite mode.")
-      .version("3.3.0")
-      .internal()
-      .stringConf
-      .createWithDefault(".spark-staging")
-
   // The output committer class used by data sources. The specified class needs to be a
   // subclass of org.apache.hadoop.mapreduce.OutputCommitter.
   val OUTPUT_COMMITTER_CLASS = buildConf("spark.sql.sources.outputCommitterClass")
@@ -3842,8 +3832,6 @@ class SQLConf extends Serializable with Logging {
   def partitionColumnTypeInferenceEnabled: Boolean =
     getConf(SQLConf.PARTITION_COLUMN_TYPE_INFERENCE)
 
-  def fileCommitStagingDir: String = getConf(SQLConf.FILE_COMMIT_STAGING_DIR)
-
   def fileCommitProtocolClass: String = getConf(SQLConf.FILE_COMMIT_PROTOCOL_CLASS)
 
   def parallelPartitionDiscoveryThreshold: Int =
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 7edb78471e0ff..f1a0e0dedbddd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -108,7 +108,7 @@ case class InsertIntoHadoopFsRelationCommand(
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
-      outputPath = qualifiedOutputPath.toString,
+      outputPath = outputPath.toString,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
@@ -162,6 +162,15 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
+      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
+      // will be renamed from staging path to final output path during commit job
+      val committerOutputPath = if (dynamicPartitionOverwrite) {
+        FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf)
+          .makeQualified(fs.getUri, fs.getWorkingDirectory)
+      } else {
+        qualifiedOutputPath
+      }
+
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -169,7 +178,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committer.outputPath.toString, customPartitionLocations, outputColumns),
+            committerOutputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index a6f86256de2c3..144be2316f091 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -21,11 +21,9 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
-import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -38,11 +36,6 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  override val stagingDir: Path =
-    SQLFileCommitProtocolUtils.newVersionExternalTempPath(
-      new Path(path), SparkContext.getActive.get.hadoopConfiguration,
-      SQLConf.get.fileCommitStagingDir, "spark", jobId)
-
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 30709266f1806..46ce33687890d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,8 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
-  override def outputPath: Path = new Path(path)
-
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
deleted file mode 100644
index 0d5455a411c2d..0000000000000
--- a/sql/core/src/main/scala/org/apache/spark/sql/util/SQLFileCommitProtocolUtils.scala
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.apache.spark.sql.util
-
-import java.net.URI
-import java.text.SimpleDateFormat
-import java.util.{Date, Locale, Random}
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-
-import org.apache.spark.internal.Logging
-
-object SQLFileCommitProtocolUtils extends Logging {
-
-  def newVersionExternalTempPath(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    val extURI = path.toUri
-    if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
-    } else {
-      new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
-        "-ext-10000")
-    }
-  }
-
-
-  private def getExtTmpPathRelTo(
-      path: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
-      "-ext-10000") // Hive uses 10000
-  }
-
-  private def getExternalScratchDir(
-      extURI: URI,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    getStagingDir(
-      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
-      hadoopConf,
-      stagingDir,
-      engineType,
-      jobId)
-  }
-
-  def getStagingDir(
-      inputPath: Path,
-      hadoopConf: Configuration,
-      stagingDir: String,
-      engineType: String,
-      jobId: String): Path = {
-    val inputPathName: String = inputPath.toString
-    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
-    var stagingPathName: String =
-      if (inputPathName.indexOf(stagingDir) == -1) {
-        new Path(inputPathName, stagingDir).toString
-      } else {
-        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
-      }
-
-    // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
-    // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
-    // under the table directory.
-    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
-      !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
-      logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
-        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
-        "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
-    }
-
-    val dir = fs.makeQualified(
-      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
-    dir
-  }
-
-  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
-    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
-    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
-    path1.startsWith(path2)
-  }
-
-  def executionId(engineType: String): String = {
-    val rand: Random = new Random
-    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
-    s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 4751767bfd239..86894945591da 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
 import org.apache.spark.TestUtils
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
@@ -194,7 +195,7 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
     withTempDir { stagingDir =>
       withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
         SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
-        SQLConf.FILE_COMMIT_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
+        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
         withTempDir { d =>
           withTable("t") {
             sql(
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index e669f9cfcab2f..8c6c10fee7935 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -40,7 +40,6 @@ import org.apache.spark.sql.execution.datasources.{BucketingUtils, FileFormatWri
 import org.apache.spark.sql.hive.HiveExternalCatalog
 import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc}
 import org.apache.spark.sql.hive.client.HiveVersion
-import org.apache.spark.sql.util.SQLFileCommitProtocolUtils._
 
 // Base trait from which all hive insert statement physical execution extends.
 private[hive] trait SaveAsHiveFile extends DataWritingCommand {
@@ -135,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      val externalTempPath = newVersionExternalTempPath(
+      val externalTempPath = FileCommitProtocol.newVersionExternalTempPath(
         path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
       createdTempDir = Some(externalTempPath.getParent)
       externalTempPath
@@ -168,7 +167,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
       hadoopConf: Configuration,
       scratchDir: String): Path = {
     val extURI: URI = path.toUri
-    val scratchPath = new Path(scratchDir, executionId("hive"))
+    val scratchPath = new Path(scratchDir, FileCommitProtocol.executionId("hive"))
     var dirPath = new Path(
       extURI.getScheme,
       extURI.getAuthority,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
index f0355eaf18b68..1dfeedcd9a18b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -26,13 +26,13 @@ import org.apache.hadoop.hive.ql.exec.TaskRunner
 import org.scalatest.{BeforeAndAfter, PrivateMethodTester}
 
 import org.apache.spark.SparkException
+import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.util.SQLFileCommitProtocolUtils
 import org.apache.spark.util.Utils
 
 case class TestData(key: Int, value: String)
@@ -543,27 +543,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
     val inputPath = new Path("/tmp/b/c")
     var stagingDir = "tmp/b"
     val id = TaskRunner.getTaskRunnerID
-    var path = SQLFileCommitProtocolUtils.
+    var path = FileCommitProtocol.
       getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b_hive_") != -1)
 
     stagingDir = "tmp/b/c"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = "d/e"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = ".d/e"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1)
 
     stagingDir = "/tmp/c/"
-    path = SQLFileCommitProtocolUtils.getStagingDir(
+    path = FileCommitProtocol.getStagingDir(
       inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/c_hive_") != -1)
   }

From 361263b9b11e5668286268fc306b399b8760c48a Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 14:43:35 +0800
Subject: [PATCH 07/42] update

---
 .../spark/internal/io/FileCommitProtocol.scala      |  9 ++++++---
 .../internal/io/HadoopMapReduceCommitProtocol.scala | 13 +++++++++++--
 .../InsertIntoHadoopFsRelationCommand.scala         | 11 +----------
 .../streaming/ManifestFileCommitProtocol.scala      |  2 ++
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 958a9fa8aa034..73b7fd62a25b6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
 
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Unstable
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.EXEC_STAGING_DIR
@@ -57,6 +56,11 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
+  /**
+   * The output path of this committer.
+   */
+  def outputPath: Path
+
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
@@ -236,8 +240,7 @@ object FileCommitProtocol extends Logging {
 
   def getStagingDir(path: String, jobId: String, conf: Configuration): Path = {
     newVersionExternalTempPath(
-      new Path(path), conf,
-      SparkContext.getActive.get.conf.get(EXEC_STAGING_DIR), "spark", jobId)
+      new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
   }
 
 
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 9796d805be35f..86d087cdb5cc2 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -29,6 +29,7 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
+import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
@@ -104,7 +105,8 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected var stagingDir: Path = _
+  protected var stagingDir: Path =
+    getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
@@ -113,10 +115,17 @@ class HadoopMapReduceCommitProtocol(
       case c: Configurable => c.setConf(context.getConfiguration)
       case _ => ()
     }
-    stagingDir = getStagingDir(path, jobId, context.getConfiguration)
     format.getOutputCommitter(context)
   }
 
+  override def outputPath: Path = {
+    if (dynamicPartitionOverwrite) {
+      stagingDir
+    } else {
+      new Path(path)
+    }
+  }
+
   override def newTaskTempFile(
       taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
     newTaskTempFile(taskContext, dir, FileNameSpec("", ext))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index f1a0e0dedbddd..382c1e3c16384 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -162,15 +162,6 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
-      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
-      // will be renamed from staging path to final output path during commit job
-      val committerOutputPath = if (dynamicPartitionOverwrite) {
-        FileCommitProtocol.getStagingDir(outputPath.toString, jobId, hadoopConf)
-          .makeQualified(fs.getUri, fs.getWorkingDirectory)
-      } else {
-        qualifiedOutputPath
-      }
-
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -178,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committerOutputPath.toString, customPartitionLocations, outputColumns),
+            committer.outputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 46ce33687890d..30709266f1806 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,6 +56,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
+  override def outputPath: Path = new Path(path)
+
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]

From 9ee6ee545c1c6490afaec2febbaf68cb3888586e Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 14:55:26 +0800
Subject: [PATCH 08/42] update

---
 .../sql/sources/PartitionedWriteSuite.scala   | 28 -------
 .../sql/sources/StagingInsertSuite.scala      | 74 +++++++++++++++++++
 2 files changed, 74 insertions(+), 28 deletions(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 86894945591da..b9266429f81a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -25,7 +25,6 @@ import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
 import org.apache.spark.TestUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
@@ -190,33 +189,6 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
-
-  test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
-    withTempDir { stagingDir =>
-      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
-        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
-        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
-                 | LOCATION '${d.getAbsolutePath}'
-               """.stripMargin)
-
-            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
-            df.write
-              .partitionBy("p1")
-              .mode("overwrite")
-              .saveAsTable("t")
-            checkAnswer(sql("SELECT * FROM t"), df)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
-          }
-        }
-      }
-    }
-  }
 }
 
 /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
new file mode 100644
index 0000000000000..72c6534dc8751
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
+import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.util.Utils
+
+class StagingInsertSuite extends QueryTest with SharedSparkSession {
+  import testImplicits._
+
+  val stagingDir = Utils.createTempDir()
+
+  override def sparkConf: SparkConf =
+    super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath)
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    stagingDir.delete()
+  }
+
+  override def afterAll(): Unit = {
+    try {
+      Utils.deleteRecursively(stagingDir)
+    } finally {
+      super.afterAll()
+    }
+  }
+
+  test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
+    withTempDir { stagingDir =>
+      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
+        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
+        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
+        withTempDir { d =>
+          withTable("t") {
+            sql(
+              s"""
+                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+                 | LOCATION '${d.getAbsolutePath}'
+               """.stripMargin)
+
+            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+            df.write
+              .partitionBy("p1")
+              .mode("overwrite")
+              .saveAsTable("t")
+            checkAnswer(sql("SELECT * FROM t"), df)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
+          }
+        }
+      }
+    }
+  }
+}

From 7773fb24bc5071b76b97b5ebbe8befc7c80d1e71 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 15:24:13 +0800
Subject: [PATCH 09/42] update

---
 .../internal/io/FileCommitProtocol.scala      |  9 +++--
 .../io/HadoopMapReduceCommitProtocol.scala    |  6 +--
 ...FileCommitProtocolInstantiationSuite.scala | 12 +++---
 .../InsertIntoHadoopFsRelationCommand.scala   |  1 +
 .../SQLHadoopMapReduceCommitProtocol.scala    |  4 +-
 .../execution/datasources/v2/FileWrite.scala  |  3 +-
 .../execution/streaming/FileStreamSink.scala  |  3 +-
 .../execution/metric/SQLMetricsSuite.scala    |  4 +-
 .../sql/sources/PartitionedWriteSuite.scala   |  6 ++-
 .../sql/sources/StagingInsertSuite.scala      | 37 +++++++++----------
 .../sql/hive/execution/SaveAsHiveFile.scala   |  3 +-
 11 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 73b7fd62a25b6..d6f7370c9a582 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -215,6 +215,7 @@ object FileCommitProtocol extends Logging {
       className: String,
       jobId: String,
       outputPath: String,
+      conf: Configuration = new Configuration(),
       dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
@@ -224,9 +225,11 @@ object FileCommitProtocol extends Logging {
     // dynamicPartitionOverwrite: Boolean).
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
-      val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean])
-      logDebug("Using (String, String, Boolean) constructor")
-      ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+      val ctor = clazz.getDeclaredConstructor(
+        classOf[String], classOf[String], classOf[Configuration], classOf[Boolean])
+      logDebug("Using (String, String, Configuration,  Boolean) constructor")
+      ctor.newInstance(
+        jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
         logDebug("Falling back to (String, String) constructor")
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 86d087cdb5cc2..9cdfe851c89d7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -24,12 +24,12 @@ import scala.collection.mutable
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configurable
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
-import org.apache.spark.SparkContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
@@ -68,6 +68,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
+    conf: Configuration = new Configuration(),
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
@@ -105,8 +106,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected var stagingDir: Path =
-    getStagingDir(path, jobId, SparkContext.getActive.get.hadoopConfiguration)
+  protected var stagingDir: Path = getStagingDir(path, jobId, conf)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 778f748f83950..1535ca4c6624e 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.conf.Configuration
+
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -58,8 +60,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[Other].getCanonicalName,
         "job",
-        "path",
-        false)
+        "path")
     }
   }
 
@@ -68,8 +69,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[NoMatchingArgs].getCanonicalName,
         "job",
-        "path",
-        false)
+        "path")
     }
   }
 
@@ -83,6 +83,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[ClassicConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
+      new Configuration(),
       dynamic).asInstanceOf[ClassicConstructorCommitProtocol]
   }
 
@@ -97,6 +98,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[FullConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
+      new Configuration(),
       dynamic).asInstanceOf[FullConstructorCommitProtocol]
   }
 
@@ -121,7 +123,7 @@ private class FullConstructorCommitProtocol(
   arg2: String,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) {
 
   def this(arg1: String, arg2: String) = {
     this(arg1, arg2, false, 2)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 382c1e3c16384..fff93a579a628 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -109,6 +109,7 @@ case class InsertIntoHadoopFsRelationCommand(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
       outputPath = outputPath.toString,
+      hadoopConf,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 144be2316f091..dde68006402d7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
@@ -32,8 +33,9 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
+    conf: Configuration,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
index ccc467aae1f15..d822b1bb6ef12 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
@@ -61,7 +61,8 @@ trait FileWrite extends Write {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = paths.head)
+      outputPath = paths.head,
+      hadoopConf)
     lazy val description =
       createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
index 5058a1dfc3baf..8db2ce027fdfd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
@@ -150,7 +150,8 @@ class FileStreamSink(
       val committer = FileCommitProtocol.instantiate(
         className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
         jobId = batchId.toString,
-        outputPath = path)
+        outputPath = path,
+        sparkSession.sessionState.newHadoopConf())
 
       committer match {
         case manifestCommitter: ManifestFileCommitProtocol =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 32428fbde0016..9638924b394db 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,6 +22,7 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -834,7 +835,8 @@ case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(
+    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index b9266429f81a5..df41b93f45fad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.sources
 import java.io.File
 import java.sql.Timestamp
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
@@ -35,7 +36,7 @@ import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
 private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration())
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -199,7 +200,8 @@ private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(
+    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
index 72c6534dc8751..162cef0cca3f8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -46,27 +46,24 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession {
   }
 
   test("SPARK-36579: dynamic partition overwrite can use user defined staging dir") {
-    withTempDir { stagingDir =>
-      withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
-        SQLConf.PartitionOverwriteMode.DYNAMIC.toString,
-        EXEC_STAGING_DIR.key -> stagingDir.getAbsolutePath) {
-        withTempDir { d =>
-          withTable("t") {
-            sql(
-              s"""
-                 | CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
-                 | LOCATION '${d.getAbsolutePath}'
-               """.stripMargin)
+    withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key ->
+      SQLConf.PartitionOverwriteMode.DYNAMIC.toString) {
+      withTempDir { d =>
+        withTable("t") {
+          sql(
+            s"""
+               |CREATE TABLE t(c1 int, p1 int) USING PARQUET PARTITIONED BY(p1)
+               |LOCATION '${d.getAbsolutePath}'
+             """.stripMargin)
 
-            val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
-            df.write
-              .partitionBy("p1")
-              .mode("overwrite")
-              .saveAsTable("t")
-            checkAnswer(sql("SELECT * FROM t"), df)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
-            checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
-          }
+          val df = Seq((1, 2), (3, 4)).toDF("c1", "p1")
+          df.write
+            .partitionBy("p1")
+            .mode("overwrite")
+            .saveAsTable("t")
+          checkAnswer(sql("SELECT * FROM t"), df)
+          checkAnswer(sql("SELECT * FROM t WHERE p1 = 2"), Row(1, 2) :: Nil)
+          checkAnswer(sql("SELECT * FROM t WHERE p1 = 4"), Row(3, 4) :: Nil)
         }
       }
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 8c6c10fee7935..734a0aa64df2a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -83,7 +83,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = outputLocation)
+      outputPath = outputLocation,
+      hadoopConf)
 
     val options = bucketSpec
       .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true"))

From a3b3c51f326fcf285eb870a2874c774a1498d145 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 15:42:17 +0800
Subject: [PATCH 10/42] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index fc5d0a0b3a7f5..d8b783c460f58 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -19,10 +19,10 @@ package org.apache.spark.internal.io.cloud
 
 import java.io.IOException
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
-
 import org.apache.spark.internal.io.FileNameSpec
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 
@@ -51,7 +51,7 @@ class PathOutputCommitProtocol(
     jobId: String,
     dest: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable {
+  extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable {
 
   if (dynamicPartitionOverwrite) {
     // until there's explicit extensions to the PathOutputCommitProtocols

From b4d60e4b431ebfcc843f826086b1d09135c823c5 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 12 Oct 2021 22:29:45 +0800
Subject: [PATCH 11/42] Update PathOutputCommitProtocol.scala

---
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index d8b783c460f58..c6497fc47d16f 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
+
 import org.apache.spark.internal.io.FileNameSpec
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 

From 2c4180886d06e61304774900c45db3d45c18c39c Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 10:22:05 +0800
Subject: [PATCH 12/42] Update CommitterBindingSuite.scala

---
 .../spark/internal/io/cloud/CommitterBindingSuite.scala       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
index 546f54229ea59..13317b61276dc 100644
--- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
+++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
@@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite {
   test("local filesystem instantiation") {
     val instance = FileCommitProtocol.instantiate(
       pathCommitProtocolClassname,
-      jobId, "file:///tmp", false)
+      jobId, "file:///tmp", new Configuration(), false)
 
     val protocol = instance.asInstanceOf[PathOutputCommitProtocol]
     assert("file:///tmp" === protocol.destination)
@@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite {
     val cause = intercept[InvocationTargetException] {
       FileCommitProtocol.instantiate(
         pathCommitProtocolClassname,
-        jobId, "file:///tmp", true)
+        jobId, "file:///tmp", new Configuration(), true)
     }.getCause
     if (cause == null || !cause.isInstanceOf[IOException]
         || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) {

From 592682235822113a73befe3247f2b7e1bebbd81f Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:10:21 +0800
Subject: [PATCH 13/42] update

---
 .../internal/io/FileCommitProtocol.scala      | 21 +++++++++++++++++--
 .../io/HadoopMapReduceCommitProtocol.scala    |  4 +++-
 .../InsertIntoHadoopFsRelationCommand.scala   |  2 +-
 .../ManifestFileCommitProtocol.scala          |  4 +++-
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index d6f7370c9a582..73e6abcba867e 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -57,9 +57,26 @@ abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
   /**
-   * The output path of this committer.
+   * Get the final directory where work will be placed once the job
+   * is committed. This may be null, in which case, there is no output
+   * path to write data to.
    */
-  def outputPath: Path
+  def getOutputPath(): Path = null
+
+  /**
+   * Get the directory that the task should write results into.
+   * Warning: there's no guarantee that this work path is on the same
+   * FS as the final output, or that it's visible across machines.
+   * May be null.
+   */
+  def getWorkPath(): Path = null
+
+  /**
+   * Predicate: is there an output path?
+   */
+  def hasOutputPath(): Boolean = {
+    getOutputPath() != null;
+  }
 
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 9cdfe851c89d7..aa511d385b78a 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -118,7 +118,9 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
-  override def outputPath: Path = {
+  override def getOutputPath(): Path = new Path(path)
+
+  override def getWorkPath: Path = {
     if (dynamicPartitionOverwrite) {
       stagingDir
     } else {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index fff93a579a628..1a46b60fe111e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -170,7 +170,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committer.outputPath.toString, customPartitionLocations, outputColumns),
+            committer.getWorkPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 30709266f1806..09681f882853f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,7 +56,9 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
-  override def outputPath: Path = new Path(path)
+  override def getOutputPath: Path = new Path(path)
+
+  override def getWorkPath(): Path = new Path(path)
 
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")

From 6cdee5868655c972f3aebf9294ae0e1680501447 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:38:22 +0800
Subject: [PATCH 14/42] update

---
 .../apache/spark/internal/io/FileCommitProtocol.scala |  9 +++++----
 .../internal/io/HadoopMapRedCommitProtocol.scala      |  2 +-
 .../internal/io/HadoopMapReduceCommitProtocol.scala   |  5 ++---
 .../io/FileCommitProtocolInstantiationSuite.scala     |  6 +-----
 .../internal/io/cloud/CommitterBindingSuite.scala     |  4 ++--
 .../InsertIntoHadoopFsRelationCommand.scala           |  8 ++++++--
 .../SQLHadoopMapReduceCommitProtocol.scala            |  5 ++---
 .../sql/execution/datasources/v2/FileWrite.scala      |  3 +--
 .../sql/execution/streaming/FileStreamSink.scala      |  3 +--
 .../spark/sql/execution/metric/SQLMetricsSuite.scala  |  4 ++--
 .../spark/sql/sources/PartitionedWriteSuite.scala     | 11 +++++++----
 .../spark/sql/hive/execution/SaveAsHiveFile.scala     |  3 +--
 12 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 73e6abcba867e..7029f6f3eb1f1 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -232,8 +232,8 @@ object FileCommitProtocol extends Logging {
       className: String,
       jobId: String,
       outputPath: String,
-      conf: Configuration = new Configuration(),
-      dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = {
+      dynamicPartitionOverwrite: Boolean = false,
+      stagingDir: Option[String] = None): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
       s" dynamic=$dynamicPartitionOverwrite")
@@ -245,8 +245,9 @@ object FileCommitProtocol extends Logging {
       val ctor = clazz.getDeclaredConstructor(
         classOf[String], classOf[String], classOf[Configuration], classOf[Boolean])
       logDebug("Using (String, String, Configuration,  Boolean) constructor")
-      ctor.newInstance(
-        jobId, outputPath, conf, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+      ctor.newInstance(jobId, outputPath,
+        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
+        dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
         logDebug("Falling back to (String, String) constructor")
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
index af0aa41518766..30c85e1570d63 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
@@ -27,7 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
  * Unlike Hadoop's OutputCommitter, this implementation is serializable.
  */
 class HadoopMapRedCommitProtocol(jobId: String, path: String)
-  extends HadoopMapReduceCommitProtocol(jobId, path) {
+  extends HadoopMapReduceCommitProtocol(jobId, path, "") {
 
   override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = {
     val config = context.getConfiguration.asInstanceOf[JobConf]
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index aa511d385b78a..20b2be17a305d 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -24,7 +24,6 @@ import scala.collection.mutable
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configurable
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
@@ -68,7 +67,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    conf: Configuration = new Configuration(),
+    stagingPath: String = "",
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
@@ -106,7 +105,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected var stagingDir: Path = getStagingDir(path, jobId, conf)
+  protected var stagingDir: Path = new Path(stagingPath)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 1535ca4c6624e..87ffe0b69a469 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.internal.io
 
-import org.apache.hadoop.conf.Configuration
-
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -83,7 +81,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[ClassicConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
-      new Configuration(),
       dynamic).asInstanceOf[ClassicConstructorCommitProtocol]
   }
 
@@ -98,7 +95,6 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       classOf[FullConstructorCommitProtocol].getCanonicalName,
       "job",
       "path",
-      new Configuration(),
       dynamic).asInstanceOf[FullConstructorCommitProtocol]
   }
 
@@ -123,7 +119,7 @@ private class FullConstructorCommitProtocol(
   arg2: String,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, new Configuration(), b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) {
 
   def this(arg1: String, arg2: String) = {
     this(arg1, arg2, false, 2)
diff --git a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
index 13317b61276dc..546f54229ea59 100644
--- a/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
+++ b/hadoop-cloud/src/hadoop-3/test/scala/org/apache/spark/internal/io/cloud/CommitterBindingSuite.scala
@@ -124,7 +124,7 @@ class CommitterBindingSuite extends SparkFunSuite {
   test("local filesystem instantiation") {
     val instance = FileCommitProtocol.instantiate(
       pathCommitProtocolClassname,
-      jobId, "file:///tmp", new Configuration(), false)
+      jobId, "file:///tmp", false)
 
     val protocol = instance.asInstanceOf[PathOutputCommitProtocol]
     assert("file:///tmp" === protocol.destination)
@@ -134,7 +134,7 @@ class CommitterBindingSuite extends SparkFunSuite {
     val cause = intercept[InvocationTargetException] {
       FileCommitProtocol.instantiate(
         pathCommitProtocolClassname,
-        jobId, "file:///tmp", new Configuration(), true)
+        jobId, "file:///tmp", true)
     }.getCause
     if (cause == null || !cause.isInstanceOf[IOException]
         || !cause.getMessage.contains(PathOutputCommitProtocol.UNSUPPORTED)) {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 1a46b60fe111e..dc6ad7eb8b95f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -19,7 +19,9 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.hadoop.fs.{FileSystem, Path}
 
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.FileCommitProtocol
+import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
@@ -105,12 +107,14 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val jobId = java.util.UUID.randomUUID().toString
+    val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf,
+      hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
       outputPath = outputPath.toString,
-      hadoopConf,
-      dynamicPartitionOverwrite = dynamicPartitionOverwrite)
+      dynamicPartitionOverwrite = dynamicPartitionOverwrite,
+      stagingDir = Some(tmpLocation.toString))
 
     val doInsertion = if (mode == SaveMode.Append) {
       true
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index dde68006402d7..680c6d63d38ed 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
@@ -33,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    conf: Configuration,
+    stagingPath: String = "",
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, conf, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
index d822b1bb6ef12..ccc467aae1f15 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala
@@ -61,8 +61,7 @@ trait FileWrite extends Write {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = paths.head,
-      hadoopConf)
+      outputPath = paths.head)
     lazy val description =
       createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
index 8db2ce027fdfd..5058a1dfc3baf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala
@@ -150,8 +150,7 @@ class FileStreamSink(
       val committer = FileCommitProtocol.instantiate(
         className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
         jobId = batchId.toString,
-        outputPath = path,
-        sparkSession.sessionState.newHadoopConf())
+        outputPath = path)
 
       committer match {
         case manifestCommitter: ManifestFileCommitProtocol =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 9638924b394db..46d44e3b1a95f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,7 +22,6 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -834,9 +833,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
 case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
+    jobId, path, stagingPath, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index df41b93f45fad..96ab46df1f50c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -20,7 +20,6 @@ package org.apache.spark.sql.sources
 import java.io.File
 import java.sql.Timestamp
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
@@ -35,8 +34,11 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
-private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, new Configuration())
+private class OnlyDetectCustomPathFileCommitProtocol(
+    jobId: String,
+    path: String,
+    stagingPath: String)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -199,9 +201,10 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean)
   extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, new Configuration(), dynamicPartitionOverwrite) {
+    jobId, path, stagingPath, dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 734a0aa64df2a..8c6c10fee7935 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -83,8 +83,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
-      outputPath = outputLocation,
-      hadoopConf)
+      outputPath = outputLocation)
 
     val options = bucketSpec
       .map(_ => Map(BucketingUtils.optionForHiveCompatibleBucketWrite -> "true"))

From 824ec0474c5df84fa589394e0f82f7d57e65f224 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:39:50 +0800
Subject: [PATCH 15/42] Update FileCommitProtocol.scala

---
 .../scala/org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 7029f6f3eb1f1..4050edfc61481 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -243,7 +243,7 @@ object FileCommitProtocol extends Logging {
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
       val ctor = clazz.getDeclaredConstructor(
-        classOf[String], classOf[String], classOf[Configuration], classOf[Boolean])
+        classOf[String], classOf[String], classOf[String], classOf[Boolean])
       logDebug("Using (String, String, Configuration,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
         stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),

From 8c8a174814c9e38cdb790ca77975ce3cd7d64264 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:48:25 +0800
Subject: [PATCH 16/42] update

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala   | 6 ------
 .../apache/spark/sql/sources/PartitionedWriteSuite.scala    | 5 +++--
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 4050edfc61481..23f640b40af87 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -27,7 +27,6 @@ import org.apache.hadoop.mapreduce._
 
 import org.apache.spark.annotation.Unstable
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.util.Utils
 
 
@@ -259,11 +258,6 @@ object FileCommitProtocol extends Logging {
     }
   }
 
-  def getStagingDir(path: String, jobId: String, conf: Configuration): Path = {
-    newVersionExternalTempPath(
-      new Path(path), conf, conf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
-  }
-
 
   def newVersionExternalTempPath(
       path: Path,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 96ab46df1f50c..0fb0d85ced2b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -37,8 +37,9 @@ import org.apache.spark.util.Utils
 private class OnlyDetectCustomPathFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath)
+    stagingPath: String,
+    dynamicPartitionOverwrite: Boolean)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(

From 8d7ce6e58800791042a4a3f12c16327abbbe903b Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:51:54 +0800
Subject: [PATCH 17/42] update

---
 .../apache/spark/internal/io/HadoopMapRedCommitProtocol.scala | 4 +++-
 .../datasources/SQLHadoopMapReduceCommitProtocol.scala        | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
index 30c85e1570d63..d250601349429 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred._
 import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
 
@@ -27,7 +28,8 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
  * Unlike Hadoop's OutputCommitter, this implementation is serializable.
  */
 class HadoopMapRedCommitProtocol(jobId: String, path: String)
-  extends HadoopMapReduceCommitProtocol(jobId, path, "") {
+  extends HadoopMapReduceCommitProtocol(
+    jobId, path, new Path(path, ".spark-staging-" + jobId).toString) {
 
   override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = {
     val config = context.getConfiguration.asInstanceOf[JobConf]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 680c6d63d38ed..7ed76643a4eb2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String = "",
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
     with Serializable with Logging {

From da6a0b9ccbd06bc1f08558e74d00facf2bf487f8 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 17:58:41 +0800
Subject: [PATCH 18/42] update

---
 .../spark/internal/io/HadoopMapRedCommitProtocol.scala   | 4 +---
 .../internal/io/HadoopMapReduceCommitProtocol.scala      | 9 ++++++++-
 .../io/FileCommitProtocolInstantiationSuite.scala        | 2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
index d250601349429..af0aa41518766 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapRedCommitProtocol.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.internal.io
 
-import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapred._
 import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
 
@@ -28,8 +27,7 @@ import org.apache.hadoop.mapreduce.{TaskAttemptContext => NewTaskAttemptContext}
  * Unlike Hadoop's OutputCommitter, this implementation is serializable.
  */
 class HadoopMapRedCommitProtocol(jobId: String, path: String)
-  extends HadoopMapReduceCommitProtocol(
-    jobId, path, new Path(path, ".spark-staging-" + jobId).toString) {
+  extends HadoopMapReduceCommitProtocol(jobId, path) {
 
   override def setupCommitter(context: NewTaskAttemptContext): OutputCommitter = {
     val config = context.getConfiguration.asInstanceOf[JobConf]
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 20b2be17a305d..6f2b73d34c0c6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -67,10 +67,17 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String = "",
+    stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
+  def this(jobId: String, path: String) =
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString)
+
+  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString,
+      dynamicPartitionOverwrite)
+
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 87ffe0b69a469..77f42577517e8 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -119,7 +119,7 @@ private class FullConstructorCommitProtocol(
   arg2: String,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, "", b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
 
   def this(arg1: String, arg2: String) = {
     this(arg1, arg2, false, 2)

From 63e466c5ade9078489e0990acdd5c5f1dd97ac10 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 18:07:59 +0800
Subject: [PATCH 19/42] update

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala   | 2 +-
 .../internal/io/FileCommitProtocolInstantiationSuite.scala  | 6 ++++--
 .../spark/internal/io/cloud/PathOutputCommitProtocol.scala  | 3 +--
 .../apache/spark/sql/execution/metric/SQLMetricsSuite.scala | 3 +--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 23f640b40af87..daa6addc0a9f6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging {
         classOf[String], classOf[String], classOf[String], classOf[Boolean])
       logDebug("Using (String, String, Configuration,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
+        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString),
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 77f42577517e8..778f748f83950 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -58,7 +58,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[Other].getCanonicalName,
         "job",
-        "path")
+        "path",
+        false)
     }
   }
 
@@ -67,7 +68,8 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite {
       FileCommitProtocol.instantiate(
         classOf[NoMatchingArgs].getCanonicalName,
         "job",
-        "path")
+        "path",
+        false)
     }
   }
 
diff --git a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
index c6497fc47d16f..fc5d0a0b3a7f5 100644
--- a/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
+++ b/hadoop-cloud/src/hadoop-3/main/scala/org/apache/spark/internal/io/cloud/PathOutputCommitProtocol.scala
@@ -19,7 +19,6 @@ package org.apache.spark.internal.io.cloud
 
 import java.io.IOException
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, PathOutputCommitter, PathOutputCommitterFactory}
@@ -52,7 +51,7 @@ class PathOutputCommitProtocol(
     jobId: String,
     dest: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, dest, new Configuration(), false) with Serializable {
+  extends HadoopMapReduceCommitProtocol(jobId, dest, false) with Serializable {
 
   if (dynamicPartitionOverwrite) {
     // until there's explicit extensions to the PathOutputCommitProtocols
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 46d44e3b1a95f..f90c5064b0fcc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -835,8 +835,7 @@ case class CustomFileCommitProtocol(
     path: String,
     stagingPath: String,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, stagingPath, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))

From fd2ac5f62e28959ce4c6bd5f92718a5eba19c213 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 13 Oct 2021 18:18:37 +0800
Subject: [PATCH 20/42] update

---
 .../spark/internal/io/FileCommitProtocol.scala   |  8 ++++----
 .../io/HadoopMapReduceCommitProtocol.scala       | 16 ++++++----------
 .../InsertIntoHadoopFsRelationCommand.scala      |  2 +-
 .../SQLHadoopMapReduceCommitProtocol.scala       |  4 ++--
 .../sql/execution/metric/SQLMetricsSuite.scala   |  5 +++--
 .../sql/sources/PartitionedWriteSuite.scala      |  9 ++++-----
 6 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index daa6addc0a9f6..151fcdbec17a5 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -232,7 +232,7 @@ object FileCommitProtocol extends Logging {
       jobId: String,
       outputPath: String,
       dynamicPartitionOverwrite: Boolean = false,
-      stagingDir: Option[String] = None): FileCommitProtocol = {
+      stagingDir: Option[Path] = None): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
       s" dynamic=$dynamicPartitionOverwrite")
@@ -242,10 +242,10 @@ object FileCommitProtocol extends Logging {
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
       val ctor = clazz.getDeclaredConstructor(
-        classOf[String], classOf[String], classOf[String], classOf[Boolean])
-      logDebug("Using (String, String, Configuration,  Boolean) constructor")
+        classOf[String], classOf[String], classOf[Path], classOf[Boolean])
+      logDebug("Using (String, String, Path,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId).toString),
+        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 6f2b73d34c0c6..27e155c5733c7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -40,6 +40,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
  *
  * @param jobId the job's or stage's id
  * @param path the job's output path, or null if committer acts as a noop
+ * @param stagingDir The staging directory of this write job. Spark uses it to deal with
+ *                   files with absolute output path, or writing data into partitioned directory
+ *                   with dynamicPartitionOverwrite=true
  * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime
  *                                  dynamically. Suppose final path is /path/to/outputPath, output
  *                                  path of [[FileOutputCommitter]] is an intermediate path, e.g.
@@ -67,16 +70,15 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
   def this(jobId: String, path: String) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString)
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId))
 
   def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId).toString,
-      dynamicPartitionOverwrite)
+    this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite)
 
   import FileCommitProtocol._
 
@@ -108,12 +110,6 @@ class HadoopMapReduceCommitProtocol(
    */
   @transient private var partitionPaths: mutable.Set[String] = null
 
-  /**
-   * The staging directory of this write job. Spark uses it to deal with files with absolute output
-   * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
-   */
-  protected var stagingDir: Path = new Path(stagingPath)
-
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
     // If OutputFormat is Configurable, we should set conf to it.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index dc6ad7eb8b95f..6605bb6c1e870 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -114,7 +114,7 @@ case class InsertIntoHadoopFsRelationCommand(
       jobId = jobId,
       outputPath = outputPath.toString,
       dynamicPartitionOverwrite = dynamicPartitionOverwrite,
-      stagingDir = Some(tmpLocation.toString))
+      stagingDir = Some(tmpLocation))
 
     val doInsertion = if (mode == SaveMode.Append) {
       true
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 7ed76643a4eb2..34ce32e710c48 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -32,9 +32,9 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index f90c5064b0fcc..08c86e6edbb08 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,6 +22,7 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -833,9 +834,9 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
 case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 0fb0d85ced2b9..4fffe8f560345 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -37,9 +37,9 @@ import org.apache.spark.util.Utils
 private class OnlyDetectCustomPathFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingPath, dynamicPartitionOverwrite)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -202,10 +202,9 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
-    stagingPath: String,
+    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(
-    jobId, path, stagingPath, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")

From e2951f128b91574e8019bc9e8f3a1bf666f73008 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 08:33:31 +0800
Subject: [PATCH 21/42] fix  UT

---
 .../spark/internal/io/FileCommitProtocol.scala      |  9 ++++++++-
 .../internal/io/HadoopMapReduceCommitProtocol.scala |  5 +++--
 .../io/FileCommitProtocolInstantiationSuite.scala   | 13 ++++++++-----
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 151fcdbec17a5..ab6dbd97f11d5 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -245,7 +245,7 @@ object FileCommitProtocol extends Logging {
         classOf[String], classOf[String], classOf[Path], classOf[Boolean])
       logDebug("Using (String, String, Path,  Boolean) constructor")
       ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(new Path(outputPath, ".spark-staging-" + jobId)),
+        stagingDir.getOrElse(getStagingDir(outputPath, jobId)),
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
@@ -258,6 +258,13 @@ object FileCommitProtocol extends Logging {
     }
   }
 
+  def getStagingDir(path: String, jobId: String): Path = {
+    try {
+      new Path(path, ".spark-staging-" + jobId)
+    } catch {
+      case _: Exception => null
+    }
+  }
 
   def newVersionExternalTempPath(
       path: Path,
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 27e155c5733c7..4c8dd50c146cc 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
 /**
@@ -75,10 +76,10 @@ class HadoopMapReduceCommitProtocol(
   extends FileCommitProtocol with Serializable with Logging {
 
   def this(jobId: String, path: String) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId))
+    this(jobId, path, getStagingDir(path, jobId))
 
   def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, new Path(path, ".spark-staging-" + jobId), dynamicPartitionOverwrite)
+    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
 
   import FileCommitProtocol._
 
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 778f748f83950..26f92cb669b6b 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.internal.io
 
+import org.apache.hadoop.fs.Path
+
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -119,16 +121,17 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String)
 private class FullConstructorCommitProtocol(
   arg1: String,
   arg2: String,
+  arg3: Path,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) {
 
-  def this(arg1: String, arg2: String) = {
-    this(arg1, arg2, false, 2)
+  def this(arg1: String, arg2: String, arg3: Path) = {
+    this(arg1, arg2, arg3, false, 2)
   }
 
-  def this(arg1: String, arg2: String, b: Boolean) = {
-    this(arg1, arg2, false, 3)
+  def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = {
+    this(arg1, arg2, arg3, false, 3)
   }
 }
 

From c8d0c33b25cf06409798bdc18417887e0f1d1d1c Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 08:49:20 +0800
Subject: [PATCH 22/42] complicated

---
 .../internal/io/FileCommitProtocol.scala      | 21 +++++++++++++------
 ...FileCommitProtocolInstantiationSuite.scala | 13 +++++-------
 .../SQLHadoopMapReduceCommitProtocol.scala    |  7 +++++++
 .../sql/sources/PartitionedWriteSuite.scala   | 11 +++-------
 4 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index ab6dbd97f11d5..7c67e3441b33a 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -249,12 +249,21 @@ object FileCommitProtocol extends Logging {
         dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
-        logDebug("Falling back to (String, String) constructor")
-        require(!dynamicPartitionOverwrite,
-          "Dynamic Partition Overwrite is enabled but" +
-            s" the committer ${className} does not have the appropriate constructor")
-        val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
-        ctor.newInstance(jobId, outputPath)
+        try {
+          logDebug("Falling back to (String, String, Boolean) constructor")
+          val ctor = clazz.getDeclaredConstructor(
+            classOf[String], classOf[String], classOf[Boolean])
+          ctor.newInstance(jobId, outputPath,
+            dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+        } catch {
+          case _: NoSuchMethodException =>
+            logDebug("Falling back to (String, String) constructor")
+            require(!dynamicPartitionOverwrite,
+              "Dynamic Partition Overwrite is enabled but" +
+                s" the committer ${className} does not have the appropriate constructor")
+            val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
+            ctor.newInstance(jobId, outputPath)
+        }
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
index 26f92cb669b6b..778f748f83950 100644
--- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.internal.io
 
-import org.apache.hadoop.fs.Path
-
 import org.apache.spark.SparkFunSuite
 
 /**
@@ -121,17 +119,16 @@ private class ClassicConstructorCommitProtocol(arg1: String, arg2: String)
 private class FullConstructorCommitProtocol(
   arg1: String,
   arg2: String,
-  arg3: Path,
   b: Boolean,
   val argCount: Int)
-  extends HadoopMapReduceCommitProtocol(arg1, arg2, arg3, b) {
+  extends HadoopMapReduceCommitProtocol(arg1, arg2, b) {
 
-  def this(arg1: String, arg2: String, arg3: Path) = {
-    this(arg1, arg2, arg3, false, 2)
+  def this(arg1: String, arg2: String) = {
+    this(arg1, arg2, false, 2)
   }
 
-  def this(arg1: String, arg2: String, arg3: Path, b: Boolean) = {
-    this(arg1, arg2, arg3, false, 3)
+  def this(arg1: String, arg2: String, b: Boolean) = {
+    this(arg1, arg2, false, 3)
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 34ce32e710c48..a6ecffdcaeb5a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -22,6 +22,7 @@ import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
 import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
 import org.apache.spark.sql.internal.SQLConf
 
@@ -37,6 +38,12 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
+  def this(jobId: String, path: String) =
+    this(jobId, path, getStagingDir(path, jobId), false)
+
+  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
+    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
+
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
index 4fffe8f560345..b9266429f81a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala
@@ -34,12 +34,8 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
-private class OnlyDetectCustomPathFileCommitProtocol(
-    jobId: String,
-    path: String,
-    stagingDir: Path,
-    dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
+private class OnlyDetectCustomPathFileCommitProtocol(jobId: String, path: String)
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path)
     with Serializable with Logging {
 
   override def newTaskTempFileAbsPath(
@@ -202,9 +198,8 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession {
 private class PartitionFileExistCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
   override def setupJob(jobContext: JobContext): Unit = {
     super.setupJob(jobContext)
     val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId")

From eafb8dd5b81c587280adddcd9183ee26b2e08c75 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 10:49:26 +0800
Subject: [PATCH 23/42] update

---
 .../internal/io/FileCommitProtocol.scala      | 20 ++++++++++---------
 .../io/HadoopMapReduceCommitProtocol.scala    | 18 ++++-------------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 7c67e3441b33a..b32a3979a8d5f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -21,6 +21,8 @@ import java.net.URI
 import java.text.SimpleDateFormat
 import java.util.{Date, Locale, Random}
 
+import scala.util.Try
+
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
@@ -71,7 +73,12 @@ abstract class FileCommitProtocol extends Logging {
   def getWorkPath(): Path = null
 
   /**
-   * Predicate: is there an output path?
+   * Checks whether there are files to be committed to a valid output location.
+   *
+   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
+   * it is necessary to check whether a valid output path is specified.
+   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
+   * committers not writing to distributed file systems.
    */
   def hasOutputPath(): Boolean = {
     getOutputPath() != null;
@@ -268,11 +275,7 @@ object FileCommitProtocol extends Logging {
   }
 
   def getStagingDir(path: String, jobId: String): Path = {
-    try {
-      new Path(path, ".spark-staging-" + jobId)
-    } catch {
-      case _: Exception => null
-    }
+    Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null)
   }
 
   def newVersionExternalTempPath(
@@ -290,15 +293,14 @@ object FileCommitProtocol extends Logging {
     }
   }
 
-
   private def getExtTmpPathRelTo(
       path: Path,
       hadoopConf: Configuration,
       stagingDir: String,
       engineType: String,
       jobId: String): Path = {
-    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId),
-      "-ext-10000") // Hive uses 10000
+    // Hive uses 10000
+    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), "-ext-10000")
   }
 
   private def getExternalScratchDir(
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 4c8dd50c146cc..fff87da244c0c 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -86,16 +86,6 @@ class HadoopMapReduceCommitProtocol(
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
   @transient private var committer: OutputCommitter = _
 
-  /**
-   * Checks whether there are files to be committed to a valid output location.
-   *
-   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
-   * it is necessary to check whether a valid output path is specified.
-   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
-   * committers not writing to distributed file systems.
-   */
-  private val hasValidPath = Try { new Path(path) }.isSuccess
-
   /**
    * Tracks files staged by this task for absolute output paths. These outputs are not managed by
    * the Hadoop OutputCommitter, so we must move these to their final locations on job commit.
@@ -121,13 +111,13 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
-  override def getOutputPath(): Path = new Path(path)
+  override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null)
 
   override def getWorkPath: Path = {
     if (dynamicPartitionOverwrite) {
       stagingDir
     } else {
-      new Path(path)
+      getOutputPath()
     }
   }
 
@@ -206,7 +196,7 @@ class HadoopMapReduceCommitProtocol(
   override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
     committer.commitJob(jobContext)
 
-    if (hasValidPath) {
+    if (hasOutputPath()) {
       val (allAbsPathFiles, allPartitionPaths) =
         taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip
       val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
@@ -270,7 +260,7 @@ class HadoopMapReduceCommitProtocol(
         logWarning(s"Exception while aborting ${jobContext.getJobID}", e)
     }
     try {
-      if (hasValidPath) {
+      if (hasOutputPath()) {
         val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
         fs.delete(stagingDir, true)
       }

From ff2bfb89ea40c2626b430aa6c06614e6a9228503 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 11:30:10 +0800
Subject: [PATCH 24/42] revert API change

---
 .../internal/io/FileCommitProtocol.scala      | 27 -------------------
 .../io/HadoopMapReduceCommitProtocol.scala    | 24 ++++++++---------
 .../InsertIntoHadoopFsRelationCommand.scala   | 10 ++++++-
 .../ManifestFileCommitProtocol.scala          |  4 ---
 .../execution/metric/SQLMetricsSuite.scala    |  4 +--
 5 files changed, 22 insertions(+), 47 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index b32a3979a8d5f..f2f26297e02c7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -57,33 +57,6 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
-  /**
-   * Get the final directory where work will be placed once the job
-   * is committed. This may be null, in which case, there is no output
-   * path to write data to.
-   */
-  def getOutputPath(): Path = null
-
-  /**
-   * Get the directory that the task should write results into.
-   * Warning: there's no guarantee that this work path is on the same
-   * FS as the final output, or that it's visible across machines.
-   * May be null.
-   */
-  def getWorkPath(): Path = null
-
-  /**
-   * Checks whether there are files to be committed to a valid output location.
-   *
-   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
-   * it is necessary to check whether a valid output path is specified.
-   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
-   * committers not writing to distributed file systems.
-   */
-  def hasOutputPath(): Boolean = {
-    getOutputPath() != null;
-  }
-
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index fff87da244c0c..35bad427253ed 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -86,6 +86,16 @@ class HadoopMapReduceCommitProtocol(
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
   @transient private var committer: OutputCommitter = _
 
+  /**
+   * Checks whether there are files to be committed to a valid output location.
+   *
+   * As committing and aborting a job occurs on driver, where `addedAbsPathFiles` is always null,
+   * it is necessary to check whether a valid output path is specified.
+   * [[HadoopMapReduceCommitProtocol#path]] need not be a valid [[org.apache.hadoop.fs.Path]] for
+   * committers not writing to distributed file systems.
+   */
+  private val hasValidPath = Try { new Path(path) }.isSuccess
+
   /**
    * Tracks files staged by this task for absolute output paths. These outputs are not managed by
    * the Hadoop OutputCommitter, so we must move these to their final locations on job commit.
@@ -111,16 +121,6 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
-  override def getOutputPath(): Path = Try { new Path(path) }.getOrElse(null)
-
-  override def getWorkPath: Path = {
-    if (dynamicPartitionOverwrite) {
-      stagingDir
-    } else {
-      getOutputPath()
-    }
-  }
-
   override def newTaskTempFile(
       taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
     newTaskTempFile(taskContext, dir, FileNameSpec("", ext))
@@ -196,7 +196,7 @@ class HadoopMapReduceCommitProtocol(
   override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
     committer.commitJob(jobContext)
 
-    if (hasOutputPath()) {
+    if (hasValidPath) {
       val (allAbsPathFiles, allPartitionPaths) =
         taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip
       val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
@@ -260,7 +260,7 @@ class HadoopMapReduceCommitProtocol(
         logWarning(s"Exception while aborting ${jobContext.getJobID}", e)
     }
     try {
-      if (hasOutputPath()) {
+      if (hasValidPath) {
         val fs = stagingDir.getFileSystem(jobContext.getConfiguration)
         fs.delete(stagingDir, true)
       }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 6605bb6c1e870..e0461766097c6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -167,6 +167,14 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
+      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
+      // will be renamed from staging path to final output path during commit job
+      val committerOutputPath = if (dynamicPartitionOverwrite) {
+        tmpLocation
+      } else {
+        qualifiedOutputPath
+      }
+
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -174,7 +182,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committer.getWorkPath.toString, customPartitionLocations, outputColumns),
+            committerOutputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
index 09681f882853f..46ce33687890d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala
@@ -56,10 +56,6 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     this.batchId = batchId
   }
 
-  override def getOutputPath: Path = new Path(path)
-
-  override def getWorkPath(): Path = new Path(path)
-
   override def setupJob(jobContext: JobContext): Unit = {
     require(fileLog != null, "setupManifestOptions must be called before this function")
     pendingCommitFiles = new ArrayBuffer[Path]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
index 08c86e6edbb08..32428fbde0016 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -22,7 +22,6 @@ import java.io.File
 import scala.reflect.{classTag, ClassTag}
 import scala.util.Random
 
-import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
 import org.apache.spark.internal.io.FileCommitProtocol
@@ -834,9 +833,8 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
 case class CustomFileCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends SQLHadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite) {
+  extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) {
   override def commitTask(
     taskContext: TaskAttemptContext): FileCommitProtocol.TaskCommitMessage = {
     Thread.sleep(Random.nextInt(100))

From c5a1d16d55591edfc539490d193a77eb38a80ea4 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 14 Oct 2021 11:36:11 +0800
Subject: [PATCH 25/42] update

---
 .../org/apache/spark/internal/io/FileCommitProtocol.scala      | 2 +-
 .../datasources/InsertIntoHadoopFsRelationCommand.scala        | 3 +--
 .../org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala   | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index f2f26297e02c7..c421c44414b7f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -251,7 +251,7 @@ object FileCommitProtocol extends Logging {
     Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null)
   }
 
-  def newVersionExternalTempPath(
+  def externalTempPath(
       path: Path,
       hadoopConf: Configuration,
       stagingDir: String,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index e0461766097c6..45ba105c501fa 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -21,7 +21,6 @@ import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.FileCommitProtocol
-import org.apache.spark.internal.io.FileCommitProtocol.newVersionExternalTempPath
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition}
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
@@ -107,7 +106,7 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val jobId = java.util.UUID.randomUUID().toString
-    val tmpLocation = newVersionExternalTempPath(new Path(outputPath.toString), hadoopConf,
+    val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf,
       hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 8c6c10fee7935..c3e9deacd7e80 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -134,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      val externalTempPath = FileCommitProtocol.newVersionExternalTempPath(
+      val externalTempPath = FileCommitProtocol.externalTempPath(
         path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
       createdTempDir = Some(externalTempPath.getParent)
       externalTempPath

From 8b93bad3608cbf20992beec4451c0f6f20183269 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Tue, 19 Oct 2021 23:40:46 +0800
Subject: [PATCH 26/42] Update package.scala

---
 .../main/scala/org/apache/spark/internal/config/package.scala  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index 8e24105d7742b..64faac9b4a53d 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -2263,7 +2263,8 @@ package object config {
     ConfigBuilder("spark.exec.stagingDir")
       .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
         "absolute output path, or writing data into partitioned directory when " +
-        "dynamic partition overwrite mode.")
+        "dynamic partition overwrite mode. " +
+        "Default value means staging dir is under table path.")
       .version("3.3.0")
       .internal()
       .stringConf

From a0e0ec9d4cc9a863f1691f991ed151a06a110d9c Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 20 Oct 2021 16:23:18 +0800
Subject: [PATCH 27/42] follow comment

---
 .../internal/io/FileCommitProtocol.scala      | 52 +++++++++----------
 .../io/HadoopMapReduceCommitProtocol.scala    | 27 ++++++----
 .../InsertIntoHadoopFsRelationCommand.scala   | 16 +-----
 .../SQLHadoopMapReduceCommitProtocol.scala    | 16 +++---
 4 files changed, 51 insertions(+), 60 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index c421c44414b7f..8c96fcbe8a44d 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -21,8 +21,6 @@ import java.net.URI
 import java.text.SimpleDateFormat
 import java.util.{Date, Locale, Random}
 
-import scala.util.Try
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.mapreduce._
@@ -57,6 +55,21 @@ import org.apache.spark.util.Utils
 abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
+  /**
+   * Get the final directory where work will be placed once the job
+   * is org.apache.hadoop.shaded.com.itted. This may be null, in which case, there is no output
+   * path to write data to.
+   */
+  def getOutputPath: Path = null
+
+  /**
+   * Get the directory that the task should write results into.
+   * Warning: there's no guarantee that this work path is on the same
+   * FS as the final output, or that it's visible across machines.
+   * May be null.
+   */
+  def getWorkPath: Path = null
+
   /**
    * Setups up a job. Must be called on the driver before any other methods can be invoked.
    */
@@ -211,8 +224,7 @@ object FileCommitProtocol extends Logging {
       className: String,
       jobId: String,
       outputPath: String,
-      dynamicPartitionOverwrite: Boolean = false,
-      stagingDir: Option[Path] = None): FileCommitProtocol = {
+      dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = {
 
     logDebug(s"Creating committer $className; job $jobId; output=$outputPath;" +
       s" dynamic=$dynamicPartitionOverwrite")
@@ -221,34 +233,22 @@ object FileCommitProtocol extends Logging {
     // dynamicPartitionOverwrite: Boolean).
     // If that doesn't exist, try the one with (jobId: string, outputPath: String).
     try {
-      val ctor = clazz.getDeclaredConstructor(
-        classOf[String], classOf[String], classOf[Path], classOf[Boolean])
-      logDebug("Using (String, String, Path,  Boolean) constructor")
-      ctor.newInstance(jobId, outputPath,
-        stagingDir.getOrElse(getStagingDir(outputPath, jobId)),
-        dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
+      val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean])
+      logDebug("Using (String, String, Boolean) constructor")
+      ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
     } catch {
       case _: NoSuchMethodException =>
-        try {
-          logDebug("Falling back to (String, String, Boolean) constructor")
-          val ctor = clazz.getDeclaredConstructor(
-            classOf[String], classOf[String], classOf[Boolean])
-          ctor.newInstance(jobId, outputPath,
-            dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean])
-        } catch {
-          case _: NoSuchMethodException =>
-            logDebug("Falling back to (String, String) constructor")
-            require(!dynamicPartitionOverwrite,
-              "Dynamic Partition Overwrite is enabled but" +
-                s" the committer ${className} does not have the appropriate constructor")
-            val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
-            ctor.newInstance(jobId, outputPath)
-        }
+        logDebug("Falling back to (String, String) constructor")
+        require(!dynamicPartitionOverwrite,
+          "Dynamic Partition Overwrite is enabled but" +
+            s" the committer ${className} does not have the appropriate constructor")
+        val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String])
+        ctor.newInstance(jobId, outputPath)
     }
   }
 
   def getStagingDir(path: String, jobId: String): Path = {
-    Try { new Path(path, ".spark-staging-" + jobId) }.getOrElse(null)
+    new Path(path, ".spark-staging-" + jobId)
   }
 
   def externalTempPath(
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 35bad427253ed..33f5249c936df 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -30,7 +30,6 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
 import org.apache.spark.mapred.SparkHadoopMapRedUtil
 
 /**
@@ -41,9 +40,6 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
  *
  * @param jobId the job's or stage's id
  * @param path the job's output path, or null if committer acts as a noop
- * @param stagingDir The staging directory of this write job. Spark uses it to deal with
- *                   files with absolute output path, or writing data into partitioned directory
- *                   with dynamicPartitionOverwrite=true
  * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime
  *                                  dynamically. Suppose final path is /path/to/outputPath, output
  *                                  path of [[FileOutputCommitter]] is an intermediate path, e.g.
@@ -71,16 +67,9 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 class HadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
   extends FileCommitProtocol with Serializable with Logging {
 
-  def this(jobId: String, path: String) =
-    this(jobId, path, getStagingDir(path, jobId))
-
-  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
-
   import FileCommitProtocol._
 
   /** OutputCommitter from Hadoop is not serializable so marking it transient. */
@@ -111,6 +100,12 @@ class HadoopMapReduceCommitProtocol(
    */
   @transient private var partitionPaths: mutable.Set[String] = null
 
+  /**
+   * The staging directory of this write job. Spark uses it to deal with files with absolute output
+   * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
+   */
+  protected val stagingDir = getStagingDir(path, jobId)
+
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()
     // If OutputFormat is Configurable, we should set conf to it.
@@ -121,6 +116,16 @@ class HadoopMapReduceCommitProtocol(
     format.getOutputCommitter(context)
   }
 
+  override def getOutputPath: Path = {
+    if (dynamicPartitionOverwrite) {
+      stagingDir
+    } else {
+      new Path(path)
+    }
+  }
+
+  override def getWorkPath: Path = getOutputPath
+
   override def newTaskTempFile(
       taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = {
     newTaskTempFile(taskContext, dir, FileNameSpec("", ext))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
index 45ba105c501fa..f2a8c2372b551 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.hadoop.fs.{FileSystem, Path}
 
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition}
@@ -106,14 +105,11 @@ case class InsertIntoHadoopFsRelationCommand(
     }
 
     val jobId = java.util.UUID.randomUUID().toString
-    val tmpLocation = FileCommitProtocol.externalTempPath(new Path(outputPath.toString), hadoopConf,
-      hadoopConf.get(EXEC_STAGING_DIR.key, ".spark-staging"), "spark", jobId)
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = jobId,
       outputPath = outputPath.toString,
-      dynamicPartitionOverwrite = dynamicPartitionOverwrite,
-      stagingDir = Some(tmpLocation))
+      dynamicPartitionOverwrite = dynamicPartitionOverwrite)
 
     val doInsertion = if (mode == SaveMode.Append) {
       true
@@ -166,14 +162,6 @@ case class InsertIntoHadoopFsRelationCommand(
         }
       }
 
-      // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files
-      // will be renamed from staging path to final output path during commit job
-      val committerOutputPath = if (dynamicPartitionOverwrite) {
-        tmpLocation
-      } else {
-        qualifiedOutputPath
-      }
-
       val updatedPartitionPaths =
         FileFormatWriter.write(
           sparkSession = sparkSession,
@@ -181,7 +169,7 @@ case class InsertIntoHadoopFsRelationCommand(
           fileFormat = fileFormat,
           committer = committer,
           outputSpec = FileFormatWriter.OutputSpec(
-            committerOutputPath.toString, customPartitionLocations, outputColumns),
+            committer.getOutputPath.toString, customPartitionLocations, outputColumns),
           hadoopConf = hadoopConf,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index a6ecffdcaeb5a..8430a77716630 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -21,9 +21,10 @@ import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
 import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.io.FileCommitProtocol.getStagingDir
-import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
+import org.apache.spark.internal.config.EXEC_STAGING_DIR
+import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.internal.SQLConf
 
 /**
@@ -33,16 +34,13 @@ import org.apache.spark.sql.internal.SQLConf
 class SQLHadoopMapReduceCommitProtocol(
     jobId: String,
     path: String,
-    stagingDir: Path,
     dynamicPartitionOverwrite: Boolean = false)
-  extends HadoopMapReduceCommitProtocol(jobId, path, stagingDir, dynamicPartitionOverwrite)
+  extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  def this(jobId: String, path: String) =
-    this(jobId, path, getStagingDir(path, jobId), false)
-
-  def this(jobId: String, path: String, dynamicPartitionOverwrite: Boolean) =
-    this(jobId, path, getStagingDir(path, jobId), dynamicPartitionOverwrite)
+  override val stagingDir: Path =
+    FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
+      SQLConf.get.getConfString(EXEC_STAGING_DIR.key), "spark", jobId)
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)

From 872cd01a9741488f76f8983901b52820946afc46 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 20 Oct 2021 16:26:08 +0800
Subject: [PATCH 28/42] update

---
 .../apache/spark/internal/io/FileCommitProtocol.scala    | 9 +++++----
 .../internal/io/HadoopMapReduceCommitProtocol.scala      | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 8c96fcbe8a44d..bdbef8d63bfbc 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -56,9 +56,9 @@ abstract class FileCommitProtocol extends Logging {
   import FileCommitProtocol._
 
   /**
-   * Get the final directory where work will be placed once the job
-   * is org.apache.hadoop.shaded.com.itted. This may be null, in which case, there is no output
-   * path to write data to.
+   * Get the final directory where the result data will be placed once the job
+   * is committed. This may be null, in which case, there is no output
+   * path to write data to and won't write any data.
    */
   def getOutputPath: Path = null
 
@@ -66,7 +66,8 @@ abstract class FileCommitProtocol extends Logging {
    * Get the directory that the task should write results into.
    * Warning: there's no guarantee that this work path is on the same
    * FS as the final output, or that it's visible across machines.
-   * May be null.
+   * May be null, in which case, there is no output path to write data to
+   * and won't write any data.
    */
   def getWorkPath: Path = null
 
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 33f5249c936df..e40de79fc024e 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -104,7 +104,7 @@ class HadoopMapReduceCommitProtocol(
    * The staging directory of this write job. Spark uses it to deal with files with absolute output
    * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true.
    */
-  protected val stagingDir = getStagingDir(path, jobId)
+  protected def stagingDir = getStagingDir(path, jobId)
 
   protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     val format = context.getOutputFormatClass.getConstructor().newInstance()

From d664932164333398fd0a9b9c979b9907d58d8e02 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 20 Oct 2021 18:36:17 +0800
Subject: [PATCH 29/42] update

---
 .../org/apache/spark/internal/config/package.scala  | 11 -----------
 .../org/apache/spark/sql/internal/SQLConf.scala     | 13 +++++++++++++
 .../SQLHadoopMapReduceCommitProtocol.scala          |  4 ++--
 .../spark/sql/sources/StagingInsertSuite.scala      |  3 +--
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index 64faac9b4a53d..4b7ba9b19f4de 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -2258,15 +2258,4 @@ package object config {
       .version("3.2.0")
       .stringConf
       .createOptional
-
-  val EXEC_STAGING_DIR =
-    ConfigBuilder("spark.exec.stagingDir")
-      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
-        "absolute output path, or writing data into partitioned directory when " +
-        "dynamic partition overwrite mode. " +
-        "Default value means staging dir is under table path.")
-      .version("3.3.0")
-      .internal()
-      .stringConf
-      .createWithDefault(".spark-staging")
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 98aad1c9c83f0..ccffc89f6d98d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3394,6 +3394,17 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val EXEC_STAGING_DIR =
+    ConfigBuilder("spark.sql.exec.stagingDir")
+      .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
+        "absolute output path, or writing data into partitioned directory when " +
+        "dynamic partition overwrite mode. " +
+        "Default value means staging dir is under table path.")
+      .version("3.3.0")
+      .internal()
+      .stringConf
+      .createWithDefault(".spark-staging")
+
   /**
    * Holds information about keys that have been deprecated.
    *
@@ -4115,6 +4126,8 @@ class SQLConf extends Serializable with Logging {
 
   def inferDictAsStruct: Boolean = getConf(SQLConf.INFER_NESTED_DICT_AS_STRUCT)
 
+  def stagingDir: String = getConf(SQLConf.EXEC_STAGING_DIR)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 8430a77716630..337fd34c37df2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -23,9 +23,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.EXEC_STAGING_DIR
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual
@@ -40,7 +40,7 @@ class SQLHadoopMapReduceCommitProtocol(
 
   override val stagingDir: Path =
     FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
-      SQLConf.get.getConfString(EXEC_STAGING_DIR.key), "spark", jobId)
+      SQLConf.get.stagingDir, "spark", jobId)
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
index 162cef0cca3f8..bd935d2567269 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.sources
 
 import org.apache.spark.SparkConf
-import org.apache.spark.internal.config.EXEC_STAGING_DIR
 import org.apache.spark.sql.{QueryTest, Row}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -30,7 +29,7 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession {
   val stagingDir = Utils.createTempDir()
 
   override def sparkConf: SparkConf =
-    super.sparkConf.set(EXEC_STAGING_DIR, stagingDir.getAbsolutePath)
+    super.sparkConf.set(SQLConf.EXEC_STAGING_DIR, stagingDir.getAbsolutePath)
 
   override def beforeAll(): Unit = {
     super.beforeAll()

From 997ba25dcaf52d3417739b1517f0038d05139cdb Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Wed, 20 Oct 2021 21:29:23 +0800
Subject: [PATCH 30/42] Update InsertSuite.scala

---
 .../org/apache/spark/sql/hive/InsertSuite.scala   | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
index 1dfeedcd9a18b..2289d90e7cd06 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -543,28 +543,23 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
     val inputPath = new Path("/tmp/b/c")
     var stagingDir = "tmp/b"
     val id = TaskRunner.getTaskRunnerID
-    var path = FileCommitProtocol.
-      getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
+    var path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b_hive_") != -1)
 
     stagingDir = "tmp/b/c"
-    path = FileCommitProtocol.getStagingDir(
-      inputPath, conf, stagingDir, "hive", id.toString)
+    path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = "d/e"
-    path = FileCommitProtocol.getStagingDir(
-      inputPath, conf, stagingDir, "hive", id.toString)
+    path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.hive-staging_hive_") != -1)
 
     stagingDir = ".d/e"
-    path = FileCommitProtocol.getStagingDir(
-      inputPath, conf, stagingDir, "hive", id.toString)
+    path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/b/c/.d/e_hive_") != -1)
 
     stagingDir = "/tmp/c/"
-    path = FileCommitProtocol.getStagingDir(
-      inputPath, conf, stagingDir, "hive", id.toString)
+    path = FileCommitProtocol.getStagingDir(inputPath, conf, stagingDir, "hive", id.toString)
     assert(path.toString.indexOf("/tmp/c_hive_") != -1)
   }
 

From 44d63de2f83fb37887c32ebc2d8cc789f1ceed9b Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 21 Oct 2021 00:05:57 +0800
Subject: [PATCH 31/42] Update SQLHadoopMapReduceCommitProtocol.scala

---
 .../execution/datasources/SQLHadoopMapReduceCommitProtocol.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 337fd34c37df2..9e0b2a1a00552 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -25,7 +25,6 @@ import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol}
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.SQLConf.EXEC_STAGING_DIR
 
 /**
  * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual

From ed588842aa47337c40efd93618d436d025a0c7ac Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 21 Oct 2021 10:39:27 +0800
Subject: [PATCH 32/42] Update SQLConf.scala

---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 19da5300e9e30..d87971966b4cb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3412,8 +3412,7 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
-  val EXEC_STAGING_DIR =
-    ConfigBuilder("spark.sql.exec.stagingDir")
+  val EXEC_STAGING_DIR = buildConf("spark.sql.exec.stagingDir")
       .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
         "absolute output path, or writing data into partitioned directory when " +
         "dynamic partition overwrite mode. " +

From 632d7255c4ddb1aeb367ed6c0123121348975e7e Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Sun, 24 Oct 2021 01:50:12 +0800
Subject: [PATCH 33/42] Update SQLConf.scala

---
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index d87971966b4cb..ba7ff98dc8018 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3414,12 +3414,13 @@ object SQLConf {
 
   val EXEC_STAGING_DIR = buildConf("spark.sql.exec.stagingDir")
       .doc("The staging directory of Spark job. Spark uses it to deal with files with " +
-        "absolute output path, or writing data into partitioned directory when " +
-        "dynamic partition overwrite mode. " +
-        "Default value means staging dir is under table path.")
+        "absolute output path, or writing data into partitioned directory " +
+        "when dynamic partition overwrite mode is on. " +
+        "Default value means staging directory is under table path.")
       .version("3.3.0")
       .internal()
       .stringConf
+      .checkValue(!_.isEmpty, "Should not pass an empty string as staging diretory.")
       .createWithDefault(".spark-staging")
 
   val LEGACY_USE_V1_COMMAND =

From cc8a4c3f6a315023a0d749d69580ba2cce3e2e64 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 10 Mar 2022 10:18:57 +0800
Subject: [PATCH 34/42] Update SQLConf.scala

---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 88d1f9883c0e9..7f66b6986cee4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3562,7 +3562,7 @@ object SQLConf {
       .version("3.3.0")
       .internal()
       .stringConf
-      .checkValue(!_.isEmpty, "Should not pass an empty string as staging diretory.")
+      .checkValue(!_.isEmpty, "Should not pass an empty string as staging directory.")
       .createWithDefault(".spark-staging")
 
   val LEGACY_USE_V1_COMMAND =

From 1c5a70a2c9e2e3a00a198119eb8047c1b7213465 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 10 Mar 2022 10:56:18 +0800
Subject: [PATCH 35/42] Update FileCommitProtocol.scala

---
 .../scala/org/apache/spark/internal/io/FileCommitProtocol.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index 096d139a2a0bf..b33899936b7d6 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -321,7 +321,7 @@ object FileCommitProtocol extends Logging {
 
     val dir = fs.makeQualified(
       new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
-    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
+    logDebug(s"Created staging dir = $dir for path = $inputPath")
     dir
   }
 

From 1e8424ecbd56b4e14ccdd2e9ff161a5da92237d5 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 10 Mar 2022 11:49:20 +0800
Subject: [PATCH 36/42] Update SQLHadoopMapReduceCommitProtocol.scala

---
 .../datasources/SQLHadoopMapReduceCommitProtocol.scala          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 9e0b2a1a00552..192fbe6e94e39 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -37,7 +37,7 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  override val stagingDir: Path =
+  @transient override val stagingDir: Path =
     FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
       SQLConf.get.stagingDir, "spark", jobId)
 

From 3da83f1bca65e84ef9dca79308026c1a95b8ed57 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Thu, 10 Mar 2022 14:19:59 +0800
Subject: [PATCH 37/42] Update SQLHadoopMapReduceCommitProtocol.scala

---
 .../datasources/SQLHadoopMapReduceCommitProtocol.scala          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 192fbe6e94e39..8f682d0fb005a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -37,7 +37,7 @@ class SQLHadoopMapReduceCommitProtocol(
   extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite)
     with Serializable with Logging {
 
-  @transient override val stagingDir: Path =
+  @transient override lazy val stagingDir: Path =
     FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
       SQLConf.get.stagingDir, "spark", jobId)
 

From 0a729f0e5345958bd7bbf618e5cb2c9e32f01faa Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Mon, 14 Mar 2022 13:47:35 +0800
Subject: [PATCH 38/42] update

---
 .../internal/io/FileCommitProtocol.scala      | 25 ++++++++++++++++---
 .../SQLHadoopMapReduceCommitProtocol.scala    |  2 +-
 .../sql/hive/execution/SaveAsHiveFile.scala   |  4 +--
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index b33899936b7d6..cb24e9ec14fb7 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -216,6 +216,9 @@ abstract class FileCommitProtocol extends Logging {
 
 
 object FileCommitProtocol extends Logging {
+  val USING_SPARK_COMMIT_METHOD = "spark"
+  val USING_HIVE_COMMIT_METHOD = "hive"
+
   class TaskCommitMessage(val obj: Any) extends Serializable
 
   object EmptyTaskCommitMessage extends TaskCommitMessage(null)
@@ -262,7 +265,8 @@ object FileCommitProtocol extends Logging {
       jobId: String): Path = {
     val extURI = path.toUri
     if (extURI.getScheme == "viewfs") {
-      getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId)
+      new Path(getExtTmpPathRelTo(path.getParent, hadoopConf, stagingDir, engineType, jobId),
+        "-ext-10000")
     } else {
       new Path(getExternalScratchDir(extURI, hadoopConf, stagingDir, engineType, jobId),
         "-ext-10000")
@@ -275,8 +279,7 @@ object FileCommitProtocol extends Logging {
       stagingDir: String,
       engineType: String,
       jobId: String): Path = {
-    // Hive uses 10000
-    new Path(getStagingDir(path, hadoopConf, stagingDir, engineType, jobId), "-ext-10000")
+    getStagingDir(path, hadoopConf, stagingDir, engineType, jobId)
   }
 
   private def getExternalScratchDir(
@@ -321,6 +324,18 @@ object FileCommitProtocol extends Logging {
 
     val dir = fs.makeQualified(
       new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
+
+    if (engineType == "SPARK") {
+      val stagingFS = dir.getFileSystem(hadoopConf)
+      // SPARK-36579: Current SQLHadoopMapReduceCommitProtocol's dynamic partition overwriting uses
+      // rename operation to move partition's directories. This operation is not supported between
+      // different FileSystems.
+      if (equalsFileSystem(fs, stagingFS)) {
+        logDebug(s"The staging dir '$stagingPathName' should be in a same filesystem " +
+          s"with table location if we set `spark.sql.exec.stagingDir` under the table " +
+          "directory.")
+      }
+    }
     logDebug(s"Created staging dir = $dir for path = $inputPath")
     dir
   }
@@ -336,6 +351,10 @@ object FileCommitProtocol extends Logging {
     val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
     s"${engineType}_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
   }
+
+  def equalsFileSystem(fs1: FileSystem, fs2: FileSystem): Boolean = {
+    fs1.getUri == fs2.getUri
+  }
 }
 
 /**
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 8f682d0fb005a..134910dcd75cc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -39,7 +39,7 @@ class SQLHadoopMapReduceCommitProtocol(
 
   @transient override lazy val stagingDir: Path =
     FileCommitProtocol.externalTempPath(new Path(path), SparkHadoopUtil.get.conf,
-      SQLConf.get.stagingDir, "spark", jobId)
+      SQLConf.get.stagingDir, FileCommitProtocol.USING_SPARK_COMMIT_METHOD, jobId)
 
   override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = {
     var committer = super.setupCommitter(context)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index c3e9deacd7e80..e4ffdffe46cdd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -134,8 +134,8 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      val externalTempPath = FileCommitProtocol.externalTempPath(
-        path, hadoopConf, stagingDir, "hive", TaskRunner.getTaskRunnerID.toString)
+      val externalTempPath = FileCommitProtocol.externalTempPath(path, hadoopConf, stagingDir, 
+        FileCommitProtocol.USING_HIVE_COMMIT_METHOD, TaskRunner.getTaskRunnerID.toString)
       createdTempDir = Some(externalTempPath.getParent)
       externalTempPath
     } else {

From 9392bdb61e57dfae3c385c841b1f74ee2b985baa Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Mon, 14 Mar 2022 13:55:04 +0800
Subject: [PATCH 39/42] update

---
 .../spark/internal/io/FileCommitProtocol.scala  | 17 +++++++++--------
 .../sql/hive/execution/SaveAsHiveFile.scala     |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index cb24e9ec14fb7..a8e4ad551059f 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -277,22 +277,22 @@ object FileCommitProtocol extends Logging {
       path: Path,
       hadoopConf: Configuration,
       stagingDir: String,
-      engineType: String,
+      commitMethod: String,
       jobId: String): Path = {
-    getStagingDir(path, hadoopConf, stagingDir, engineType, jobId)
+    getStagingDir(path, hadoopConf, stagingDir, commitMethod, jobId)
   }
 
   private def getExternalScratchDir(
       extURI: URI,
       hadoopConf: Configuration,
       stagingDir: String,
-      engineType: String,
+      commitMethod: String,
       jobId: String): Path = {
     getStagingDir(
       new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
       hadoopConf,
       stagingDir,
-      engineType,
+      commitMethod,
       jobId)
   }
 
@@ -300,7 +300,7 @@ object FileCommitProtocol extends Logging {
       inputPath: Path,
       hadoopConf: Configuration,
       stagingDir: String,
-      engineType: String,
+      commitMethod: String,
       jobId: String): Path = {
     val inputPathName: String = inputPath.toString
     val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
@@ -319,13 +319,13 @@ object FileCommitProtocol extends Logging {
       logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
         "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
         "directory.")
-      stagingPathName = new Path(inputPathName, ".hive-staging").toString
+      stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString
     }
 
     val dir = fs.makeQualified(
-      new Path(stagingPathName + "_" + executionId(engineType) + "-" + jobId))
+      new Path(stagingPathName + "_" + executionId(commitMethod) + "-" + jobId))
 
-    if (engineType == "SPARK") {
+    if (commitMethod == USING_SPARK_COMMIT_METHOD) {
       val stagingFS = dir.getFileSystem(hadoopConf)
       // SPARK-36579: Current SQLHadoopMapReduceCommitProtocol's dynamic partition overwriting uses
       // rename operation to move partition's directories. This operation is not supported between
@@ -334,6 +334,7 @@ object FileCommitProtocol extends Logging {
         logDebug(s"The staging dir '$stagingPathName' should be in a same filesystem " +
           s"with table location if we set `spark.sql.exec.stagingDir` under the table " +
           "directory.")
+        stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString
       }
     }
     logDebug(s"Created staging dir = $dir for path = $inputPath")
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index e4ffdffe46cdd..5900e03b08b87 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -134,7 +134,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
       oldVersionExternalTempPath(path, hadoopConf, scratchDir)
     } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
-      val externalTempPath = FileCommitProtocol.externalTempPath(path, hadoopConf, stagingDir, 
+      val externalTempPath = FileCommitProtocol.externalTempPath(path, hadoopConf, stagingDir,
         FileCommitProtocol.USING_HIVE_COMMIT_METHOD, TaskRunner.getTaskRunnerID.toString)
       createdTempDir = Some(externalTempPath.getParent)
       externalTempPath

From 1178d1ba15c810fd429bfc4fd9383e3d22a4cf3b Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Mon, 14 Mar 2022 15:25:42 +0800
Subject: [PATCH 40/42] update

---
 .../internal/io/FileCommitProtocol.scala      | 11 ++++++----
 .../io/HadoopMapReduceCommitProtocol.scala    |  1 +
 .../SQLHadoopMapReduceCommitProtocol.scala    |  1 +
 ...la => CommitProtocolStagingDirSuite.scala} | 22 ++++++++++++++-----
 4 files changed, 26 insertions(+), 9 deletions(-)
 rename sql/core/src/test/scala/org/apache/spark/sql/sources/{StagingInsertSuite.scala => CommitProtocolStagingDirSuite.scala} (78%)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
index a8e4ad551059f..43cff753a44a2 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala
@@ -322,11 +322,8 @@ object FileCommitProtocol extends Logging {
       stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString
     }
 
-    val dir = fs.makeQualified(
-      new Path(stagingPathName + "_" + executionId(commitMethod) + "-" + jobId))
-
     if (commitMethod == USING_SPARK_COMMIT_METHOD) {
-      val stagingFS = dir.getFileSystem(hadoopConf)
+      val stagingFS = new Path(stagingPathName).getFileSystem(hadoopConf)
       // SPARK-36579: Current SQLHadoopMapReduceCommitProtocol's dynamic partition overwriting uses
       // rename operation to move partition's directories. This operation is not supported between
       // different FileSystems.
@@ -337,6 +334,12 @@ object FileCommitProtocol extends Logging {
         stagingPathName = new Path(inputPathName, s".$commitMethod-staging").toString
       }
     }
+    val dir = commitMethod match {
+      case USING_SPARK_COMMIT_METHOD =>
+        new Path(stagingPathName + "-" + jobId)
+      case USING_HIVE_COMMIT_METHOD =>
+        fs.makeQualified(new Path(stagingPathName + "_" + executionId(commitMethod) + "-" + jobId))
+    }
     logDebug(s"Created staging dir = $dir for path = $inputPath")
     dir
   }
diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 9da91af77bd33..27d5a48f5961d 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -238,6 +238,7 @@ class HadoopMapReduceCommitProtocol(
             // a parent that exists, otherwise we may get unexpected result on the rename.
             fs.mkdirs(finalPartPath.getParent)
           }
+          println(s"When commit job ${stagingDir}")
           val stagingPartPath = new Path(stagingDir, part)
           if (!fs.rename(stagingPartPath, finalPartPath)) {
             throw new IOException(s"Failed to rename $stagingPartPath to $finalPartPath when " +
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 134910dcd75cc..47312404fbffb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -61,6 +61,7 @@ class SQLHadoopMapReduceCommitProtocol(
         // So, we will use the FileOutputCommitter-specified constructor.
         val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
         val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path)
+        println(s"committerOutputPath = ${committerOutputPath}")
         committer = ctor.newInstance(committerOutputPath, context)
       } else {
         // The specified output committer is just an OutputCommitter.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala
similarity index 78%
rename from sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
rename to sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala
index bd935d2567269..de9c643998f56 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/StagingInsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala
@@ -23,22 +23,24 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils
 
-class StagingInsertSuite extends QueryTest with SharedSparkSession {
+
+abstract class CommitProtocolStagingDirBaseSuite extends QueryTest with SharedSparkSession {
   import testImplicits._
 
-  val stagingDir = Utils.createTempDir()
+  def stagingDir: String
+  def cleanStagingDir(): Unit
 
   override def sparkConf: SparkConf =
-    super.sparkConf.set(SQLConf.EXEC_STAGING_DIR, stagingDir.getAbsolutePath)
+    super.sparkConf.set(SQLConf.EXEC_STAGING_DIR, stagingDir)
 
   override def beforeAll(): Unit = {
     super.beforeAll()
-    stagingDir.delete()
+    cleanStagingDir()
   }
 
   override def afterAll(): Unit = {
     try {
-      Utils.deleteRecursively(stagingDir)
+      cleanStagingDir()
     } finally {
       super.afterAll()
     }
@@ -68,3 +70,13 @@ class StagingInsertSuite extends QueryTest with SharedSparkSession {
     }
   }
 }
+
+class LocalStagingDirSuite extends CommitProtocolStagingDirBaseSuite {
+  val stagingDirFile = Utils.createTempDir()
+  override val stagingDir = stagingDirFile.getAbsolutePath
+
+  println(s"stagingDirFile = ${stagingDirFile}")
+  override def cleanStagingDir(): Unit = {
+    Utils.deleteRecursively(stagingDirFile)
+  }
+}

From 9dba59069a71e405c37a3ef606a4ea88c8a17f18 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Mon, 14 Mar 2022 15:26:32 +0800
Subject: [PATCH 41/42] update

---
 .../apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala | 1 -
 .../execution/datasources/SQLHadoopMapReduceCommitProtocol.scala | 1 -
 .../apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala | 1 -
 3 files changed, 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
index 27d5a48f5961d..9da91af77bd33 100644
--- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala
@@ -238,7 +238,6 @@ class HadoopMapReduceCommitProtocol(
             // a parent that exists, otherwise we may get unexpected result on the rename.
             fs.mkdirs(finalPartPath.getParent)
           }
-          println(s"When commit job ${stagingDir}")
           val stagingPartPath = new Path(stagingDir, part)
           if (!fs.rename(stagingPartPath, finalPartPath)) {
             throw new IOException(s"Failed to rename $stagingPartPath to $finalPartPath when " +
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
index 47312404fbffb..134910dcd75cc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala
@@ -61,7 +61,6 @@ class SQLHadoopMapReduceCommitProtocol(
         // So, we will use the FileOutputCommitter-specified constructor.
         val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext])
         val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path)
-        println(s"committerOutputPath = ${committerOutputPath}")
         committer = ctor.newInstance(committerOutputPath, context)
       } else {
         // The specified output committer is just an OutputCommitter.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala
index de9c643998f56..6704bf322f457 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CommitProtocolStagingDirSuite.scala
@@ -75,7 +75,6 @@ class LocalStagingDirSuite extends CommitProtocolStagingDirBaseSuite {
   val stagingDirFile = Utils.createTempDir()
   override val stagingDir = stagingDirFile.getAbsolutePath
 
-  println(s"stagingDirFile = ${stagingDirFile}")
   override def cleanStagingDir(): Unit = {
     Utils.deleteRecursively(stagingDirFile)
   }

From b998ec9d7d5b514940b39daa0a32f1d4e8bf9e87 Mon Sep 17 00:00:00 2001
From: Angerszhuuuu <angers.zhu@gmail.com>
Date: Mon, 14 Mar 2022 15:38:25 +0800
Subject: [PATCH 42/42] Update InsertSuite.scala

---
 .../test/scala/org/apache/spark/sql/sources/InsertSuite.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index d7c2f44b79506..b553e6ed566b5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -1116,6 +1116,6 @@ class RenameFromSparkStagingToFinalDirAlwaysTurnsFalseFilesystem extends RawLoca
   }
 
   private def isSparkStagingDir(path: Path): Boolean = {
-    path.toString.contains(".spark-staging_")
+    path.toString.contains(".spark-staging-")
   }
 }