From 5f37003b94e68e1a7f22f9f13293e0aae6ae9563 Mon Sep 17 00:00:00 2001
From: "Jungtaek Lim (HeartSaVioR)"
Date: Tue, 2 Feb 2021 17:04:03 +0900
Subject: [PATCH 1/2] [SPARK-34326][CORE][SQL] Fix UTs added in SPARK-31793
 that depend on the length of the temp path

---
 .../org/apache/spark/util/UtilsSuite.scala    |  6 ++++
 .../DataSourceScanExecRedactionSuite.scala    | 32 ++++++++++++++++---
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 8fb408041ca9d..18ff96021153f 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -1308,6 +1308,12 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     assert(Utils.buildLocationMetadata(paths, 10) == "[path0, path1]")
     assert(Utils.buildLocationMetadata(paths, 15) == "[path0, path1, path2]")
     assert(Utils.buildLocationMetadata(paths, 25) == "[path0, path1, path2, path3]")
+
+    // edge case: non-path chars, including '[' and ", ", also count against the limit
+    // 1. the second path is not added due to the addition of '['
+    assert(Utils.buildLocationMetadata(paths, 6) == "[path0]")
+    // 2. the third path is not added due to the addition of ", "
+    assert(Utils.buildLocationMetadata(paths, 13) == "[path0, path1]")
   }
 
   test("checkHost supports both IPV4 and IPV6") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
index c99be986ddca5..798c2ff529c98 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 import java.io.File
 
 import scala.collection.mutable
+import scala.util.Random
 
 import org.apache.hadoop.fs.Path
 
@@ -122,14 +123,27 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
   test("SPARK-31793: FileSourceScanExec metadata should contain limited file paths") {
     withTempPath { path =>
       val dir = path.getCanonicalPath
+
+      // Create a sub-directory with a long name so that each root path always exceeds the limit;
+      // this ensures we always exercise the path truncation.
+      // 110 = limit 100 + margin 10, to stay clearly away from the edge case
+      val dataDir = if (dir.length >= 110) {
+        path
+      } else {
+        val dataDirName = Random.alphanumeric.take(110 - dir.length).toList.mkString
+        val f = new File(path, dataDirName)
+        f.mkdir()
+        f
+      }
+
       val partitionCol = "partitionCol"
       spark.range(10)
         .select("id", "id")
         .toDF("value", partitionCol)
         .write
         .partitionBy(partitionCol)
-        .orc(dir)
-      val paths = (0 to 9).map(i => new File(dir, s"$partitionCol=$i").getCanonicalPath)
+        .orc(dataDir.getCanonicalPath)
+      val paths = (0 to 9).map(i => new File(dataDir, s"$partitionCol=$i").getCanonicalPath)
       val plan = spark.read.orc(paths: _*).queryExecution.executedPlan
       val location = plan collectFirst {
         case f: FileSourceScanExec => f.metadata("Location")
@@ -137,9 +151,17 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
       assert(location.isDefined)
       // The location metadata should at least contain one path
       assert(location.get.contains(paths.head))
-      // If the temp path length is larger than 100, the metadata length should not exceed
-      // twice of the length; otherwise, the metadata length should be controlled within 200.
-      assert(location.get.length < Math.max(paths.head.length, 100) * 2)
+
+      // The location metadata should wrap the paths in brackets
+      assert(location.get.indexOf('[') > -1)
+      assert(location.get.indexOf(']') > -1)
+
+      // Extract the paths from the location metadata (drop class name, brackets, separators)
+      val pathsInLocation = location.get.substring(
+        location.get.indexOf('[') + 1, location.get.indexOf(']')).split(", ").toSeq
+
+      // Only one path should fit within the limit
+      assert(pathsInLocation.size == 1)
     }
   }
 }
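The two new UtilsSuite asserts pin down how non-path characters interact with the stop threshold. Below is a minimal, self-contained Scala sketch of those semantics; it is an illustrative paraphrase, not the actual Utils.buildLocationMetadata source (the real method takes Hadoop Path values). Paths keep being appended while the accumulated string, including the leading '[' and the ", " separators, is still shorter than the threshold, so the last appended path may overshoot it; the threshold only controls whether the next path is added.

object LocationMetadataSketch {
  // Append paths while the accumulated metadata, including '[' and the ", "
  // separators, is still shorter than the threshold; the last appended path
  // may overshoot it.
  def buildLocationMetadata(paths: Seq[String], stopAppendingThreshold: Int): String = {
    val metadata = new StringBuilder("[")
    var index = 0
    while (index < paths.length && metadata.length < stopAppendingThreshold) {
      if (index > 0) {
        metadata.append(", ")
      }
      metadata.append(paths(index))
      index += 1
    }
    metadata.append("]").toString
  }

  def main(args: Array[String]): Unit = {
    val paths = (0 to 4).map(i => s"path$i")
    // '[' already makes "[path0" 6 chars, so threshold 6 admits no second path.
    assert(buildLocationMetadata(paths, 6) == "[path0]")
    // ", " makes "[path0, path1" 13 chars, so threshold 13 admits no third path.
    assert(buildLocationMetadata(paths, 13) == "[path0, path1]")
  }
}

Under these semantics the pre-existing asserts also hold (e.g. threshold 10 yields "[path0, path1]", since "[path0" is still shorter than 10 when path1 is considered).
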
From c6cd28914e2b582244f5121b9f307aa7987b1e23 Mon Sep 17 00:00:00 2001
From: "Jungtaek Lim (HeartSaVioR)"
Date: Wed, 3 Feb 2021 17:42:55 +0900
Subject: [PATCH 2/2] Simplify the test

---
 .../execution/DataSourceScanExecRedactionSuite.scala | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
index 798c2ff529c98..ccac5a00fdb05 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
@@ -126,15 +126,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
 
       // Create a sub-directory with a long name so that each root path always exceeds the limit;
       // this ensures we always exercise the path truncation.
-      // 110 = limit 100 + margin 10, to stay clearly away from the edge case
-      val dataDir = if (dir.length >= 110) {
-        path
-      } else {
-        val dataDirName = Random.alphanumeric.take(110 - dir.length).toList.mkString
-        val f = new File(path, dataDirName)
-        f.mkdir()
-        f
-      }
+      val dataDirName = Random.alphanumeric.take(100).toList.mkString
+      val dataDir = new File(path, dataDirName)
+      dataDir.mkdir()
 
       val partitionCol = "partitionCol"
       spark.range(10)
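The simplification relies on the root path always being longer than the stop threshold: a 100-character directory name guarantees on its own that appending stops after the first path, so exactly one path ends up between the brackets regardless of how long the temp root is. The standalone sketch below walks through the extraction step the test uses; the InMemoryFileIndex prefix and the concrete path are assumed examples for illustration, not output produced by this patch.

object PathsInLocationSketch {
  // Same extraction as in the test: take the text between the first '[' and
  // the first ']', then split on the ", " separator.
  def pathsInLocation(location: String): Seq[String] =
    location.substring(location.indexOf('[') + 1, location.indexOf(']')).split(", ").toSeq

  def main(args: Array[String]): Unit = {
    // A root path containing a 100-char directory name already exceeds the
    // assumed 100-char stop threshold, so no second path is ever appended.
    val longRoot = "/tmp/spark-test/" + "a" * 100
    val location = s"InMemoryFileIndex[$longRoot/partitionCol=0]"
    assert(pathsInLocation(location) == Seq(s"$longRoot/partitionCol=0"))
    assert(pathsInLocation(location).size == 1)
  }
}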