From 5f37003b94e68e1a7f22f9f13293e0aae6ae9563 Mon Sep 17 00:00:00 2001
From: "Jungtaek Lim (HeartSaVioR)"
Date: Tue, 2 Feb 2021 17:04:03 +0900
Subject: [PATCH 1/2] [SPARK-34326][CORE][SQL] Fix UTs added in SPARK-31793
 that depend on the length of the temp path

---
 .../org/apache/spark/util/UtilsSuite.scala    |  6 ++++
 .../DataSourceScanExecRedactionSuite.scala    | 32 ++++++++++++++++---
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 8fb408041ca9d..18ff96021153f 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -1308,6 +1308,12 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     assert(Utils.buildLocationMetadata(paths, 10) == "[path0, path1]")
     assert(Utils.buildLocationMetadata(paths, 15) == "[path0, path1, path2]")
     assert(Utils.buildLocationMetadata(paths, 25) == "[path0, path1, path2, path3]")
+
+    // edge case: non-path chars, including '[' and ", ", also count against the limit
+    // 1. the second path is not added due to the addition of '['
+    assert(Utils.buildLocationMetadata(paths, 6) == "[path0]")
+    // 2. the third path is not added due to the addition of ", "
+    assert(Utils.buildLocationMetadata(paths, 13) == "[path0, path1]")
   }
 
   test("checkHost supports both IPV4 and IPV6") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
index c99be986ddca5..798c2ff529c98 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 import java.io.File
 
 import scala.collection.mutable
+import scala.util.Random
 
 import org.apache.hadoop.fs.Path
 
@@ -122,14 +123,27 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
   test("SPARK-31793: FileSourceScanExec metadata should contain limited file paths") {
     withTempPath { path =>
       val dir = path.getCanonicalPath
+
+      // Create a sub-directory with a long name so that each root path always exceeds the limit;
+      // this ensures we always exercise the path truncation.
+      // 110 = limit 100 + margin 10, to stay clearly away from the edge case
+      val dataDir = if (dir.length >= 110) {
+        path
+      } else {
+        val dataDirName = Random.alphanumeric.take(110 - dir.length).toList.mkString
+        val f = new File(path, dataDirName)
+        f.mkdir()
+        f
+      }
+
       val partitionCol = "partitionCol"
       spark.range(10)
         .select("id", "id")
         .toDF("value", partitionCol)
         .write
         .partitionBy(partitionCol)
-        .orc(dir)
-      val paths = (0 to 9).map(i => new File(dir, s"$partitionCol=$i").getCanonicalPath)
+        .orc(dataDir.getCanonicalPath)
+      val paths = (0 to 9).map(i => new File(dataDir, s"$partitionCol=$i").getCanonicalPath)
       val plan = spark.read.orc(paths: _*).queryExecution.executedPlan
       val location = plan collectFirst {
         case f: FileSourceScanExec => f.metadata("Location")
@@ -137,9 +151,17 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
       assert(location.isDefined)
       // The location metadata should at least contain one path
       assert(location.get.contains(paths.head))
-      // If the temp path length is larger than 100, the metadata length should not exceed
-      // twice of the length; otherwise, the metadata length should be controlled within 200.
-      assert(location.get.length < Math.max(paths.head.length, 100) * 2)
+
+      // The location metadata should wrap the paths in brackets
+      assert(location.get.indexOf('[') > -1)
+      assert(location.get.indexOf(']') > -1)
+
+      // Extract the paths from the location metadata (drop class name, brackets, separators)
+      val pathsInLocation = location.get.substring(
+        location.get.indexOf('[') + 1, location.get.indexOf(']')).split(", ").toSeq
+
+      // Only one path should fit within the limit
+      assert(pathsInLocation.size == 1)
     }
   }
 }
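The two new UtilsSuite asserts pin down how non-path characters interact with the stop threshold. Below is a minimal, self-contained Scala sketch of those semantics; it is an illustrative paraphrase, not the actual Utils.buildLocationMetadata source (the real method takes Hadoop Path values). Paths keep being appended while the accumulated string, including the leading '[' and the ", " separators, is still shorter than the threshold, so the last appended path may overshoot it; the threshold only controls whether the next path is added.

object LocationMetadataSketch {
  // Append paths while the accumulated metadata, including '[' and the ", "
  // separators, is still shorter than the threshold; the last appended path
  // may overshoot it.
  def buildLocationMetadata(paths: Seq[String], stopAppendingThreshold: Int): String = {
    val metadata = new StringBuilder("[")
    var index = 0
    while (index < paths.length && metadata.length < stopAppendingThreshold) {
      if (index > 0) {
        metadata.append(", ")
      }
      metadata.append(paths(index))
      index += 1
    }
    metadata.append("]").toString
  }

  def main(args: Array[String]): Unit = {
    val paths = (0 to 4).map(i => s"path$i")
    // '[' already makes "[path0" 6 chars, so threshold 6 admits no second path.
    assert(buildLocationMetadata(paths, 6) == "[path0]")
    // ", " makes "[path0, path1" 13 chars, so threshold 13 admits no third path.
    assert(buildLocationMetadata(paths, 13) == "[path0, path1]")
  }
}

Under these semantics the pre-existing asserts also hold (e.g. threshold 10 yields "[path0, path1]", since "[path0" is still shorter than 10 when path1 is considered).
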
From c6cd28914e2b582244f5121b9f307aa7987b1e23 Mon Sep 17 00:00:00 2001
From: "Jungtaek Lim (HeartSaVioR)"
Date: Wed, 3 Feb 2021 17:42:55 +0900
Subject: [PATCH 2/2] Simplify the test

---
 .../execution/DataSourceScanExecRedactionSuite.scala | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
index 798c2ff529c98..ccac5a00fdb05 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala
@@ -126,15 +126,9 @@ class DataSourceScanExecRedactionSuite extends DataSourceScanRedactionTest {
 
       // Create a sub-directory with a long name so that each root path always exceeds the limit;
       // this ensures we always exercise the path truncation.
-      // 110 = limit 100 + margin 10, to stay clearly away from the edge case
-      val dataDir = if (dir.length >= 110) {
-        path
-      } else {
-        val dataDirName = Random.alphanumeric.take(110 - dir.length).toList.mkString
-        val f = new File(path, dataDirName)
-        f.mkdir()
-        f
-      }
+      val dataDirName = Random.alphanumeric.take(100).toList.mkString
+      val dataDir = new File(path, dataDirName)
+      dataDir.mkdir()
 
       val partitionCol = "partitionCol"
       spark.range(10)
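The simplification relies on the root path always being longer than the stop threshold: a 100-character directory name guarantees on its own that appending stops after the first path, so exactly one path ends up between the brackets regardless of how long the temp root is. The standalone sketch below walks through the extraction step the test uses; the InMemoryFileIndex prefix and the concrete path are assumed examples for illustration, not output produced by this patch.

object PathsInLocationSketch {
  // Same extraction as in the test: take the text between the first '[' and
  // the first ']', then split on the ", " separator.
  def pathsInLocation(location: String): Seq[String] =
    location.substring(location.indexOf('[') + 1, location.indexOf(']')).split(", ").toSeq

  def main(args: Array[String]): Unit = {
    // A root path containing a 100-char directory name already exceeds the
    // assumed 100-char stop threshold, so no second path is ever appended.
    val longRoot = "/tmp/spark-test/" + "a" * 100
    val location = s"InMemoryFileIndex[$longRoot/partitionCol=0]"
    assert(pathsInLocation(location) == Seq(s"$longRoot/partitionCol=0"))
    assert(pathsInLocation(location).size == 1)
  }
}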