From 5a80e69b5ea2870d34d44625162493858398c4a0 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 11:39:44 +0300 Subject: [PATCH 01/24] Add the test for checking output of expression examples --- .../org/apache/spark/sql/SQLQuerySuite.scala | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index b5d021549c7a8..a92192b9d8b65 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.{AccumulatorSuite, SparkException} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.execution.HiveResult.hiveResultString import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.datasources.v2.BatchScanExec @@ -140,6 +141,32 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } } + test("check outputs of expression examples") { + val exampleRe = ">(.+);\n(.+)".r + val ignoreSet = Set( + // One of examples shows getting the current timestamp + "org.apache.spark.sql.catalyst.expressions.UnixTimestamp") + + withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { + spark.sessionState.functionRegistry.listFunction().foreach { funcId => + val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) + val className = info.getClassName + if (!ignoreSet.contains(className)) { + withClue(s"Function '${info.getName}', Expression class '$className'") { + exampleRe.findAllIn(info.getExamples).toList.foreach(_ match { + case exampleRe(sql, output) => + val df = spark.sql(sql) + val actual = hiveResultString(df.queryExecution.executedPlan).mkString("\n").trim + val expected = output.trim + assert(actual === expected) + case other => throw new IllegalArgumentException(other) + }) + } + } + } + } + } + test("SPARK-6743: no columns from cache") { Seq( (83, 0, 38), From 1f52b756c572cb16d3b7900b22be2ac5ddfd478d Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 11:40:07 +0300 Subject: [PATCH 02/24] Fix expected results --- .../expressions/aggregate/ApproximatePercentile.scala | 2 +- .../spark/sql/catalyst/expressions/csvExpressions.scala | 4 ++-- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 2 +- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 4 ++-- .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++-- .../org/apache/spark/sql/catalyst/expressions/xml/xpath.scala | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index ea0ed2e8fa11b..59481ce049165 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -63,7 +63,7 @@ import org.apache.spark.sql.types._ > SELECT _FUNC_(10.0, array(0.5, 0.4, 0.1), 100); [10.0,10.0,10.0] > SELECT 
_FUNC_(10.0, 0.5, 100); - 10.0 + 10 """, since = "2.1.0") case class ApproximatePercentile( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala index 65b10f36373d1..acbf4bdc3a7f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala @@ -40,7 +40,7 @@ import org.apache.spark.unsafe.types.UTF8String examples = """ Examples: > SELECT _FUNC_('1, 0.8', 'a INT, b DOUBLE'); - {"a":1, "b":0.8} + {"a":1,"b":0.8} > SELECT _FUNC_('26/08/2015', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) {"time":2015-08-26 00:00:00.0} """, @@ -199,7 +199,7 @@ case class SchemaOfCsv( > SELECT _FUNC_(named_struct('a', 1, 'b', 2)); 1,2 > SELECT _FUNC_(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy')); - "26/08/2015" + 26/08/2015 """, since = "3.0.0") // scalastyle:on line.size.limit diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 2543828b6315f..8fe9d7ea5a347 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -842,7 +842,7 @@ abstract class UnixTime extends ToTimestamp { examples = """ Examples: > SELECT _FUNC_(0, 'yyyy-MM-dd HH:mm:ss'); - 1970-01-01 00:00:00 + 1969-12-31 16:00:00 """, since = "1.5.0") case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[String] = None) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 655e44e4e4919..e9115dbc2f559 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -502,9 +502,9 @@ case class JsonTuple(children: Seq[Expression]) examples = """ Examples: > SELECT _FUNC_('{"a":1, "b":0.8}', 'a INT, b DOUBLE'); - {"a":1, "b":0.8} + {"a":1,"b":0.8} > SELECT _FUNC_('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')); - {"time":"2015-08-26 00:00:00.0"} + {"time":2015-08-26 00:00:00.0} """, since = "2.2.0") // scalastyle:on line.size.limit diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index e4847e9cec3f0..ff0d09b7c7fdb 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -949,7 +949,7 @@ object StringTrimRight { usage = """ _FUNC_(str) - Removes the trailing space characters from `str`. 
- _FUNC_(trimStr, str) - Removes the trailing string which contains the characters from the trim string from the `str` + _FUNC_(str, trimStr) - Removes the trailing string which contains the characters from the trim string from the `str` """, arguments = """ Arguments: @@ -960,7 +960,7 @@ object StringTrimRight { Examples: > SELECT _FUNC_(' SparkSQL '); SparkSQL - > SELECT _FUNC_('LQSa', 'SSparkSQLS'); + > SELECT _FUNC_('SSparkSQLS', 'SQLS'); SSpark """, since = "1.5.0") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala index aacf1a44e2ad0..073b45af51caf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xml/xpath.scala @@ -194,7 +194,7 @@ case class XPathString(xml: Expression, path: Expression) extends XPathExtract { examples = """ Examples: > SELECT _FUNC_('b1b2b3c1c2','a/b/text()'); - ['b1','b2','b3'] + ["b1","b2","b3"] """) // scalastyle:on line.size.limit case class XPathList(xml: Expression, path: Expression) extends XPathExtract { From d90a6aee2c5431b9f2434c75f2647202f772e503 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 11:44:39 +0300 Subject: [PATCH 03/24] Fix JsonTuple --- .../apache/spark/sql/catalyst/expressions/jsonExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index e9115dbc2f559..0c7dcea9d120f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -337,7 +337,7 @@ case class GetJsonObject(json: Expression, path: Expression) examples = """ Examples: > SELECT _FUNC_('{"a":1, "b":2}', 'a', 'b'); - 1 2 + 1 2 """) // scalastyle:on line.size.limit case class JsonTuple(children: Seq[Expression]) From b0d2d4b237fb257f6bdeb25c50b027fd7baa3c72 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:13:21 +0300 Subject: [PATCH 04/24] Fix ToUnixTimestamp --- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 8fe9d7ea5a347..cf145fb35836c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -631,7 +631,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti examples = """ Examples: > SELECT _FUNC_('2016-04-08', 'yyyy-MM-dd'); - 1460041200 + 1460098800 """, since = "1.6.0") case class ToUnixTimestamp( From 8abbd8687d7fcd18cc12b32159b545f438b0c719 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:14:10 +0300 Subject: [PATCH 05/24] Fix Round --- .../apache/spark/sql/catalyst/expressions/mathExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index b9f089ec056c2..c8a720151ef53 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -1291,7 +1291,7 @@ abstract class RoundBase(child: Expression, scale: Expression, examples = """ Examples: > SELECT _FUNC_(2.5, 0); - 3.0 + 3 """) // scalastyle:on line.size.limit case class Round(child: Expression, scale: Expression) From 3dbb9cff45e58116c52413ae4f77ca8a3acfdfc3 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:17:54 +0300 Subject: [PATCH 06/24] Fix Skewness --- .../sql/catalyst/expressions/aggregate/CentralMomentAgg.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala index 1870c58c548c9..ef2c86d21a24c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala @@ -223,7 +223,7 @@ case class VarianceSamp(child: Expression) extends CentralMomentAgg(child) { examples = """ Examples: > SELECT _FUNC_(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col); - 1.1135657469022013 + 1.1135657469022011 > SELECT _FUNC_(col) FROM VALUES (-1000), (-100), (10), (20) AS tab(col); -1.1135657469022011 """, From 9834f04c143c98de93c181a7fa568d5b1fbd402a Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:19:59 +0300 Subject: [PATCH 07/24] Fix StringToMap --- .../spark/sql/catalyst/expressions/complexTypeCreator.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 319a7fc87e59a..cae3c0528e136 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -422,9 +422,9 @@ case class CreateNamedStructUnsafe(children: Seq[Expression]) extends CreateName examples = """ Examples: > SELECT _FUNC_('a:1,b:2,c:3', ',', ':'); - map("a":"1","b":"2","c":"3") + {"a":"1","b":"2","c":"3"} > SELECT _FUNC_('a'); - map("a":null) + {"a":null} """) // scalastyle:on line.size.limit case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: Expression) From c60bf69475c73bcd1dae38a3a3bb027efb9ed640 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:22:56 +0300 Subject: [PATCH 08/24] Fix ArrayForAll --- .../spark/sql/catalyst/expressions/higherOrderFunctions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index ed26bb375de25..3b104e1a78b78 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -463,7 +463,7 @@ case class ArrayExists( > SELECT _FUNC_(array(1, null, 3), x -> x % 2 == 0); false > SELECT _FUNC_(array(2, null, 8), x -> x % 2 == 0); - null + NULL """, since = "3.0.0") case class ArrayForAll( From 201863cc8ddad772e7e41c83ae108a06a632c167 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:39:31 +0300 Subject: [PATCH 09/24] Fix StringTrimLeft --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index ff0d09b7c7fdb..6993ec3f9d714 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -847,7 +847,7 @@ object StringTrimLeft { usage = """ _FUNC_(str) - Removes the leading space characters from `str`. - _FUNC_(trimStr, str) - Removes the leading string contains the characters from the trim string + _FUNC_(str, trimStr) - Removes the leading string contains the characters from the trim string """, arguments = """ Arguments: @@ -858,7 +858,7 @@ object StringTrimLeft { Examples: > SELECT _FUNC_(' SparkSQL '); SparkSQL - > SELECT _FUNC_('Sp', 'SSparkSQLS'); + > SELECT _FUNC_('SparkSQLS', 'Sp'); arkSQLS """, since = "1.5.0") From 6c158d83f6471116c429944a29dd0914fd64ba88 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:47:44 +0300 Subject: [PATCH 10/24] Fix MakeTimestamp --- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index cf145fb35836c..79d05395a4306 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1766,7 +1766,7 @@ case class MakeDate(year: Expression, month: Expression, day: Expression) > SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887); 2014-12-28 06:30:45.887 > SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887, 'CET'); - 2014-12-28 10:30:45.887 + 2014-12-27 21:30:45.887 > SELECT _FUNC_(2019, 6, 30, 23, 59, 60) 2019-07-01 00:00:00 > SELECT _FUNC_(2019, 13, 1, 10, 11, 12, 13); From 3dd25a7c6479d446c5f47a3593f1afaf9133dc35 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:55:23 +0300 Subject: [PATCH 11/24] Fix Kurtosis --- .../sql/catalyst/expressions/aggregate/CentralMomentAgg.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala index ef2c86d21a24c..8ce8dfa19c017 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala @@ -245,9 +245,9 @@ case class Skewness(child: Expression) extends 
CentralMomentAgg(child) { examples = """ Examples: > SELECT _FUNC_(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col); - -0.7014368047529618 + -0.7014368047529627 > SELECT _FUNC_(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col); - 0.19432323191698986 + 0.19432323191699075 """, since = "1.6.0") case class Kurtosis(child: Expression) extends CentralMomentAgg(child) { From 1216b0292be3044306cbc7196367d3a5286a98f5 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 12:59:11 +0300 Subject: [PATCH 12/24] Fix BRound --- .../apache/spark/sql/catalyst/expressions/mathExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index c8a720151ef53..d5b959b91c23d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -1311,7 +1311,7 @@ case class Round(child: Expression, scale: Expression) examples = """ Examples: > SELECT _FUNC_(2.5, 0); - 2.0 + 2 """) // scalastyle:on line.size.limit case class BRound(child: Expression, scale: Expression) From 4be0acd307d43aa98fe01e3f283bbdc0df1a190b Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 13:16:04 +0300 Subject: [PATCH 13/24] Disable Scala style checker for JsonTuple --- .../spark/sql/catalyst/expressions/jsonExpressions.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 0c7dcea9d120f..95b5b317b2920 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -331,7 +331,7 @@ case class GetJsonObject(json: Expression, path: Expression) } } -// scalastyle:off line.size.limit +// scalastyle:off @ExpressionDescription( usage = "_FUNC_(jsonStr, p1, p2, ..., pn) - Returns a tuple like the function get_json_object, but it takes multiple names. 
All the input parameters and output column types are string.", examples = """ @@ -339,7 +339,7 @@ case class GetJsonObject(json: Expression, path: Expression) > SELECT _FUNC_('{"a":1, "b":2}', 'a', 'b'); 1 2 """) -// scalastyle:on line.size.limit +// scalastyle:on case class JsonTuple(children: Seq[Expression]) extends Generator with CodegenFallback { From 4740c8dc4c04431d6ed23f72630172454c61f582 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 13:16:23 +0300 Subject: [PATCH 14/24] Update the ignore list --- .../org/apache/spark/sql/SQLQuerySuite.scala | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index a92192b9d8b65..299311cc453a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -145,7 +145,30 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { val exampleRe = ">(.+);\n(.+)".r val ignoreSet = Set( // One of examples shows getting the current timestamp - "org.apache.spark.sql.catalyst.expressions.UnixTimestamp") + "org.apache.spark.sql.catalyst.expressions.UnixTimestamp", + // Random output without a seed + "org.apache.spark.sql.catalyst.expressions.Rand", + "org.apache.spark.sql.catalyst.expressions.Randn", + "org.apache.spark.sql.catalyst.expressions.Shuffle", + "org.apache.spark.sql.catalyst.expressions.Uuid", + "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection", + // TODO: handle multiline output, look at the DOTALL flag + "org.apache.spark.sql.catalyst.expressions.GroupingID", + "org.apache.spark.sql.catalyst.expressions.Stack", + "org.apache.spark.sql.catalyst.expressions.PosExplode", + "org.apache.spark.sql.catalyst.expressions.Explode", + "org.apache.spark.sql.catalyst.expressions.Cube", + "org.apache.spark.sql.catalyst.expressions.Inline", + "org.apache.spark.sql.catalyst.expressions.Rollup", + "org.apache.spark.sql.catalyst.expressions.Grouping", + // Fails on parsing `SELECT 2 mod 1.8`: + // org.apache.spark.sql.catalyst.parser.ParseException: + // extraneous input '1.8' expecting (line 1, pos 14) + "org.apache.spark.sql.catalyst.expressions.Remainder", + // Fails on `SELECT make_timestamp(2019, 13, 1, 10, 11, 12, 13)`: + // Invalid ID for region-based ZoneId, invalid format: 13 + // java.time.DateTimeException: Invalid ID for region-based ZoneId, invalid format: 13 + "org.apache.spark.sql.catalyst.expressions.MakeTimestamp") withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { spark.sessionState.functionRegistry.listFunction().foreach { funcId => From 3bc35f6d35578f99274804d74eab8882d01df770 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 16:33:10 +0300 Subject: [PATCH 15/24] Support multiline examples --- .../org/apache/spark/sql/SQLQuerySuite.scala | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 299311cc453a2..3f7d6d578f542 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -142,7 +142,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } test("check outputs of expression examples") { - val exampleRe = ">(.+);\n(.+)".r + val exampleRe = 
"""^(.+);\n(?s)(.+)$""".r val ignoreSet = Set( // One of examples shows getting the current timestamp "org.apache.spark.sql.catalyst.expressions.UnixTimestamp", @@ -151,24 +151,24 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { "org.apache.spark.sql.catalyst.expressions.Randn", "org.apache.spark.sql.catalyst.expressions.Shuffle", "org.apache.spark.sql.catalyst.expressions.Uuid", + // The example call methods that return unstable results. "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection", - // TODO: handle multiline output, look at the DOTALL flag - "org.apache.spark.sql.catalyst.expressions.GroupingID", - "org.apache.spark.sql.catalyst.expressions.Stack", - "org.apache.spark.sql.catalyst.expressions.PosExplode", - "org.apache.spark.sql.catalyst.expressions.Explode", - "org.apache.spark.sql.catalyst.expressions.Cube", - "org.apache.spark.sql.catalyst.expressions.Inline", - "org.apache.spark.sql.catalyst.expressions.Rollup", - "org.apache.spark.sql.catalyst.expressions.Grouping", // Fails on parsing `SELECT 2 mod 1.8`: // org.apache.spark.sql.catalyst.parser.ParseException: // extraneous input '1.8' expecting (line 1, pos 14) "org.apache.spark.sql.catalyst.expressions.Remainder", // Fails on `SELECT make_timestamp(2019, 13, 1, 10, 11, 12, 13)`: - // Invalid ID for region-based ZoneId, invalid format: 13 - // java.time.DateTimeException: Invalid ID for region-based ZoneId, invalid format: 13 - "org.apache.spark.sql.catalyst.expressions.MakeTimestamp") + // Invalid ID for region-based ZoneId, invalid format: 13 + // java.time.DateTimeException: Invalid ID for region-based ZoneId, invalid format: 13 + "org.apache.spark.sql.catalyst.expressions.MakeTimestamp", + // Fails on `SELECT '%SystemDrive%\Users\John' like '\%SystemDrive\%\\Users%'` + // the pattern '\%SystemDrive\%\Users%' is invalid, ... 
+ "org.apache.spark.sql.catalyst.expressions.Like", + // Unsupported format of the examples, + // and `SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\\Users.*'` fails: + // Error in query: + // extraneous input ''%SystemDrive%\\Users.*'' expecting (line 1, pos 41) + "org.apache.spark.sql.catalyst.expressions.RLike") withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { spark.sessionState.functionRegistry.listFunction().foreach { funcId => @@ -176,13 +176,17 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { val className = info.getClassName if (!ignoreSet.contains(className)) { withClue(s"Function '${info.getName}', Expression class '$className'") { - exampleRe.findAllIn(info.getExamples).toList.foreach(_ match { + logTrace(info.getExamples) + info.getExamples.split(" > ").toList.foreach(_ match { case exampleRe(sql, output) => val df = spark.sql(sql) - val actual = hiveResultString(df.queryExecution.executedPlan).mkString("\n").trim - val expected = output.trim + val actual = hiveResultString(df.queryExecution.executedPlan) + .mkString("\n").replaceAll("\n\\s+", "\n").trim + logTrace(s"Actual: $actual") + val expected = output.replaceAll("\n\\s+", "\n").trim + logTrace(s"Expected: $expected") assert(actual === expected) - case other => throw new IllegalArgumentException(other) + case notMatched => logTrace(notMatched) }) } } From 6e68f434dd15645df2e0b3a09e393b7918dc3acd Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 16:33:31 +0300 Subject: [PATCH 16/24] Fix examples --- .../catalyst/expressions/csvExpressions.scala | 2 +- .../sql/catalyst/expressions/generators.scala | 24 ++++---- .../sql/catalyst/expressions/grouping.scala | 60 +++++++++---------- .../expressions/jsonExpressions.scala | 4 +- .../expressions/regexpExpressions.scala | 2 +- .../expressions/stringExpressions.scala | 6 +- 6 files changed, 49 insertions(+), 49 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala index acbf4bdc3a7f4..67c24f687af08 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala @@ -41,7 +41,7 @@ import org.apache.spark.unsafe.types.UTF8String Examples: > SELECT _FUNC_('1, 0.8', 'a INT, b DOUBLE'); {"a":1,"b":0.8} - > SELECT _FUNC_('26/08/2015', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) + > SELECT _FUNC_('26/08/2015', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')); {"time":2015-08-26 00:00:00.0} """, since = "3.0.0") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index 82a7d9825e30a..b0a23c62284d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -127,16 +127,16 @@ case class UserDefinedGenerator( * 3 NULL * }}} */ -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = "_FUNC_(n, expr1, ..., exprk) - Separates `expr1`, ..., `exprk` into `n` rows. Uses column names col0, col1, etc. 
by default unless specified otherwise.", examples = """ Examples: > SELECT _FUNC_(2, 1, 2, 3); - 1 2 - 3 NULL + 1 2 + 3 NULL """) -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class Stack(children: Seq[Expression]) extends Generator { private lazy val numRows = children.head.eval().asInstanceOf[Int] @@ -375,16 +375,16 @@ case class Explode(child: Expression) extends ExplodeBase { * 1 20 * }}} */ -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = "_FUNC_(expr) - Separates the elements of array `expr` into multiple rows with positions, or the elements of map `expr` into multiple rows and columns with positions. Unless specified otherwise, uses the column name `pos` for position, `col` for elements of the array or `key` and `value` for elements of the map.", examples = """ Examples: > SELECT _FUNC_(array(10,20)); - 0 10 - 1 20 + 0 10 + 1 20 """) -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class PosExplode(child: Expression) extends ExplodeBase { override val position = true } @@ -392,16 +392,16 @@ case class PosExplode(child: Expression) extends ExplodeBase { /** * Explodes an array of structs into a table. */ -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = "_FUNC_(expr) - Explodes an array of structs into a table. Uses column names col1, col2, etc. by default unless specified otherwise.", examples = """ Examples: > SELECT _FUNC_(array(struct(1, 'a'), struct(2, 'b'))); - 1 a - 2 b + 1 a + 2 b """) -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class Inline(child: Expression) extends UnaryExpression with CollectionGenerator { override val inline: Boolean = true override val position: Boolean = false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala index 221b97bdc7856..2da3f6c74b0e2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala @@ -38,7 +38,7 @@ trait GroupingSet extends Expression with CodegenFallback { override def eval(input: InternalRow): Any = throw new UnsupportedOperationException } -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = """ _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the specified columns @@ -47,19 +47,19 @@ trait GroupingSet extends Expression with CodegenFallback { examples = """ Examples: > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY _FUNC_(name, age); - NULL 2 1 - NULL NULL 2 - Alice 2 1 - Bob 5 1 - NULL 5 1 - Bob NULL 1 - Alice NULL 1 + Bob 5 1 + Alice 2 1 + NULL NULL 2 + NULL 5 1 + Bob NULL 1 + Alice NULL 1 + NULL 2 1 """, since = "2.0.0") -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {} -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = """ _FUNC_([col1[, col2 ..]]) - create a multi-dimensional rollup using the specified columns @@ -68,21 +68,21 @@ case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet 
{} examples = """ Examples: > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY _FUNC_(name, age); - NULL NULL 2 - Alice 2 1 - Bob 5 1 - Bob NULL 1 - Alice NULL 1 + Bob 5 1 + Alice 2 1 + NULL NULL 2 + Bob NULL 1 + Alice NULL 1 """, since = "2.0.0") -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {} /** * Indicates whether a specified column expression in a GROUP BY list is aggregated or not. * GROUPING returns 1 for aggregated or 0 for not aggregated in the result set. */ -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = """ _FUNC_(col) - indicates whether a specified column in a GROUP BY is aggregated or @@ -91,12 +91,12 @@ case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {} examples = """ Examples: > SELECT name, _FUNC_(name), sum(age) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY cube(name); - Alice 0 2 - NULL 1 7 - Bob 0 5 + Bob 0 5 + Alice 0 2 + NULL 1 7 """, since = "2.0.0") -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class Grouping(child: Expression) extends Expression with Unevaluable { @transient override lazy val references: AttributeSet = @@ -111,7 +111,7 @@ case class Grouping(child: Expression) extends Expression with Unevaluable { * * If groupByExprs is empty, it means all grouping expressions in GroupingSets. */ -// scalastyle:off line.size.limit +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = """ _FUNC_([col1[, col2 ..]]) - returns the level of grouping, equals to @@ -120,20 +120,20 @@ case class Grouping(child: Expression) extends Expression with Unevaluable { examples = """ Examples: > SELECT name, _FUNC_(), sum(age), avg(height) FROM VALUES (2, 'Alice', 165), (5, 'Bob', 180) people(age, name, height) GROUP BY cube(name, height); - NULL 2 2 165.0 - Alice 0 2 165.0 - NULL 2 5 180.0 - NULL 3 7 172.5 - Bob 0 5 180.0 - Bob 1 5 180.0 - Alice 1 2 165.0 + NULL 2 5 180.0 + Alice 0 2 165.0 + NULL 3 7 172.5 + NULL 2 2 165.0 + Bob 1 5 180.0 + Alice 1 2 165.0 + Bob 0 5 180.0 """, note = """ Input columns should match with grouping columns exactly, or empty (means all the grouping columns). """, since = "2.0.0") -// scalastyle:on line.size.limit +// scalastyle:on line.size.limit line.contains.tab case class GroupingID(groupByExprs: Seq[Expression]) extends Expression with Unevaluable { @transient override lazy val references: AttributeSet = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 95b5b317b2920..a13a6836c6be6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -331,7 +331,7 @@ case class GetJsonObject(json: Expression, path: Expression) } } -// scalastyle:off +// scalastyle:off line.size.limit line.contains.tab @ExpressionDescription( usage = "_FUNC_(jsonStr, p1, p2, ..., pn) - Returns a tuple like the function get_json_object, but it takes multiple names. 
All the input parameters and output column types are string.", examples = """ @@ -339,7 +339,7 @@ case class GetJsonObject(json: Expression, path: Expression) > SELECT _FUNC_('{"a":1, "b":2}', 'a', 'b'); 1 2 """) -// scalastyle:on +// scalastyle:on line.size.limit line.contains.tab case class JsonTuple(children: Seq[Expression]) extends Generator with CodegenFallback { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 9229ef2039fed..355d521e1c9b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -96,7 +96,7 @@ abstract class StringRegexExpression extends BinaryExpression """, examples = """ Examples: - > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%' + > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%'; true """, note = """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 6993ec3f9d714..211ae3f02a0d8 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1299,11 +1299,11 @@ object ParseUrl { usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.", examples = """ Examples: - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST') + > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST'); spark.apache.org - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY') + > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY'); query=1 - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query') + > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query'); 1 """, since = "2.0.0") From 3f1e42cf4faa169ce28b6a340c02b923e0e411db Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 16:39:55 +0300 Subject: [PATCH 17/24] Put common code to unindentAndTrim() --- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 3f7d6d578f542..33fc84e6dfaee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -142,6 +142,9 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } test("check outputs of expression examples") { + def unindentAndTrim(s: String): String = { + s.replaceAll("\n\\s+", "\n").trim + } val exampleRe = """^(.+);\n(?s)(.+)$""".r val ignoreSet = Set( // One of examples shows getting the current timestamp @@ -180,10 +183,10 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { info.getExamples.split(" > ").toList.foreach(_ match { case exampleRe(sql, output) => val df = spark.sql(sql) - val actual = hiveResultString(df.queryExecution.executedPlan) - .mkString("\n").replaceAll("\n\\s+", "\n").trim + val actual = unindentAndTrim( + hiveResultString(df.queryExecution.executedPlan).mkString("\n")) logTrace(s"Actual: 
$actual") - val expected = output.replaceAll("\n\\s+", "\n").trim + val expected = unindentAndTrim(output) logTrace(s"Expected: $expected") assert(actual === expected) case notMatched => logTrace(notMatched) From c30195e0e35f35cff69daef3d06008c239e8be10 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 21:19:05 +0300 Subject: [PATCH 18/24] Check syntax of examples --- .../org/apache/spark/sql/SQLQuerySuite.scala | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 33fc84e6dfaee..d855ab07cae77 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -145,6 +145,14 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { def unindentAndTrim(s: String): String = { s.replaceAll("\n\\s+", "\n").trim } + val beginSqlStmtRe = " > ".r + val endSqlStmtRe = ";\n".r + def checkExampleSyntax(example: String): Unit = { + val beginStmtNum = beginSqlStmtRe.findAllIn(example).length + val endStmtNum = endSqlStmtRe.findAllIn(example).length + assert(beginStmtNum === endStmtNum, + "The number of ` > ` does not match to the number of `;`") + } val exampleRe = """^(.+);\n(?s)(.+)$""".r val ignoreSet = Set( // One of examples shows getting the current timestamp @@ -154,7 +162,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { "org.apache.spark.sql.catalyst.expressions.Randn", "org.apache.spark.sql.catalyst.expressions.Shuffle", "org.apache.spark.sql.catalyst.expressions.Uuid", - // The example call methods that return unstable results. + // The example calls methods that return unstable results. 
"org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection", // Fails on parsing `SELECT 2 mod 1.8`: // org.apache.spark.sql.catalyst.parser.ParseException: @@ -179,8 +187,10 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { val className = info.getClassName if (!ignoreSet.contains(className)) { withClue(s"Function '${info.getName}', Expression class '$className'") { - logTrace(info.getExamples) - info.getExamples.split(" > ").toList.foreach(_ match { + val example = info.getExamples + logTrace(example) + checkExampleSyntax(example) + example.split(" > ").toList.foreach(_ match { case exampleRe(sql, output) => val df = spark.sql(sql) val actual = unindentAndTrim( From 33665b753928532ef408a90ba1b9e309f6a4f8a1 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 23:08:52 +0300 Subject: [PATCH 19/24] Fix MOD, make_timestamp and LIKE --- .../spark/sql/catalyst/expressions/arithmetic.scala | 2 +- .../catalyst/expressions/datetimeExpressions.scala | 4 ++-- .../sql/catalyst/expressions/regexpExpressions.scala | 2 +- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 11 ----------- 4 files changed, 4 insertions(+), 15 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index e4276e33acbd2..8d462184ed6c2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -448,7 +448,7 @@ case class IntegralDivide(left: Expression, right: Expression) extends DivModLik usage = "expr1 _FUNC_ expr2 - Returns the remainder after `expr1`/`expr2`.", examples = """ Examples: - > SELECT 2 _FUNC_ 1.8; + > SELECT 2 % 1.8; 0.2 > SELECT MOD(2, 1.8); 0.2 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 79d05395a4306..0fffffea1d82a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1767,9 +1767,9 @@ case class MakeDate(year: Expression, month: Expression, day: Expression) 2014-12-28 06:30:45.887 > SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887, 'CET'); 2014-12-27 21:30:45.887 - > SELECT _FUNC_(2019, 6, 30, 23, 59, 60) + > SELECT _FUNC_(2019, 6, 30, 23, 59, 60); 2019-07-01 00:00:00 - > SELECT _FUNC_(2019, 13, 1, 10, 11, 12, 13); + > SELECT _FUNC_(2019, 13, 1, 10, 11, 12, 'PST'); NULL > SELECT _FUNC_(null, 7, 22, 15, 30, 0); NULL diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 355d521e1c9b7..e92fae69af3e4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -96,7 +96,7 @@ abstract class StringRegexExpression extends BinaryExpression """, examples = """ Examples: - > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%'; + > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\Users%'; true """, note = """ diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index d855ab07cae77..d5b11602f9e6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -164,17 +164,6 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { "org.apache.spark.sql.catalyst.expressions.Uuid", // The example calls methods that return unstable results. "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection", - // Fails on parsing `SELECT 2 mod 1.8`: - // org.apache.spark.sql.catalyst.parser.ParseException: - // extraneous input '1.8' expecting (line 1, pos 14) - "org.apache.spark.sql.catalyst.expressions.Remainder", - // Fails on `SELECT make_timestamp(2019, 13, 1, 10, 11, 12, 13)`: - // Invalid ID for region-based ZoneId, invalid format: 13 - // java.time.DateTimeException: Invalid ID for region-based ZoneId, invalid format: 13 - "org.apache.spark.sql.catalyst.expressions.MakeTimestamp", - // Fails on `SELECT '%SystemDrive%\Users\John' like '\%SystemDrive\%\\Users%'` - // the pattern '\%SystemDrive\%\Users%' is invalid, ... - "org.apache.spark.sql.catalyst.expressions.Like", // Unsupported format of the examples, // and `SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\\Users.*'` fails: // Error in query: From 2ca354ee7aa51790f0d5f8aa661df042b4c66c08 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 23:51:44 +0300 Subject: [PATCH 20/24] Fix RLIKE --- .../sql/catalyst/expressions/regexpExpressions.scala | 11 ++++++----- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 7 +------ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index e92fae69af3e4..3911b014bd590 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -170,12 +170,13 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi """, examples = """ Examples: - When spark.sql.parser.escapedStringLiterals is disabled (default). - > SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\\Users.*' + > SET spark.sql.parser.escapedStringLiterals=true; + spark.sql.parser.escapedStringLiterals true + > SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\\Users.*'; true - - When spark.sql.parser.escapedStringLiterals is enabled. - > SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\Users.*' + > SET spark.sql.parser.escapedStringLiterals=false; + spark.sql.parser.escapedStringLiterals false + > SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\Users.*'; true """, note = """ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index d5b11602f9e6e..efc1224b0f687 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -163,12 +163,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { "org.apache.spark.sql.catalyst.expressions.Shuffle", "org.apache.spark.sql.catalyst.expressions.Uuid", // The example calls methods that return unstable results. 
- "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection", - // Unsupported format of the examples, - // and `SELECT '%SystemDrive%\Users\John' _FUNC_ '%SystemDrive%\\Users.*'` fails: - // Error in query: - // extraneous input ''%SystemDrive%\\Users.*'' expecting (line 1, pos 41) - "org.apache.spark.sql.catalyst.expressions.RLike") + "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection") withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { spark.sessionState.functionRegistry.listFunction().foreach { funcId => From 4a328bb508ca94b2d6c1a03b11bafcfb7b1c26ca Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 23:58:33 +0300 Subject: [PATCH 21/24] Disable scalastyle checker for tabs in RLIKE --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 3911b014bd590..79fe05f8087a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -153,6 +153,7 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi } } +// scalastyle:off line.contains.tab @ExpressionDescription( usage = "str _FUNC_ regexp - Returns true if `str` matches `regexp`, or false otherwise.", arguments = """ @@ -183,6 +184,7 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi Use LIKE to match with simple string pattern. """, since = "1.0.0") +// scalastyle:on line.contains.tab case class RLike(left: Expression, right: Expression) extends StringRegexExpression { override def escape(v: String): String = v From dcd9816b5374266dfec64510559bf753c74003f7 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 26 Sep 2019 23:58:50 +0300 Subject: [PATCH 22/24] Fix another test which checks presence of _FUNC_ --- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index efc1224b0f687..bd5bb6c439f6f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -128,7 +128,9 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { // _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)` "org.apache.spark.sql.catalyst.expressions.Remainder", // Examples demonstrate alternative names, see SPARK-20749 - "org.apache.spark.sql.catalyst.expressions.Length") + "org.apache.spark.sql.catalyst.expressions.Length", + // Uses settings without _FUNC_ in `SET spark.sql.parser.escapedStringLiterals=true` + "org.apache.spark.sql.catalyst.expressions.RLike") spark.sessionState.functionRegistry.listFunction().foreach { funcId => val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) val className = info.getClassName From 41e4d7cc844f47bc1ba5b93e8923fa90bc9bda36 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 27 Sep 2019 08:44:43 +0300 Subject: [PATCH 23/24] Run the test in parallel --- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bd5bb6c439f6f..fa63f3baf1535 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -168,7 +168,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection") withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { - spark.sessionState.functionRegistry.listFunction().foreach { funcId => + spark.sessionState.functionRegistry.listFunction().par.foreach { funcId => val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) val className = info.getClassName if (!ignoreSet.contains(className)) { From cee6709d24c41ebd1c17bad4b2acd26b24d67d08 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 27 Sep 2019 08:50:43 +0300 Subject: [PATCH 24/24] Remove logTrace --- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index fa63f3baf1535..9488bbe3a2539 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -174,18 +174,15 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { if (!ignoreSet.contains(className)) { withClue(s"Function '${info.getName}', Expression class '$className'") { val example = info.getExamples - logTrace(example) checkExampleSyntax(example) example.split(" > ").toList.foreach(_ match { case exampleRe(sql, output) => val df = spark.sql(sql) val actual = unindentAndTrim( hiveResultString(df.queryExecution.executedPlan).mkString("\n")) - logTrace(s"Actual: $actual") val expected = unindentAndTrim(output) - logTrace(s"Expected: $expected") assert(actual === expected) - case notMatched => logTrace(notMatched) + case _ => }) } }
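
The 24 patches above converge on a compact pipeline: verify that every " > " prompt in ExpressionDescription.getExamples is balanced by a terminating ";", split the examples text on " > ", match each fragment against """^(.+);\n(?s)(.+)$""".r, and compare the normalized expected output against the normalized hiveResultString of the executed plan. The standalone sketch below distills only the parsing and normalization steps so they can be run without a Spark session; the `examples` text in it is hypothetical, and the real test additionally executes each extracted statement through spark.sql and consults the ignore list before asserting.

// Spark-free sketch of the example-parsing logic from the final test; the
// `examples` text is made up, only the regexes and the normalization mirror
// the patched SQLQuerySuite.
object ExampleParsingSketch {

  // A SQL statement starts after " > " and ends at the first ";\n";
  // the inline (?s) flag lets the expected output span multiple lines.
  private val exampleRe = """^(.+);\n(?s)(.+)$""".r
  private val beginSqlStmtRe = " > ".r
  private val endSqlStmtRe = ";\n".r

  // Collapses the indentation of every continuation line — the same
  // normalization the test applies to both sides of its assertion.
  private def unindentAndTrim(s: String): String =
    s.replaceAll("\n\\s+", "\n").trim

  // Each " > " prompt must be closed by a ";\n"; otherwise splitting on
  // " > " would glue a statement onto the previous expected output.
  private def checkExampleSyntax(example: String): Unit = {
    val begin = beginSqlStmtRe.findAllIn(example).length
    val end = endSqlStmtRe.findAllIn(example).length
    require(begin == end, s"unbalanced example: $begin ' > ' vs $end ';'")
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical ExpressionDescription-style examples, including a
    // multi-line expected output of the kind explode() produces.
    val examples =
      """
    Examples:
      > SELECT length('Spark SQL');
       9
      > SELECT explode(array(10, 20));
       10
       20
  """

    checkExampleSyntax(examples)
    examples.split(" > ").toList.foreach {
      case exampleRe(sql, output) =>
        println(s"sql     : ${sql.trim}")
        println(s"expected: ${unindentAndTrim(output)}")
      case _ => // the leading "Examples:" fragment carries no statement
    }
  }
}

Placing (?s) after the ";\n" rather than at the start of the pattern is what keeps the first group anchored to a single statement line; with the flag at the front, the greedy first group could swallow everything up to the last ";\n" and merge several examples into one statement, while the mid-pattern placement still lets the second group absorb the multi-line outputs that patch 15 started to support.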