From efb2f4a78503cd9da3cc88ad0b35c81dc2560078 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 19 Sep 2024 23:33:35 -0700 Subject: [PATCH 1/9] add support for duplicate keys in from_json(_, 'variant') --- .../expressions/jsonExpressions.scala | 5 ++- .../spark/sql/VariantEndToEndSuite.scala | 32 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 7005d663a3f96..84ed14dcd09a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -717,9 +717,12 @@ case class JsonToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) + private val variantAllowDuplicateKeys = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS) + override def nullSafeEval(json: Any): Any = nullableSchema match { case _: VariantType => - VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String]) + VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String], + allowDuplicateKeys = variantAllowDuplicateKeys) case _ => converter(parser.parse(json.asInstanceOf[UTF8String])) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala index 3224baf42f3e5..19d4ac23709b6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql +import org.apache.spark.SparkThrowable import org.apache.spark.sql.QueryTest.sameRows import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} @@ -28,6 +29,7 @@ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarArray import org.apache.spark.types.variant.VariantBuilder +import org.apache.spark.types.variant.VariantUtil._ import org.apache.spark.unsafe.types.VariantVal class VariantEndToEndSuite extends QueryTest with SharedSparkSession { @@ -37,8 +39,10 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { def check(input: String, output: String = null): Unit = { val df = Seq(input).toDF("v") val variantDF = df.select(to_json(parse_json(col("v")))) + val variantDF2 = df.select(to_json(from_json(col("v"), VariantType))) val expected = if (output != null) output else input checkAnswer(variantDF, Seq(Row(expected))) + checkAnswer(variantDF2, Seq(Row(expected))) } check("null") @@ -339,4 +343,32 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { } } } + + test("from_json(_, 'variant') with duplicate keys") { + val json: String = """{"a": 1, "b": 2, "c": "3", "a": 4}""" + withSQLConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS.key -> "true") { + val df = Seq(json).toDF("j") + .selectExpr("from_json(j,'variant')") + val actual = df.collect().head(0).asInstanceOf[VariantVal] + val expectedValue: Array[Byte] = Array(objectHeader(false, 1, 1), + /* size */ 3, + /* id list */ 0, 1, 2, + /* offset list */ 4, 0, 2, 6, + /* field data */ primitiveHeader(INT1), 2, shortStrHeader(1), '3', + primitiveHeader(INT1), 4) + val expectedMetadata: Array[Byte] = Array(VERSION, 3, 0, 1, 2, 3, 'a', 'b', 'c') + assert(actual === new VariantVal(expectedValue, expectedMetadata)) + } + withSQLConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS.key -> "false") { + val df = Seq(json).toDF("j") + .selectExpr("from_json(j,'variant')") + checkError( + exception = intercept[SparkThrowable] { + df.collect() + }, + condition = "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION", + parameters = Map("badRecord" -> json, "failFastMode" -> "FAILFAST") + ) + } + } } From 867288371a8ca885a6132d807279ee30e415679a Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 20 Sep 2024 00:55:23 -0700 Subject: [PATCH 2/9] Addressed @MaxGekk's comment --- .../sql/catalyst/expressions/jsonExpressions.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 84ed14dcd09a4..ea5c2ac388a42 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -632,7 +632,8 @@ case class JsonToStructs( schema: DataType, options: Map[String, String], child: Expression, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + variantAllowDuplicateKeys: Boolean = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS)) extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback @@ -717,8 +718,6 @@ case class JsonToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) - private val variantAllowDuplicateKeys = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS) - override def nullSafeEval(json: Any): Any = nullableSchema match { case _: VariantType => VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String], @@ -740,6 +739,12 @@ case class JsonToStructs( copy(child = newChild) } +object JsonToStructs { + def unapply( + j: JsonToStructs): Option[(DataType, Map[String, String], Expression, Option[String])] = + Some((j.schema, j.options, j.child, j.timeZoneId)) +} + /** * Converts a [[StructType]], [[ArrayType]] or [[MapType]] to a JSON output string. */ From 57a71aa5e8a0dd930460e2eb72a6717d51a93b28 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 20 Sep 2024 12:26:11 -0700 Subject: [PATCH 3/9] regenerated golden files --- .../function_from_json.explain | 2 +- .../function_from_json_orphaned.explain | 2 +- ...unction_from_json_with_json_schema.explain | 2 +- .../analyzer-results/ansi/date.sql.out | 2 +- .../analyzer-results/ansi/interval.sql.out | 6 ++-- .../ansi/parse-schema-string.sql.out | 4 +-- .../analyzer-results/ansi/timestamp.sql.out | 2 +- .../sql-tests/analyzer-results/date.sql.out | 2 +- .../analyzer-results/datetime-legacy.sql.out | 4 +-- .../analyzer-results/interval.sql.out | 6 ++-- .../analyzer-results/json-functions.sql.out | 34 +++++++++---------- .../parse-schema-string.sql.out | 4 +-- .../sql-session-variables.sql.out | 2 +- .../subexp-elimination.sql.out | 10 +++--- .../analyzer-results/timestamp.sql.out | 2 +- .../timestampNTZ/timestamp-ansi.sql.out | 2 +- .../timestampNTZ/timestamp.sql.out | 2 +- .../native/stringCastAndExpressions.sql.out | 2 +- 18 files changed, 45 insertions(+), 45 deletions(-) diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain index 1219f11d4696e..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain @@ -1,2 +1,2 @@ -Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain index 1219f11d4696e..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain @@ -1,2 +1,2 @@ -Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain index 1219f11d4696e..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain @@ -1,2 +1,2 @@ -Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out index fd927b99c6456..0e4d2d4e99e26 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out @@ -736,7 +736,7 @@ Project [to_date(26/October/2015, Some(dd/MMMMM/yyyy), Some(America/Los_Angeles) -- !query select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"d":"26/October/2015"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"d":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out index 12756576ded9b..4ed020c46e38b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out @@ -2074,7 +2074,7 @@ SELECT to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), from_csv(to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), 'a interval year, b interval month') -- !query analysis -Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +- OneRowRelation @@ -2085,7 +2085,7 @@ SELECT to_json(map('a', interval 100 day 130 minute)), from_json(to_json(map('a', interval 100 day 130 minute)), 'a interval day to minute') -- !query analysis -Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +- OneRowRelation @@ -2096,7 +2096,7 @@ SELECT to_json(map('a', interval 32 year 10 month)), from_json(to_json(map('a', interval 32 year 10 month)), 'a interval year to month') -- !query analysis -Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out index 45fc3bd03a782..ae8e47ed3665c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out @@ -16,12 +16,12 @@ Project [from_csv(StructField(cube,IntegerType,true), 1, Some(America/Los_Angele -- !query select from_json('{"create":1}', 'create INT') -- !query analysis -Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles)) AS from_json({"create":1})#x] +Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles), false) AS from_json({"create":1})#x] +- OneRowRelation -- !query select from_json('{"cube":1}', 'cube INT') -- !query analysis -Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles)) AS from_json({"cube":1})#x] +Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles), false) AS from_json({"cube":1})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out index bf34490d657e3..560974d28c545 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out @@ -730,7 +730,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out index 48137e06467e8..88c7d7b4e7d72 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out @@ -811,7 +811,7 @@ Project [to_date(26/October/2015, Some(dd/MMMMM/yyyy), Some(America/Los_Angeles) -- !query select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"d":"26/October/2015"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"d":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out index 1e49f4df8267a..4221db822d024 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out @@ -811,7 +811,7 @@ Project [to_date(26/October/2015, Some(dd/MMMMM/yyyy), Some(America/Los_Angeles) -- !query select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"d":"26/October/2015"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"d":"26/October/2015"})#x] +- OneRowRelation @@ -1833,7 +1833,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out index 290e55052931d..d778f9983b78a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out @@ -2074,7 +2074,7 @@ SELECT to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), from_csv(to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), 'a interval year, b interval month') -- !query analysis -Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +- OneRowRelation @@ -2085,7 +2085,7 @@ SELECT to_json(map('a', interval 100 day 130 minute)), from_json(to_json(map('a', interval 100 day 130 minute)), 'a interval day to minute') -- !query analysis -Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +- OneRowRelation @@ -2096,7 +2096,7 @@ SELECT to_json(map('a', interval 32 year 10 month)), from_json(to_json(map('a', interval 32 year 10 month)), 'a interval year to month') -- !query analysis -Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out index 0d7c6b2056231..fef9d0c5b6250 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out @@ -118,14 +118,14 @@ org.apache.spark.sql.AnalysisException -- !query select from_json('{"a":1}', 'a INT') -- !query analysis -Project [from_json(StructField(a,IntegerType,true), {"a":1}, Some(America/Los_Angeles)) AS from_json({"a":1})#x] +Project [from_json(StructField(a,IntegerType,true), {"a":1}, Some(America/Los_Angeles), false) AS from_json({"a":1})#x] +- OneRowRelation -- !query select from_json('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) -- !query analysis -Project [from_json(StructField(time,TimestampType,true), (timestampFormat,dd/MM/yyyy), {"time":"26/08/2015"}, Some(America/Los_Angeles)) AS from_json({"time":"26/08/2015"})#x] +Project [from_json(StructField(time,TimestampType,true), (timestampFormat,dd/MM/yyyy), {"time":"26/08/2015"}, Some(America/Los_Angeles), false) AS from_json({"time":"26/08/2015"})#x] +- OneRowRelation @@ -279,14 +279,14 @@ DropTempViewCommand jsonTable -- !query select from_json('{"a":1, "b":2}', 'map') -- !query analysis -Project [from_json(MapType(StringType,IntegerType,true), {"a":1, "b":2}, Some(America/Los_Angeles)) AS entries#x] +Project [from_json(MapType(StringType,IntegerType,true), {"a":1, "b":2}, Some(America/Los_Angeles), false) AS entries#x] +- OneRowRelation -- !query select from_json('{"a":1, "b":"2"}', 'struct') -- !query analysis -Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), {"a":1, "b":"2"}, Some(America/Los_Angeles)) AS from_json({"a":1, "b":"2"})#x] +Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), {"a":1, "b":"2"}, Some(America/Los_Angeles), false) AS from_json({"a":1, "b":"2"})#x] +- OneRowRelation @@ -300,70 +300,70 @@ Project [schema_of_json({"c1":0, "c2":[1]}) AS schema_of_json({"c1":0, "c2":[1]} -- !query select from_json('{"c1":[1, 2, 3]}', schema_of_json('{"c1":[0]}')) -- !query analysis -Project [from_json(StructField(c1,ArrayType(LongType,true),true), {"c1":[1, 2, 3]}, Some(America/Los_Angeles)) AS from_json({"c1":[1, 2, 3]})#x] +Project [from_json(StructField(c1,ArrayType(LongType,true),true), {"c1":[1, 2, 3]}, Some(America/Los_Angeles), false) AS from_json({"c1":[1, 2, 3]})#x] +- OneRowRelation -- !query select from_json('[1, 2, 3]', 'array') -- !query analysis -Project [from_json(ArrayType(IntegerType,true), [1, 2, 3], Some(America/Los_Angeles)) AS from_json([1, 2, 3])#x] +Project [from_json(ArrayType(IntegerType,true), [1, 2, 3], Some(America/Los_Angeles), false) AS from_json([1, 2, 3])#x] +- OneRowRelation -- !query select from_json('[1, "2", 3]', 'array') -- !query analysis -Project [from_json(ArrayType(IntegerType,true), [1, "2", 3], Some(America/Los_Angeles)) AS from_json([1, "2", 3])#x] +Project [from_json(ArrayType(IntegerType,true), [1, "2", 3], Some(America/Los_Angeles), false) AS from_json([1, "2", 3])#x] +- OneRowRelation -- !query select from_json('[1, 2, null]', 'array') -- !query analysis -Project [from_json(ArrayType(IntegerType,true), [1, 2, null], Some(America/Los_Angeles)) AS from_json([1, 2, null])#x] +Project [from_json(ArrayType(IntegerType,true), [1, 2, null], Some(America/Los_Angeles), false) AS from_json([1, 2, null])#x] +- OneRowRelation -- !query select from_json('[{"a": 1}, {"a":2}]', 'array>') -- !query analysis -Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [{"a": 1}, {"a":2}], Some(America/Los_Angeles)) AS from_json([{"a": 1}, {"a":2}])#x] +Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [{"a": 1}, {"a":2}], Some(America/Los_Angeles), false) AS from_json([{"a": 1}, {"a":2}])#x] +- OneRowRelation -- !query select from_json('{"a": 1}', 'array>') -- !query analysis -Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), {"a": 1}, Some(America/Los_Angeles)) AS from_json({"a": 1})#x] +Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), {"a": 1}, Some(America/Los_Angeles), false) AS from_json({"a": 1})#x] +- OneRowRelation -- !query select from_json('[null, {"a":2}]', 'array>') -- !query analysis -Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [null, {"a":2}], Some(America/Los_Angeles)) AS from_json([null, {"a":2}])#x] +Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [null, {"a":2}], Some(America/Los_Angeles), false) AS from_json([null, {"a":2}])#x] +- OneRowRelation -- !query select from_json('[{"a": 1}, {"b":2}]', 'array>') -- !query analysis -Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, {"b":2}], Some(America/Los_Angeles)) AS from_json([{"a": 1}, {"b":2}])#x] +Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, {"b":2}], Some(America/Los_Angeles), false) AS from_json([{"a": 1}, {"b":2}])#x] +- OneRowRelation -- !query select from_json('[{"a": 1}, 2]', 'array>') -- !query analysis -Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, 2], Some(America/Los_Angeles)) AS from_json([{"a": 1}, 2])#x] +Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, 2], Some(America/Los_Angeles), false) AS from_json([{"a": 1}, 2])#x] +- OneRowRelation -- !query select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp') -- !query analysis -Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), {"d": "2012-12-15", "t": "2012-12-15 15:15:15"}, Some(America/Los_Angeles)) AS from_json({"d": "2012-12-15", "t": "2012-12-15 15:15:15"})#x] +Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), {"d": "2012-12-15", "t": "2012-12-15 15:15:15"}, Some(America/Los_Angeles), false) AS from_json({"d": "2012-12-15", "t": "2012-12-15 15:15:15"})#x] +- OneRowRelation @@ -373,7 +373,7 @@ select from_json( 'd date, t timestamp', map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss')) -- !query analysis -Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), (dateFormat,MM/dd yyyy), (timestampFormat,MM/dd yyyy HH:mm:ss), {"d": "12/15 2012", "t": "12/15 2012 15:15:15"}, Some(America/Los_Angeles)) AS from_json({"d": "12/15 2012", "t": "12/15 2012 15:15:15"})#x] +Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), (dateFormat,MM/dd yyyy), (timestampFormat,MM/dd yyyy HH:mm:ss), {"d": "12/15 2012", "t": "12/15 2012 15:15:15"}, Some(America/Los_Angeles), false) AS from_json({"d": "12/15 2012", "t": "12/15 2012 15:15:15"})#x] +- OneRowRelation @@ -383,7 +383,7 @@ select from_json( 'd date', map('dateFormat', 'MM-dd')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,MM-dd), {"d": "02-29"}, Some(America/Los_Angeles)) AS from_json({"d": "02-29"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,MM-dd), {"d": "02-29"}, Some(America/Los_Angeles), false) AS from_json({"d": "02-29"})#x] +- OneRowRelation @@ -393,7 +393,7 @@ select from_json( 't timestamp', map('timestampFormat', 'MM-dd')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,MM-dd), {"t": "02-29"}, Some(America/Los_Angeles)) AS from_json({"t": "02-29"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,MM-dd), {"t": "02-29"}, Some(America/Los_Angeles), false) AS from_json({"t": "02-29"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out index 45fc3bd03a782..ae8e47ed3665c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out @@ -16,12 +16,12 @@ Project [from_csv(StructField(cube,IntegerType,true), 1, Some(America/Los_Angele -- !query select from_json('{"create":1}', 'create INT') -- !query analysis -Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles)) AS from_json({"create":1})#x] +Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles), false) AS from_json({"create":1})#x] +- OneRowRelation -- !query select from_json('{"cube":1}', 'cube INT') -- !query analysis -Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles)) AS from_json({"cube":1})#x] +Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles), false) AS from_json({"cube":1})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out index eb48f0d9a28f0..7c8908711a8eb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out @@ -2151,7 +2151,7 @@ CreateVariable defaultvalueexpression(cast(a INT as string), 'a INT'), true -- !query SELECT from_json('{"a": 1}', var1) -- !query analysis -Project [from_json(StructField(a,IntegerType,true), {"a": 1}, Some(America/Los_Angeles)) AS from_json({"a": 1})#x] +Project [from_json(StructField(a,IntegerType,true), {"a": 1}, Some(America/Los_Angeles), false) AS from_json({"a": 1})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out index 94073f2751b3e..754b05bfa6fed 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out @@ -15,7 +15,7 @@ AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true -- !query SELECT from_json(a, 'struct').a, from_json(a, 'struct').b, from_json(b, 'array>')[0].a, from_json(b, 'array>')[0].b FROM testData -- !query analysis -Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a AS from_json(a).a#x, from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b AS from_json(a).b#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].a AS from_json(b)[0].a#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b AS from_json(b)[0].b#x] +Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a AS from_json(a).a#x, from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b AS from_json(a).b#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].a AS from_json(b)[0].a#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b AS from_json(b)[0].b#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -27,7 +27,7 @@ Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,tru -- !query SELECT if(from_json(a, 'struct').a > 1, from_json(b, 'array>')[0].a, from_json(b, 'array>')[0].a + 1) FROM testData -- !query analysis -Project [if ((from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 1)) from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].a else (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].a + 1) AS (IF((from_json(a).a > 1), from_json(b)[0].a, (from_json(b)[0].a + 1)))#x] +Project [if ((from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 1)) from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].a else (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].a + 1) AS (IF((from_json(a).a > 1), from_json(b)[0].a, (from_json(b)[0].a + 1)))#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -39,7 +39,7 @@ Project [if ((from_json(StructField(a,IntegerType,true), StructField(b,StringTyp -- !query SELECT if(isnull(from_json(a, 'struct').a), from_json(b, 'array>')[0].b + 1, from_json(b, 'array>')[0].b) FROM testData -- !query analysis -Project [if (isnull(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a)) (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b + 1) else from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b AS (IF((from_json(a).a IS NULL), (from_json(b)[0].b + 1), from_json(b)[0].b))#x] +Project [if (isnull(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a)) (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b + 1) else from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b AS (IF((from_json(a).a IS NULL), (from_json(b)[0].b + 1), from_json(b)[0].b))#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -51,7 +51,7 @@ Project [if (isnull(from_json(StructField(a,IntegerType,true), StructField(b,Str -- !query SELECT case when from_json(a, 'struct').a > 5 then from_json(a, 'struct').b when from_json(a, 'struct').a > 4 then from_json(a, 'struct').b + 1 else from_json(a, 'struct').b + 2 end FROM testData -- !query analysis -Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 5) THEN from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 4) THEN cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b as double) + cast(1 as double)) as string) ELSE cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b as double) + cast(2 as double)) as string) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(a).b WHEN (from_json(a).a > 4) THEN (from_json(a).b + 1) ELSE (from_json(a).b + 2) END#x] +Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 5) THEN from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 4) THEN cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b as double) + cast(1 as double)) as string) ELSE cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b as double) + cast(2 as double)) as string) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(a).b WHEN (from_json(a).a > 4) THEN (from_json(a).b + 1) ELSE (from_json(a).b + 2) END#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -63,7 +63,7 @@ Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,Str -- !query SELECT case when from_json(a, 'struct').a > 5 then from_json(b, 'array>')[0].b when from_json(a, 'struct').a > 4 then from_json(b, 'array>')[0].b + 1 else from_json(b, 'array>')[0].b + 2 end FROM testData -- !query analysis -Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 5) THEN from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 4) THEN (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b + 1) ELSE (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b + 2) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(b)[0].b WHEN (from_json(a).a > 4) THEN (from_json(b)[0].b + 1) ELSE (from_json(b)[0].b + 2) END#x] +Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 5) THEN from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 4) THEN (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b + 1) ELSE (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b + 2) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(b)[0].b WHEN (from_json(a).a > 4) THEN (from_json(b)[0].b + 1) ELSE (from_json(b)[0].b + 2) END#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out index 6ca35b8b141dc..dcfd783b648f8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out @@ -802,7 +802,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out index e50c860270563..ec227afc87fe1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out @@ -745,7 +745,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out index 098abfb3852cf..7475f837250d5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out @@ -805,7 +805,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out index 009e91f7ffacf..22e60d0606382 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -370,7 +370,7 @@ Project [c0#x] -- !query select from_json(a, 'a INT') from t -- !query analysis -Project [from_json(StructField(a,IntegerType,true), a#x, Some(America/Los_Angeles)) AS from_json(a)#x] +Project [from_json(StructField(a,IntegerType,true), a#x, Some(America/Los_Angeles), false) AS from_json(a)#x] +- SubqueryAlias t +- View (`t`, [a#x]) +- Project [cast(a#x as string) AS a#x] From 2411e272be8c0953f82625285b0075dfbf738d57 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 5 Dec 2024 13:05:43 -0800 Subject: [PATCH 4/9] Prohibit Variant from repartition --- .../resources/error/error-conditions.json | 5 ++++ .../sql/catalyst/analysis/CheckAnalysis.scala | 15 ++++++++++ .../sql/DataFrameSetOperationsSuite.scala | 30 +++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index b18db93f6291a..d513f1ec7d654 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5375,6 +5375,11 @@ "Cannot have VARIANT type columns in DataFrame which calls set operations (INTERSECT, EXCEPT, etc.), but the type of column is ." ] }, + "PARTITION_BY_VARIANT" : { + "message" : [ + "Cannot use VARIANT type expressions to partition a DataFrame, but the type of expression is ." + ] + }, "SET_PROPERTIES_AND_DBPROPERTIES" : { "message" : [ "set PROPERTIES and DBPROPERTIES at the same time." diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index d4b97ff037f36..d9a8b0a7a7fed 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -96,6 +96,13 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case _ => None } + protected def variantExprInPartitionExpression(plan: LogicalPlan): Option[Expression] = + plan match { + case r: RepartitionByExpression => + r.partitionExpressions.find(e => hasVariantType(e.dataType)) + case _ => None + } + private def checkLimitLikeClause(name: String, limitExpr: Expression): Unit = { limitExpr match { case e if !e.foldable => limitExpr.failAnalysis( @@ -849,6 +856,14 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB "colName" -> toSQLId(variantCol.name), "dataType" -> toSQLType(variantCol.dataType))) + case o if variantExprInPartitionExpression(o).isDefined => + val variantExpr = variantExprInPartitionExpression(o).get + o.failAnalysis( + errorClass = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + messageParameters = Map( + "expr" -> variantExpr.sql, + "dataType" -> toSQLType(variantExpr.dataType))) + case o if o.expressions.exists(!_.deterministic) && !operatorAllowsNonDeterministicExpressions(o) && !o.isInstanceOf[Project] && diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index 9c182be0f7dd6..d54fb202c6cc9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -350,6 +350,36 @@ class DataFrameSetOperationsSuite extends QueryTest dates.intersect(widenTypedRows).collect() } + test("SPARK-50503 - cannot partition by variant columns") { + val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + + " as v, id % 5 as id, named_struct('v', parse_json(id::string)) s from range(0, 100, 1, 5)") + // variant column + checkError( + exception = intercept[AnalysisException](df.repartition(5, col("v"))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "v", + "dataType" -> "\"VARIANT\"") + ) + // nested variant column + checkError( + exception = intercept[AnalysisException](df.repartition(5, col("s"))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "s", + "dataType" -> "\"STRUCT\"") + ) + // variant producing expression + checkError( + exception = + intercept[AnalysisException](df.repartition(5, parse_json(col("id").cast("string")))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "parse_json(CAST(id AS STRING))", + "dataType" -> "\"VARIANT\"") + ) + } + test("SPARK-50373 - cannot run set operations with variant type") { val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + " as v, id % 5 as id from range(0, 100, 1, 5)") From 5ce4c07f9325a82cea2d9236aaa5cb916965e328 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 5 Dec 2024 13:19:59 -0800 Subject: [PATCH 5/9] added distribute by test case --- .../src/main/resources/error/error-conditions.json | 2 +- .../spark/sql/DataFrameSetOperationsSuite.scala | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index d513f1ec7d654..497903f183567 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5377,7 +5377,7 @@ }, "PARTITION_BY_VARIANT" : { "message" : [ - "Cannot use VARIANT type expressions to partition a DataFrame, but the type of expression is ." + "Cannot use VARIANT producing expressions to partition a DataFrame, but the type of expression is ." ] }, "SET_PROPERTIES_AND_DBPROPERTIES" : { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index d54fb202c6cc9..74c89a1fb385d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -378,6 +378,20 @@ class DataFrameSetOperationsSuite extends QueryTest "expr" -> "parse_json(CAST(id AS STRING))", "dataType" -> "\"VARIANT\"") ) + withTempView("tv") { + df.createOrReplaceTempView("tv") + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY v")), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "tv.v", + "dataType" -> "\"VARIANT\""), + context = ExpectedContext( + fragment = "DISTRIBUTE BY v", + start = 17, + stop = 31) + ) + } } test("SPARK-50373 - cannot run set operations with variant type") { From 429bd56b4b680758a9e2dc744d0fed847de3d881 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 5 Dec 2024 13:26:59 -0800 Subject: [PATCH 6/9] added another test --- .../spark/sql/DataFrameSetOperationsSuite.scala | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index 74c89a1fb385d..336631c2b2d46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -378,6 +378,7 @@ class DataFrameSetOperationsSuite extends QueryTest "expr" -> "parse_json(CAST(id AS STRING))", "dataType" -> "\"VARIANT\"") ) + // SQL withTempView("tv") { df.createOrReplaceTempView("tv") checkError( @@ -391,6 +392,17 @@ class DataFrameSetOperationsSuite extends QueryTest start = 17, stop = 31) ) + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY s")), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "tv.s", + "dataType" -> "\"STRUCT\""), + context = ExpectedContext( + fragment = "DISTRIBUTE BY s", + start = 17, + stop = 31) + ) } } From 40436ed47b1ef43aa6b2170e8a98880adca92a13 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 5 Dec 2024 13:42:44 -0800 Subject: [PATCH 7/9] added another test --- .../org/apache/spark/sql/DataFrameSetOperationsSuite.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index 336631c2b2d46..14b364f524b65 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -378,6 +378,13 @@ class DataFrameSetOperationsSuite extends QueryTest "expr" -> "parse_json(CAST(id AS STRING))", "dataType" -> "\"VARIANT\"") ) + // Partitioning by non-variant column works + try { + df.repartition(5, col("id")).collect() + } catch { + case e: Exception => + fail(s"Expected no exception to be thrown but an exception was thrown: ${e.getMessage}") + } // SQL withTempView("tv") { df.createOrReplaceTempView("tv") From 365bdfc13d9a3862dd240de5f3ef95f6e877fa7f Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 5 Dec 2024 13:50:12 -0800 Subject: [PATCH 8/9] moved to different suite --- .../sql/DataFrameSetOperationsSuite.scala | 63 ------------------- .../org/apache/spark/sql/DataFrameSuite.scala | 63 +++++++++++++++++++ 2 files changed, 63 insertions(+), 63 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index 14b364f524b65..9c182be0f7dd6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -350,69 +350,6 @@ class DataFrameSetOperationsSuite extends QueryTest dates.intersect(widenTypedRows).collect() } - test("SPARK-50503 - cannot partition by variant columns") { - val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + - " as v, id % 5 as id, named_struct('v', parse_json(id::string)) s from range(0, 100, 1, 5)") - // variant column - checkError( - exception = intercept[AnalysisException](df.repartition(5, col("v"))), - condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", - parameters = Map( - "expr" -> "v", - "dataType" -> "\"VARIANT\"") - ) - // nested variant column - checkError( - exception = intercept[AnalysisException](df.repartition(5, col("s"))), - condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", - parameters = Map( - "expr" -> "s", - "dataType" -> "\"STRUCT\"") - ) - // variant producing expression - checkError( - exception = - intercept[AnalysisException](df.repartition(5, parse_json(col("id").cast("string")))), - condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", - parameters = Map( - "expr" -> "parse_json(CAST(id AS STRING))", - "dataType" -> "\"VARIANT\"") - ) - // Partitioning by non-variant column works - try { - df.repartition(5, col("id")).collect() - } catch { - case e: Exception => - fail(s"Expected no exception to be thrown but an exception was thrown: ${e.getMessage}") - } - // SQL - withTempView("tv") { - df.createOrReplaceTempView("tv") - checkError( - exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY v")), - condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", - parameters = Map( - "expr" -> "tv.v", - "dataType" -> "\"VARIANT\""), - context = ExpectedContext( - fragment = "DISTRIBUTE BY v", - start = 17, - stop = 31) - ) - checkError( - exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY s")), - condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", - parameters = Map( - "expr" -> "tv.s", - "dataType" -> "\"STRUCT\""), - context = ExpectedContext( - fragment = "DISTRIBUTE BY s", - start = 17, - stop = 31) - ) - } - } - test("SPARK-50373 - cannot run set operations with variant type") { val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + " as v, id % 5 as id from range(0, 100, 1, 5)") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c1d977dad82d2..0972a63a2495c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -308,6 +308,69 @@ class DataFrameSuite extends QueryTest testData.select("key").collect().toSeq) } + test("SPARK-50503 - cannot partition by variant columns") { + val df = sql("select parse_json(case when id = 0 then 'null' else '1' end)" + + " as v, id % 5 as id, named_struct('v', parse_json(id::string)) s from range(0, 100, 1, 5)") + // variant column + checkError( + exception = intercept[AnalysisException](df.repartition(5, col("v"))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "v", + "dataType" -> "\"VARIANT\"") + ) + // nested variant column + checkError( + exception = intercept[AnalysisException](df.repartition(5, col("s"))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "s", + "dataType" -> "\"STRUCT\"") + ) + // variant producing expression + checkError( + exception = + intercept[AnalysisException](df.repartition(5, parse_json(col("id").cast("string")))), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "parse_json(CAST(id AS STRING))", + "dataType" -> "\"VARIANT\"") + ) + // Partitioning by non-variant column works + try { + df.repartition(5, col("id")).collect() + } catch { + case e: Exception => + fail(s"Expected no exception to be thrown but an exception was thrown: ${e.getMessage}") + } + // SQL + withTempView("tv") { + df.createOrReplaceTempView("tv") + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY v")), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "tv.v", + "dataType" -> "\"VARIANT\""), + context = ExpectedContext( + fragment = "DISTRIBUTE BY v", + start = 17, + stop = 31) + ) + checkError( + exception = intercept[AnalysisException](sql("SELECT * FROM tv DISTRIBUTE BY s")), + condition = "UNSUPPORTED_FEATURE.PARTITION_BY_VARIANT", + parameters = Map( + "expr" -> "tv.s", + "dataType" -> "\"STRUCT\""), + context = ExpectedContext( + fragment = "DISTRIBUTE BY s", + start = 17, + stop = 31) + ) + } + } + test("repartition with SortOrder") { // passing SortOrder expressions to .repartition() should result in an informative error From b9e472e3a6e01a0ad3d062607a708db6fdf9e2c6 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 6 Dec 2024 10:57:41 -0800 Subject: [PATCH 9/9] fixed order of error conditions --- .../src/main/resources/error/error-conditions.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 497903f183567..bca793ba8e449 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -5303,6 +5303,11 @@ "Parameter markers are not allowed in ." ] }, + "PARTITION_BY_VARIANT" : { + "message" : [ + "Cannot use VARIANT producing expressions to partition a DataFrame, but the type of expression is ." + ] + }, "PARTITION_WITH_NESTED_COLUMN_IS_UNSUPPORTED" : { "message" : [ "Invalid partitioning: is missing or is in a map or array." @@ -5375,11 +5380,6 @@ "Cannot have VARIANT type columns in DataFrame which calls set operations (INTERSECT, EXCEPT, etc.), but the type of column is ." ] }, - "PARTITION_BY_VARIANT" : { - "message" : [ - "Cannot use VARIANT producing expressions to partition a DataFrame, but the type of expression is ." - ] - }, "SET_PROPERTIES_AND_DBPROPERTIES" : { "message" : [ "set PROPERTIES and DBPROPERTIES at the same time."