From ec01677fe4b61550dd5e8f4de3b380463236f7fb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 4 Feb 2020 09:12:06 -0800 Subject: [PATCH 1/3] Add legacy config for allowing empty strings for certain types in json parser. --- docs/sql-migration-guide.md | 2 +- .../sql/catalyst/json/JacksonParser.scala | 17 +++++++++-- .../apache/spark/sql/internal/SQLConf.scala | 8 ++++++ .../datasources/json/JsonSuite.scala | 28 +++++++++++++++++++ 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index a5ef1c2b1d045..d61a95bcb7b06 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -37,7 +37,7 @@ license: | - Since Spark 3.0, the Dataset and DataFrame API `unionAll` is not deprecated any more. It is an alias for `union`. - - In Spark version 2.4 and earlier, the parser of JSON data source treats empty strings as null for some data types such as `IntegerType`. For `FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. Since Spark 3.0, we disallow empty strings and will throw exceptions for data types except for `StringType` and `BinaryType`. + - In Spark version 2.4 and earlier, the parser of JSON data source treats empty strings as null for some data types such as `IntegerType`. For `FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. Since Spark 3.0, we disallow empty strings and will throw exceptions for data types except for `StringType` and `BinaryType`. The previous behaviour of allowing empty string can be restored by setting `spark.sql.legacy.json.allowEmptyString.enabled` to `true`. - Since Spark 3.0, the `from_json` functions supports two modes - `PERMISSIVE` and `FAILFAST`. The modes can be set via the `mode` option. The default mode became `PERMISSIVE`. In previous versions, behavior of `from_json` did not conform to either `PERMISSIVE` nor `FAILFAST`, especially in processing of malformed JSON records. For example, the JSON string `{"a" 1}` with the schema `a INT` is converted to `null` by previous versions but Spark 3.0 converts it to `Row(null)`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index c44025ca8bcfd..9606c48c7ce5a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -30,6 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} import org.apache.spark.util.Utils @@ -307,6 +308,8 @@ class JacksonParser( } } + private lazy val allowEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON) + /** * This function throws an exception for failed conversion. For empty string on data types * except for string and binary types, this also throws an exception. @@ -317,8 +320,18 @@ class JacksonParser( // SPARK-25040: Disallow empty strings for data types except for string and binary types. 
    case VALUE_STRING if parser.getTextLength < 1 =>
-      throw new RuntimeException(
-        s"Failed to parse an empty string for data type ${dataType.catalogString}")
+      // Legacy mode: matches the behavior prior to Spark 3.0.0.
+      if (allowEmptyString) {
+        dataType match {
+          case FloatType | DoubleType | TimestampType | DateType =>
+            throw new RuntimeException(
+              s"Failed to parse an empty string for data type ${dataType.catalogString}")
+          case _ => null
+        }
+      } else {
+        throw new RuntimeException(
+          s"Failed to parse an empty string for data type ${dataType.catalogString}")
+      }
 
     case token =>
       // We cannot parse this token based on the given data type. So, we throw a

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b94ddbdc0fc9a..2de62f82de03f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1989,6 +1989,14 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val LEGACY_ALLOW_EMPTY_STRING_IN_JSON =
+    buildConf("spark.sql.legacy.json.allowEmptyString.enabled")
+        .internal()
+        .doc("When set to true, the parser of JSON data source treats empty strings as null for " +
+        "some data types such as `IntegerType`.")
+        .booleanConf
+        .createWithDefault(false)
+
   val TRUNCATE_TABLE_IGNORE_PERMISSION_ACL =
     buildConf("spark.sql.truncateTable.ignorePermissionAcl.enabled")
       .internal()

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index d0e2e001034fb..1cf9426a6e36f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2453,6 +2453,12 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
       checkAnswer(df, Row(expected) :: Nil)
     }
 
+    def emptyStringAsNull(dataType: DataType): Unit = {
+      val df = spark.read.schema(s"a ${dataType.catalogString}")
+        .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS)
+      checkAnswer(df, Row(null) :: Nil)
+    }
+
     failedOnEmptyString(BooleanType)
     failedOnEmptyString(ByteType)
     failedOnEmptyString(ShortType)
@@ -2469,6 +2475,28 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
 
     emptyString(StringType, "")
     emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8))
+
+    // Legacy mode prior to Spark 3.0.0
+    withSQLConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON.key -> "true") {
+      emptyStringAsNull(BooleanType)
+      emptyStringAsNull(ByteType)
+      emptyStringAsNull(ShortType)
+      emptyStringAsNull(IntegerType)
+      emptyStringAsNull(LongType)
+
+      failedOnEmptyString(FloatType)
+      failedOnEmptyString(DoubleType)
+      failedOnEmptyString(TimestampType)
+      failedOnEmptyString(DateType)
+
+      emptyStringAsNull(DecimalType.SYSTEM_DEFAULT)
+      emptyStringAsNull(ArrayType(IntegerType))
+      emptyStringAsNull(MapType(StringType, IntegerType, true))
+      emptyStringAsNull(StructType(StructField("f1", IntegerType, true) :: Nil))
+
+      emptyString(StringType, "")
+      emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8))
+    }
   }
 
   test("return partial result for bad records") {

From 506c5502a1b319dcc7df72202c13e6f936cc375d Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Tue, 4 Feb 2020 11:39:26 -0800
Subject: [PATCH 2/3] For review comment.
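
This pins the JSON parser's legacy flag with a plain `val` instead of a
`lazy val`, and expands the migration guide to also list `DateType` and
`TimestampType` among the types that fail on empty strings. A likely
reason for the eager capture (an assumption on my part, not stated in
the patch): `SQLConf.get` is backed by a thread-local, so a `lazy val`
would read the conf at first use, possibly on another thread with a
different active conf, while a `val` fixes the setting when the parser
is constructed. A minimal sketch of the difference, where
`ThreadLocalConf` is a hypothetical stand-in for `SQLConf.get`:

    // Hypothetical stand-in for SQLConf.get, which reads a thread-local conf.
    object ThreadLocalConf {
      private val flag = new ThreadLocal[Boolean] {
        override def initialValue(): Boolean = false
      }
      def set(v: Boolean): Unit = flag.set(v)
      def get: Boolean = flag.get
    }

    class EagerCapture { val allow: Boolean = ThreadLocalConf.get }     // reads at construction
    class LazyCapture { lazy val allow: Boolean = ThreadLocalConf.get } // reads at first access

    ThreadLocalConf.set(true)
    val eager = new EagerCapture // captures true immediately
    val later = new LazyCapture  // captures nothing yet
    ThreadLocalConf.set(false)   // the conf changes before first use
    assert(eager.allow)          // true: fixed at construction time
    assert(!later.allow)         // false: evaluated only on this access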
--- docs/sql-migration-guide.md | 2 +- .../org/apache/spark/sql/catalyst/json/JacksonParser.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index d61a95bcb7b06..0c47370283736 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -37,7 +37,7 @@ license: | - Since Spark 3.0, the Dataset and DataFrame API `unionAll` is not deprecated any more. It is an alias for `union`. - - In Spark version 2.4 and earlier, the parser of JSON data source treats empty strings as null for some data types such as `IntegerType`. For `FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. Since Spark 3.0, we disallow empty strings and will throw exceptions for data types except for `StringType` and `BinaryType`. The previous behaviour of allowing empty string can be restored by setting `spark.sql.legacy.json.allowEmptyString.enabled` to `true`. + - In Spark version 2.4 and earlier, the parser of JSON data source treats empty strings as null for some data types such as `IntegerType`. For `FloatType`, `DoubleType`, `DateType` and `TimestampType`, it fails on empty strings and throws exceptions. Since Spark 3.0, we disallow empty strings and will throw exceptions for data types except for `StringType` and `BinaryType`. The previous behaviour of allowing empty string can be restored by setting `spark.sql.legacy.json.allowEmptyString.enabled` to `true`. - Since Spark 3.0, the `from_json` functions supports two modes - `PERMISSIVE` and `FAILFAST`. The modes can be set via the `mode` option. The default mode became `PERMISSIVE`. In previous versions, behavior of `from_json` did not conform to either `PERMISSIVE` nor `FAILFAST`, especially in processing of malformed JSON records. For example, the JSON string `{"a" 1}` with the schema `a INT` is converted to `null` by previous versions but Spark 3.0 converts it to `Row(null)`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index 9606c48c7ce5a..a7e0dd1198782 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -308,7 +308,7 @@ class JacksonParser( } } - private lazy val allowEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON) + private val allowEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON) /** * This function throws an exception for failed conversion. For empty string on data types From fd62b4b048dcb93ab65d2a62130cc3f10dcafd79 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 4 Feb 2020 13:03:42 -0800 Subject: [PATCH 3/3] For review comments. 
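
This round restructures the parser change and the tests:

 - The legacy branch is flattened into two `case VALUE_STRING` clauses,
   with the legacy check moved into the first clause's guard.
 - The new SQLConf entry gets the conventional indentation.
 - The test helpers move out of the test body so the legacy-mode checks
   can live in their own test.

Why the two guarded clauses compose cleanly (a standalone sketch, not
Spark code): a pattern guard is part of the match, so when the first
guard fails, evaluation falls through to the next case.

    // Mirrors the shape of the parser's failure-handling cases below:
    // legacy mode maps an empty string to a null-like value, otherwise
    // an empty string is an error, and everything else passes through.
    def onEmpty(legacy: Boolean): PartialFunction[String, Option[String]] = {
      case s if s.isEmpty && legacy => None
      case s if s.isEmpty => sys.error("Failed to parse an empty string")
      case s => Some(s)
    }

    assert(onEmpty(legacy = true)("").isEmpty)        // first clause matches
    assert(onEmpty(legacy = false)("x") == Some("x")) // non-empty passes through
    // onEmpty(legacy = false)("") throws, like the non-legacy parser path

End to end, the restored behavior can be exercised like this (a usage
sketch distilled from the tests; assumes a running SparkSession named
`spark` — with the flag off, the same read fails under FAILFAST mode):

    import spark.implicits._

    spark.conf.set("spark.sql.legacy.json.allowEmptyString.enabled", "true")
    val df = spark.read.schema("a INT")
      .option("mode", "FAILFAST")
      .json(Seq("""{"a":""}""").toDS)
    df.collect() // Array([null]): the empty string parses as null for IntegerType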
---
 .../sql/catalyst/json/JacksonParser.scala     | 25 ++++++-----
 .../apache/spark/sql/internal/SQLConf.scala   |  8 ++--
 .../datasources/json/JsonSuite.scala          | 43 ++++++++++---------
 3 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index a7e0dd1198782..76efa574a99ff 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -318,21 +318,20 @@ class JacksonParser(
       parser: JsonParser,
       dataType: DataType): PartialFunction[JsonToken, R] = {
 
-    // SPARK-25040: Disallow empty strings for data types except for string and binary types.
-    case VALUE_STRING if parser.getTextLength < 1 =>
-      // Legacy mode: matches the behavior prior to Spark 3.0.0.
-      if (allowEmptyString) {
-        dataType match {
-          case FloatType | DoubleType | TimestampType | DateType =>
-            throw new RuntimeException(
-              s"Failed to parse an empty string for data type ${dataType.catalogString}")
-          case _ => null
-        }
-      } else {
-        throw new RuntimeException(
-          s"Failed to parse an empty string for data type ${dataType.catalogString}")
+    // SPARK-25040: Disallows empty strings for data types except for string and binary types.
+    // But treats empty strings as null for certain types if the legacy config is enabled.
+    case VALUE_STRING if parser.getTextLength < 1 && allowEmptyString =>
+      dataType match {
+        case FloatType | DoubleType | TimestampType | DateType =>
+          throw new RuntimeException(
+            s"Failed to parse an empty string for data type ${dataType.catalogString}")
+        case _ => null
       }
 
+    case VALUE_STRING if parser.getTextLength < 1 =>
+      throw new RuntimeException(
+        s"Failed to parse an empty string for data type ${dataType.catalogString}")
+
     case token =>
       // We cannot parse this token based on the given data type. So, we throw a
       // RuntimeException and this exception will be caught by `parse` method.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 2de62f82de03f..5ce5692123805 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1991,11 +1991,11 @@ object SQLConf { val LEGACY_ALLOW_EMPTY_STRING_IN_JSON = buildConf("spark.sql.legacy.json.allowEmptyString.enabled") - .internal() - .doc("When set to true, the parser of JSON data source treats empty strings as null for " + + .internal() + .doc("When set to true, the parser of JSON data source treats empty strings as null for " + "some data types such as `IntegerType`.") - .booleanConf - .createWithDefault(false) + .booleanConf + .createWithDefault(false) val TRUNCATE_TABLE_IGNORE_PERMISSION_ACL = buildConf("spark.sql.truncateTable.ignorePermissionAcl.enabled") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 1cf9426a6e36f..b20da2266b0f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -2436,29 +2436,24 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } } - test("SPARK-25040: empty strings should be disallowed") { - def failedOnEmptyString(dataType: DataType): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - val errMessage = intercept[SparkException] { - df.collect() - }.getMessage - assert(errMessage.contains( - s"Failed to parse an empty string for data type ${dataType.catalogString}")) - } - def emptyString(dataType: DataType, expected: Any): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - checkAnswer(df, Row(expected) :: Nil) - } + private def failedOnEmptyString(dataType: DataType): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + val errMessage = intercept[SparkException] { + df.collect() + }.getMessage + assert(errMessage.contains( + s"Failed to parse an empty string for data type ${dataType.catalogString}")) + } - def emptyStringAsNull(dataType: DataType): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - checkAnswer(df, Row(null) :: Nil) - } + private def emptyString(dataType: DataType, expected: Any): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + checkAnswer(df, Row(expected) :: Nil) + } + test("SPARK-25040: empty strings should be disallowed") { failedOnEmptyString(BooleanType) failedOnEmptyString(ByteType) failedOnEmptyString(ShortType) @@ -2475,6 +2470,14 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson emptyString(StringType, "") emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8)) + } + + test("SPARK-25040: allowing empty strings when legacy config is enabled") { + def emptyStringAsNull(dataType: DataType): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", 
"FAILFAST").json(Seq("""{"a":""}""").toDS) + checkAnswer(df, Row(null) :: Nil) + } // Legacy mode prior to Spark 3.0.0 withSQLConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON.key -> "true") {