From 6877ac360990f9fb08b58c91cb07c1108b824673 Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Wed, 21 Oct 2020 19:43:42 +0300
Subject: [PATCH 1/6] Set configs to EXCEPTION by default

---
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 3648615a1eaee..65d976958ffdd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2666,7 +2666,7 @@ object SQLConf {
       .stringConf
       .transform(_.toUpperCase(Locale.ROOT))
       .checkValues(LegacyBehaviorPolicy.values.map(_.toString))
-      .createWithDefault(LegacyBehaviorPolicy.LEGACY.toString)
+      .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString)
 
   val LEGACY_PARQUET_REBASE_MODE_IN_READ =
     buildConf("spark.sql.legacy.parquet.datetimeRebaseModeInRead")
@@ -2696,7 +2696,7 @@ object SQLConf {
       .stringConf
       .transform(_.toUpperCase(Locale.ROOT))
       .checkValues(LegacyBehaviorPolicy.values.map(_.toString))
-      .createWithDefault(LegacyBehaviorPolicy.LEGACY.toString)
+      .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString)
 
   val LEGACY_AVRO_REBASE_MODE_IN_WRITE =
     buildConf("spark.sql.legacy.avro.datetimeRebaseModeInWrite")

From 6c4be00d619a4e8193ef0d5ea86606aec84ee5dc Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Wed, 21 Oct 2020 20:06:38 +0300
Subject: [PATCH 2/6] Update the SQL migration guide.

---
 docs/sql-migration-guide.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 5612e4f1453f1..124b04fb2bede 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -47,6 +47,8 @@ license: |
 
   - In Spark 3.1, `IllegalArgumentException` is returned for the incomplete interval literals, e.g. `INTERVAL '1'`, `INTERVAL '1 DAY 2'`, which are invalid. In Spark 3.0, these literals result in `NULL`s.
 
   - In Spark 3.1, we remove the built-in Hive 1.2. You need to migrate your custom SerDes to Hive 2.3. See [HIVE-15167](https://issues.apache.org/jira/browse/HIVE-15167) for more details.
+
+  - In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z and are loaded (saved) as the INT96 type. In Spark 3.0, the actions don't fail but might shift the input timestamps due to rebasing between the Julian and Proleptic Gregorian calendars. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.parquet.int96RebaseModeInRead` and/or `spark.sql.legacy.parquet.int96RebaseModeInWrite` to `LEGACY`.
 
 ## Upgrading from Spark SQL 3.0 to 3.0.1
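Note: below is a minimal sketch of the user-visible effect described in the migration guide entry above, assuming a Spark 3.1 build with patch 1 applied. The session setup, output path, and sample timestamp are illustrative and not taken from the patches.

```scala
// Sketch: the new EXCEPTION default for INT96 rebasing, and how to opt out.
// Assumes Spark 3.1; the path and the ancient timestamp are made up.
import java.sql.Timestamp
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("int96-rebase-demo")
  .getOrCreate()
import spark.implicits._

// Timestamps are stored as INT96 under the default
// spark.sql.parquet.outputTimestampType.
val df = Seq(Timestamp.valueOf("1001-01-01 01:02:03")).toDF("ts")

// With the EXCEPTION default from patch 1, saving a pre-1900 timestamp as
// INT96 fails (SparkUpgradeException) instead of silently rebasing it:
// df.write.parquet("/tmp/int96_demo")

// Restoring the pre-3.1 behavior, as the migration guide suggests:
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInWrite", "LEGACY")
df.write.parquet("/tmp/int96_demo")

spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInRead", "LEGACY")
spark.read.parquet("/tmp/int96_demo").show(false)
```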
From 5a6b92bb04b816bd4aa172dca83a226f4928964b Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Wed, 21 Oct 2020 21:46:42 +0300
Subject: [PATCH 3/6] Fix ParquetHadoopFsRelationSuite

---
 .../org/apache/spark/sql/sources/HadoopFsRelationTest.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
index cbea74103343e..b65a00457c72c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
@@ -155,6 +155,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
         withSQLConf(
           SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString,
           SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString,
+          SQLConf.LEGACY_PARQUET_INT96_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString,
           SQLConf.LEGACY_AVRO_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString) {
           val dataGenerator = RandomDataGenerator.forType(
             dataType = dataType,

From 4bfb96bad3bc0dfd767221c1178053bf59a971b0 Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Wed, 21 Oct 2020 22:04:55 +0300
Subject: [PATCH 4/6] Fix StatisticsSuite

---
 .../org/apache/spark/sql/hive/StatisticsSuite.scala | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 52dd2b34a0e95..db0e93787338e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -1513,26 +1513,27 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
 
       Seq(tbl, ext_tbl).foreach { tblName =>
         sql(s"INSERT INTO $tblName VALUES (1, 'a', '2019-12-13')")
 
+        val expectedSize = 636
         // analyze table
         sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS NOSCAN")
         var tableStats = getTableStats(tblName)
-        assert(tableStats.sizeInBytes == 601)
+        assert(tableStats.sizeInBytes == expectedSize)
         assert(tableStats.rowCount.isEmpty)
 
         sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS")
         tableStats = getTableStats(tblName)
-        assert(tableStats.sizeInBytes == 601)
+        assert(tableStats.sizeInBytes == expectedSize)
         assert(tableStats.rowCount.get == 1)
 
         // analyze a single partition
         sql(s"ANALYZE TABLE $tblName PARTITION (ds='2019-12-13') COMPUTE STATISTICS NOSCAN")
         var partStats = getPartitionStats(tblName, Map("ds" -> "2019-12-13"))
-        assert(partStats.sizeInBytes == 601)
+        assert(partStats.sizeInBytes == expectedSize)
         assert(partStats.rowCount.isEmpty)
 
         sql(s"ANALYZE TABLE $tblName PARTITION (ds='2019-12-13') COMPUTE STATISTICS")
         partStats = getPartitionStats(tblName, Map("ds" -> "2019-12-13"))
-        assert(partStats.sizeInBytes == 601)
+        assert(partStats.sizeInBytes == expectedSize)
         assert(partStats.rowCount.get == 1)
       }
     }
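Note: patches 3 and 4 adjust test suites that write timestamps through the Parquet datasource. Below is a hedged sketch of the `withSQLConf` pattern they rely on; the suite name, test name, and test body are hypothetical, and only the config keys come from the patches.

```scala
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy.CORRECTED
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical suite; SharedSparkSession mixes in SQLTestUtils, which
// provides withSQLConf, withTempPath, and the `spark` session.
class Int96RebaseDemoSuite extends QueryTest with SharedSparkSession {
  test("round-trip an ancient timestamp with rebase modes pinned") {
    // Pin both Parquet write modes to CORRECTED so the round-trip below does
    // not trip over the new EXCEPTION default introduced by patch 1.
    withSQLConf(
      SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString,
      SQLConf.LEGACY_PARQUET_INT96_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString) {
      withTempPath { path =>
        val df = spark.sql("SELECT timestamp'1001-01-01 01:02:03' AS ts")
        df.write.parquet(path.getAbsolutePath)
        // No read-side config is needed here: Spark 3.x marks the files it
        // writes, so the reader can pick the matching mode on its own.
        checkAnswer(spark.read.parquet(path.getAbsolutePath), df)
      }
    }
  }
}
```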
From f0c4ef190dab0aee1df72bfbbd26dd5e1868f0cd Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Wed, 21 Oct 2020 22:11:29 +0300
Subject: [PATCH 5/6] Fix ParquetIOSuite

---
 .../sql/execution/datasources/parquet/ParquetIOSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
index 214f36a2df713..dac4e950a7823 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
@@ -1022,7 +1022,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
       }
     }
     Seq(
-      "2_4_5" -> successInRead _,
+      "2_4_5" -> failInRead _,
       "2_4_6" -> successInRead _).foreach { case (version, checkDefaultRead) =>
       withAllParquetReaders {
         Seq("plain", "dict").foreach { enc =>

From 33bb5c24dbdaa74fe45f70f172e3943ec3c15aab Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Wed, 21 Oct 2020 22:59:50 +0300
Subject: [PATCH 6/6] Fix ParquetFilterSuite

---
 .../sql/execution/datasources/parquet/ParquetFilterSuite.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
index 763f9315bfc5b..24a1ba124e56b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
@@ -586,7 +586,8 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
     Seq(true, false).foreach { java8Api =>
       withSQLConf(
         SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString,
-        SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED") {
+        SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED",
+        SQLConf.LEGACY_PARQUET_INT96_REBASE_MODE_IN_WRITE.key -> "CORRECTED") {
         // spark.sql.parquet.outputTimestampType = TIMESTAMP_MILLIS
         val millisData = Seq(
           "1000-06-14 08:28:53.123",
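Note: patch 5 flips the expectation for files produced by Spark 2.4.5 from `successInRead` to `failInRead`. Below is a sketch of what that expectation means at the API level, assuming a Spark 3.1 session; the file name is hypothetical, while the real test uses checked-in resource files written by the named Spark versions.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// A file written by Spark 2.4.5 carries no writer-version metadata, so the
// reader cannot tell which calendar its INT96 values assume. Under the new
// EXCEPTION default the read is expected to fail, with SparkUpgradeException
// in the cause chain:
// spark.read.parquet("int96_before_1900_written_by_2_4_5.snappy.parquet").collect()

// Files from Spark 2.4.6+ do carry version metadata and are rebased
// automatically, hence "2_4_6" -> successInRead in the patched test.

// Opting in explicitly makes the 2.4.5 file readable again:
spark.conf.set("spark.sql.legacy.parquet.int96RebaseModeInRead", "LEGACY")
spark.read.parquet("int96_before_1900_written_by_2_4_5.snappy.parquet").show(false)
```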