From 28d9ac3582f24df8dd9ebfb0199d0df0ebc0bd2f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 16 Dec 2016 13:53:32 -0800 Subject: [PATCH 1/4] [SPARK-18877][SQL] `inferField` on DecimalType should find a common type with `typeSoFar` --- .../sql/execution/datasources/csv/CSVInferSchema.scala | 4 +++- .../execution/datasources/csv/CSVInferSchemaSuite.scala | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index 88c608add140f..29437c760a9dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -85,7 +85,9 @@ private[csv] object CSVInferSchema { case NullType => tryParseInteger(field, options) case IntegerType => tryParseInteger(field, options) case LongType => tryParseLong(field, options) - case _: DecimalType => tryParseDecimal(field, options) + case _: DecimalType => + // DecimalTypes have different precisions and scales, so we try to find the common type. + findTightestCommonType(typeSoFar, tryParseDecimal(field, options)).getOrElse(NullType) case DoubleType => tryParseDouble(field, options) case TimestampType => tryParseTimestamp(field, options) case BooleanType => tryParseBoolean(field, options) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala index 93f752d107ca3..fcdaaf566233e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala @@ -114,4 +114,11 @@ class CSVInferSchemaSuite extends SparkFunSuite { val options = new CSVOptions(Map("TiMeStampFormat" -> "yyyy-mm")) assert(CSVInferSchema.inferField(TimestampType, "2015-08", options) == TimestampType) } + + test("SPARK-18877: `inferField` on DecimalType should find a common type with `typeSoFar`") { + val options = new CSVOptions(Map.empty[String, String]) + // 9.03E+12 is Decimal(3, -10) and 1.19E+11 is Decimal(3, -9). + assert(CSVInferSchema.inferField(DecimalType(3, -10), "1.19E+11", options) == + DecimalType(4, -9)) + } } From c1e07a9dc3970c76390031f293e485ff28a12136 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 28 Dec 2016 13:59:53 -0800 Subject: [PATCH 2/4] Add another testcase having precision 40 (over 38) and scale 20. --- .../sql/execution/datasources/csv/CSVInferSchemaSuite.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala index fcdaaf566233e..9f84a31a592ab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala @@ -117,8 +117,13 @@ class CSVInferSchemaSuite extends SparkFunSuite { test("SPARK-18877: `inferField` on DecimalType should find a common type with `typeSoFar`") { val options = new CSVOptions(Map.empty[String, String]) + // 9.03E+12 is Decimal(3, -10) and 1.19E+11 is Decimal(3, -9). assert(CSVInferSchema.inferField(DecimalType(3, -10), "1.19E+11", options) == DecimalType(4, -9)) + + // BigDecimal("12345678901234567890.01234567890123456789") is precision 40 and scale 20. + val value = "12345678901234567890.01234567890123456789" + assert(CSVInferSchema.inferField(DecimalType(3, -10), value, options) == DoubleType) } } From 393d3a9ceaa6d92a301b5a2917e28d29518c1638 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 2 Jan 2017 20:12:24 -0800 Subject: [PATCH 3/4] Use `StringType`. --- .../spark/sql/execution/datasources/csv/CSVInferSchema.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index 29437c760a9dc..adc92fe5a31e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -87,7 +87,7 @@ private[csv] object CSVInferSchema { case LongType => tryParseLong(field, options) case _: DecimalType => // DecimalTypes have different precisions and scales, so we try to find the common type. - findTightestCommonType(typeSoFar, tryParseDecimal(field, options)).getOrElse(NullType) + findTightestCommonType(typeSoFar, tryParseDecimal(field, options)).getOrElse(StringType) case DoubleType => tryParseDouble(field, options) case TimestampType => tryParseTimestamp(field, options) case BooleanType => tryParseBoolean(field, options) From e59631bd54872a03eaa63cc74d0e245300bbc781 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 2 Jan 2017 22:04:17 -0800 Subject: [PATCH 4/4] Add the corresponding testcase --- .../sql/execution/datasources/csv/CSVInferSchemaSuite.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala index 9f84a31a592ab..8620bb9f65b97 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchemaSuite.scala @@ -125,5 +125,10 @@ class CSVInferSchemaSuite extends SparkFunSuite { // BigDecimal("12345678901234567890.01234567890123456789") is precision 40 and scale 20. val value = "12345678901234567890.01234567890123456789" assert(CSVInferSchema.inferField(DecimalType(3, -10), value, options) == DoubleType) + + // Seq(s"${Long.MaxValue}1", "2015-12-01 00:00:00") should be StringType + assert(CSVInferSchema.inferField(NullType, s"${Long.MaxValue}1", options) == DecimalType(20, 0)) + assert(CSVInferSchema.inferField(DecimalType(20, 0), "2015-12-01 00:00:00", options) + == StringType) } }