-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-14143] Options for parsing NaNs, Infinity and nulls for numeric types #11947
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
93ac6bb
9594ee5
180a900
124873b
161a3eb
3316101
698b4b4
6facd26
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -90,6 +90,12 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str | |
|
|
||
| val nullValue = parameters.getOrElse("nullValue", "") | ||
|
|
||
| val nanValue = parameters.getOrElse("nanValue", "NaN") | ||
|
|
||
| val positiveInf = parameters.getOrElse("positiveInf", "Inf") | ||
| val negativeInf = parameters.getOrElse("negativeInf", "-Inf") | ||
|
|
||
|
|
||
| val compressionCodec: Option[String] = { | ||
| val name = parameters.get("compression").orElse(parameters.get("codec")) | ||
| name.map(CompressionCodecs.getCodecClassName) | ||
|
|
@@ -111,3 +117,12 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str | |
|
|
||
| val rowSeparator = "\n" | ||
| } | ||
|
|
||
| object CSVOptions { | ||
|
|
||
| def apply(): CSVOptions = new CSVOptions(Map.empty) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For me, I feel a bit hesitant about this. I'd just use … |
||
|
|
||
| def apply(paramName: String, paramValue: String): CSVOptions = { | ||
| new CSVOptions(Map(paramName -> paramValue)) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| int,long,float,double | ||
| 8,1000000,1.042,23848545.0374 | ||
| --,34232323,98.343,184721.23987223 | ||
| 34,--,98.343,184721.23987223 | ||
| 34,43323123,--,184721.23987223 | ||
| 34,43323123,223823.9484,-- | ||
| 34,43323123,223823.NAN,NAN | ||
| 34,43323123,223823.INF,INF | ||
| 34,43323123,223823.-INF,-INF |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,8 @@ import org.apache.spark.unsafe.types.UTF8String | |
|
|
||
| class CSVTypeCastSuite extends SparkFunSuite { | ||
|
|
||
| private def assertNull(v: Any) = assert(v == null) | ||
|
|
||
| test("Can parse decimal type values") { | ||
| val stringValues = Seq("10.05", "1,000.01", "158,058,049.001") | ||
| val decimalValues = Seq(10.05, 1000.01, 158058049.001) | ||
|
|
@@ -66,17 +68,21 @@ class CSVTypeCastSuite extends SparkFunSuite { | |
| } | ||
|
|
||
| test("Nullable types are handled") { | ||
| assert(CSVTypeCast.castTo("", IntegerType, nullable = true) == null) | ||
| assert(CSVTypeCast.castTo("", IntegerType, nullable = true, CSVOptions()) == null) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I just noticed that the third argument in … |
||
| } | ||
|
|
||
| test("String type should always return the same as the input") { | ||
| assert(CSVTypeCast.castTo("", StringType, nullable = true) == UTF8String.fromString("")) | ||
| assert(CSVTypeCast.castTo("", StringType, nullable = false) == UTF8String.fromString("")) | ||
| assert( | ||
| CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions()) == | ||
| UTF8String.fromString("")) | ||
| assert( | ||
| CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions()) == | ||
| UTF8String.fromString("")) | ||
| } | ||
|
|
||
| test("Throws exception for empty string with non null type") { | ||
| val exception = intercept[NumberFormatException]{ | ||
| CSVTypeCast.castTo("", IntegerType, nullable = false) | ||
| CSVTypeCast.castTo("", IntegerType, nullable = false, CSVOptions()) | ||
| } | ||
| assert(exception.getMessage.contains("For input string: \"\"")) | ||
| } | ||
|
|
@@ -90,12 +96,12 @@ class CSVTypeCastSuite extends SparkFunSuite { | |
| assert(CSVTypeCast.castTo("1.00", DoubleType) == 1.0) | ||
| assert(CSVTypeCast.castTo("true", BooleanType) == true) | ||
|
|
||
| val dateFormat = new SimpleDateFormat("dd/MM/yyyy hh:mm") | ||
| val options = CSVOptions("dateFormat", "dd/MM/yyyy hh:mm") | ||
| val customTimestamp = "31/01/2015 00:00" | ||
| val expectedTime = dateFormat.parse("31/01/2015 00:00").getTime | ||
| assert(CSVTypeCast.castTo(customTimestamp, TimestampType, dateFormat = dateFormat) | ||
| == expectedTime * 1000L) | ||
| assert(CSVTypeCast.castTo(customTimestamp, DateType, dateFormat = dateFormat) == | ||
| val expectedTime = options.dateFormat.parse("31/01/2015 00:00").getTime | ||
| assert(CSVTypeCast.castTo(customTimestamp, TimestampType, nullable = true, options) == | ||
| expectedTime * 1000L) | ||
| assert(CSVTypeCast.castTo(customTimestamp, DateType, nullable = true, options) == | ||
| DateTimeUtils.millisToDays(expectedTime)) | ||
|
|
||
| val timestamp = "2015-01-01 00:00:00" | ||
|
|
@@ -116,4 +122,63 @@ class CSVTypeCastSuite extends SparkFunSuite { | |
| Locale.setDefault(originalLocale) | ||
| } | ||
| } | ||
|
|
||
| test("Float NaN values are parsed correctly") { | ||
| val floatVal: Float = CSVTypeCast.castTo( | ||
| "nn", FloatType, nullable = true, CSVOptions("nanValue", "nn")).asInstanceOf[Float] | ||
|
|
||
| // Java implements the IEEE-754 floating point standard which guarantees that any comparison | ||
| // against NaN will return false (except != which returns true) | ||
| assert(floatVal != floatVal) | ||
| } | ||
|
|
||
| test("Double NaN values are parsed correctly") { | ||
| val doubleVal: Double = CSVTypeCast.castTo( | ||
| "-", DoubleType, nullable = true, CSVOptions("nanValue", "-")).asInstanceOf[Double] | ||
|
|
||
| assert(doubleVal.isNaN) | ||
| } | ||
|
|
||
| test("Float infinite values can be parsed") { | ||
| val floatVal1 = CSVTypeCast.castTo( | ||
| "max", FloatType, nullable = true, CSVOptions("negativeInf", "max")).asInstanceOf[Float] | ||
|
|
||
| assert(floatVal1 == Float.NegativeInfinity) | ||
|
|
||
| val floatVal2 = CSVTypeCast.castTo( | ||
| "max", FloatType, nullable = true, CSVOptions("positiveInf", "max")).asInstanceOf[Float] | ||
|
|
||
| assert(floatVal2 == Float.PositiveInfinity) | ||
| } | ||
|
|
||
| test("Double infinite values can be parsed") { | ||
| val doubleVal1 = CSVTypeCast.castTo( | ||
| "max", DoubleType, nullable = true, CSVOptions("negativeInf", "max") | ||
| ).asInstanceOf[Double] | ||
|
|
||
| assert(doubleVal1 == Double.NegativeInfinity) | ||
|
|
||
| val doubleVal2 = CSVTypeCast.castTo( | ||
| "max", DoubleType, nullable = true, CSVOptions("positiveInf", "max") | ||
| ).asInstanceOf[Double] | ||
|
|
||
| assert(doubleVal2 == Double.PositiveInfinity) | ||
| } | ||
|
|
||
| test("Type-specific null values are used for casting") { | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-"))) | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-"))) | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-"))) | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-"))) | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-"))) | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-"))) | ||
| assertNull( | ||
| CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-"))) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Also, it looks like the use of the `Try` API is discouraged; see scala-style-guide#exception.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think in this case, in a private and unexposed method, this seems OK. There are many other instances of it in `CSVInferSchema`.