From df12b0187b4f96e2388034407962de4472edd207 Mon Sep 17 00:00:00 2001 From: Nate Clark Date: Thu, 10 Jun 2021 10:47:40 -0400 Subject: [PATCH] ARROW-13028: [C++][CSV] Add option to attempt 32bit value inference Add a new convert option to attempt parsing integers as 32bit before using 64bit. --- cpp/src/arrow/csv/column_builder_test.cc | 18 ++++++++++++++++++ cpp/src/arrow/csv/converter.cc | 2 +- cpp/src/arrow/csv/inference_internal.h | 6 ++++++ cpp/src/arrow/csv/options.h | 7 +++++++ python/pyarrow/_csv.pyx | 23 +++++++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_csv.py | 19 +++++++++++++++++++ 7 files changed, 75 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/csv/column_builder_test.cc b/cpp/src/arrow/csv/column_builder_test.cc index 7577c883e8c..f03107c9b30 100644 --- a/cpp/src/arrow/csv/column_builder_test.cc +++ b/cpp/src/arrow/csv/column_builder_test.cc @@ -604,5 +604,23 @@ TEST_F(InferringColumnBuilderTest, SingleChunkBinaryAutoDict) { {expected_dictionary}); } +TEST_F(InferringColumnBuilderTest, Infer32BitWithInt32) { + auto options = ConvertOptions::Defaults(); + options.infer_32bit_values = true; + auto tg = TaskGroup::MakeSerial(); + + CheckInferred(tg, {{"1", "-80456", "3465"}}, options, + {ArrayFromJSON(int32(), R"([1, -80456, 3465])")}); +} + +TEST_F(InferringColumnBuilderTest, Infer32BitWithInt64) { + auto options = ConvertOptions::Defaults(); + options.infer_32bit_values = true; + auto tg = TaskGroup::MakeSerial(); + + CheckInferred(tg, {{"1", "-80456", "346525346946"}}, options, + {ArrayFromJSON(int64(), R"([1, -80456, 346525346946])")}); +} + } // namespace csv } // namespace arrow diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 66d05458097..9abf9825928 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -493,7 +493,7 @@ class PrimitiveConverter : public ConcreteConverter { builder.UnsafeAppend(value); return Status::OK(); }; - 
RETURN_NOT_OK(parser.VisitColumn(col_index, visit)); + RETURN_NOT_OK(parser.VisitColumn(col_index, std::move(visit))); std::shared_ptr<Array> res; RETURN_NOT_OK(builder.Finish(&res)); diff --git a/cpp/src/arrow/csv/inference_internal.h b/cpp/src/arrow/csv/inference_internal.h index 1fd6d41b5cc..011294d434f 100644 --- a/cpp/src/arrow/csv/inference_internal.h +++ b/cpp/src/arrow/csv/inference_internal.h @@ -28,6 +28,7 @@ namespace csv { enum class InferKind { Null, + Integer32, Integer, Boolean, Real, @@ -55,6 +56,9 @@ class InferStatus { switch (kind_) { case InferKind::Null: + return SetKind(options_.infer_32bit_values ? InferKind::Integer32 + : InferKind::Integer); + case InferKind::Integer32: return SetKind(InferKind::Integer); case InferKind::Integer: return SetKind(InferKind::Boolean); @@ -111,6 +115,8 @@ class InferStatus { switch (kind_) { case InferKind::Null: return make_converter(null()); + case InferKind::Integer32: + return make_converter(int32()); case InferKind::Integer: return make_converter(int64()); case InferKind::Boolean: diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 38514bcb930..e274b3d4316 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -127,6 +127,13 @@ struct ARROW_EXPORT ConvertOptions { /// built-in ISO-8601 parser. std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers; + /// If false, only 64 bit integer values will be used when types are inferred. + /// If true, columns will first be parsed as 32 bit integer values and only be 64 bit + /// if they cannot be represented as 32 bit. When using the streaming reader with this + /// option, parse errors can occur if a column was inferred as 32 bit but a subsequent + /// block has a 64 bit value.
+ bool infer_32bit_values = false; + /// Create conversion options with default values, including conventional /// values for `null_values`, `true_values` and `false_values` static ConvertOptions Defaults(); diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 950e2d5464c..baad4878246 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -476,6 +476,13 @@ cdef class ConvertOptions(_Weakrefable): produce a column of nulls (whose type is selected using `column_types`, or null by default). This option is ignored if `include_columns` is empty. + infer_32bit_values: bool, optional (default False) + If false, only 64 bit integer values will be used when types are + inferred. + If true, columns will first be parsed as 32 bit integer values and + only be 64 bit if they cannot be represented as 32 bit. When using the + streaming reader with this option, parse errors can occur if a column + was inferred as 32 bit but a subsequent block has a 64 bit value. """ # Avoid mistakingly creating attributes __slots__ = () @@ -717,6 +724,22 @@ cdef class ConvertOptions(_Weakrefable): deref(self.options).timestamp_parsers = move(c_parsers) + @property + def infer_32bit_values(self): + """ + If false, only 64 bit integer values will be used when types are + inferred. + If true, columns will first be parsed as 32 bit integer values and + only be 64 bit if they cannot be represented as 32 bit. 
When using the + streaming reader with this option, parse errors can occur if a column + was inferred as 32 bit but a subsequent block has a 64 bit value. + """ + return deref(self.options).infer_32bit_values + + @infer_32bit_values.setter + def infer_32bit_values(self, value): + deref(self.options).infer_32bit_values = value + @staticmethod cdef ConvertOptions wrap(CCSVConvertOptions options): out = ConvertOptions() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9e023ec08e0..0e177e22dbc 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1616,6 +1616,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: c_bool strings_can_be_null c_bool quoted_strings_can_be_null vector[shared_ptr[CTimestampParser]] timestamp_parsers + c_bool infer_32bit_values c_bool auto_dict_encode int32_t auto_dict_max_cardinality diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 8fd62bb3bc6..332df436d0e 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -599,6 +599,25 @@ def format_msg(msg_format, row, *args): read_options=read_options, convert_options=convert_options) + def test_infer_32bit(self): + convert_options = ConvertOptions() + + csv = b"a\r\n12" + + table = self.read_bytes(csv, convert_options=convert_options) + assert pa.schema([('a', pa.int64())]) == table.schema + assert table.to_pydict() == {'a': [12]} + + convert_options.infer_32bit_values = True + table = self.read_bytes(csv, convert_options=convert_options) + assert pa.schema([('a', pa.int32())]) == table.schema + assert table.to_pydict() == {'a': [12]} + + csv = b"a\r\n127897896615" + table = self.read_bytes(csv, convert_options=convert_options) + assert pa.schema([('a', pa.int64())]) == table.schema + assert table.to_pydict() == {'a': [127897896615]} + class BaseCSVTableRead(BaseTestCSV):