18 changes: 18 additions & 0 deletions cpp/src/arrow/csv/column_builder_test.cc
@@ -604,5 +604,23 @@ TEST_F(InferringColumnBuilderTest, SingleChunkBinaryAutoDict) {
{expected_dictionary});
}

TEST_F(InferringColumnBuilderTest, Infer32BitWithInt32) {
auto options = ConvertOptions::Defaults();
options.infer_32bit_values = true;
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"1", "-80456", "3465"}}, options,
{ArrayFromJSON(int32(), R"([1, -80456, 3465])")});
}

TEST_F(InferringColumnBuilderTest, Infer32BitWithInt64) {
auto options = ConvertOptions::Defaults();
options.infer_32bit_values = true;
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"1", "-80456", "346525346946"}}, options,
{ArrayFromJSON(int64(), R"([1, -80456, 346525346946])")});
}

} // namespace csv
} // namespace arrow
2 changes: 1 addition & 1 deletion cpp/src/arrow/csv/converter.cc
@@ -493,7 +493,7 @@ class PrimitiveConverter : public ConcreteConverter {
builder.UnsafeAppend(value);
return Status::OK();
};
RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
RETURN_NOT_OK(parser.VisitColumn(col_index, std::move(visit)));

std::shared_ptr<Array> res;
RETURN_NOT_OK(builder.Finish(&res));
6 changes: 6 additions & 0 deletions cpp/src/arrow/csv/inference_internal.h
@@ -28,6 +28,7 @@ namespace csv {

enum class InferKind {
Null,
Integer32,
Integer,
Boolean,
Real,
@@ -55,6 +56,9 @@ class InferStatus {

switch (kind_) {
case InferKind::Null:
return SetKind(options_.infer_32bit_values ? InferKind::Integer32
: InferKind::Integer);
case InferKind::Integer32:
return SetKind(InferKind::Integer);
case InferKind::Integer:
return SetKind(InferKind::Boolean);
@@ -111,6 +115,8 @@ class InferStatus {
switch (kind_) {
case InferKind::Null:
return make_converter(null());
case InferKind::Integer32:
return make_converter(int32());
case InferKind::Integer:
return make_converter(int64());
case InferKind::Boolean:
7 changes: 7 additions & 0 deletions cpp/src/arrow/csv/options.h
@@ -127,6 +127,13 @@ struct ARROW_EXPORT ConvertOptions {
/// built-in ISO-8601 parser.
std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;

/// Whether to infer 32-bit integer columns during type inference.
/// If false (the default), integer columns are always inferred as 64-bit.
/// If true, a column is first inferred as 32-bit integers and widened to
/// 64-bit only if a value cannot be represented in 32 bits. When using the
/// streaming reader, enabling this option can cause parse errors if a column
/// was inferred as 32-bit from an earlier block and a later block contains
/// a value that only fits in 64 bits.
bool infer_32bit_values = false;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
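For context, a minimal sketch of the behaviour described by the comment above, exercised through the Python binding added later in this diff (it assumes a pyarrow build that includes this change; the column names and values are illustrative):

import io

import pyarrow as pa
from pyarrow import csv

# Column "a" holds only values that fit in 32 bits; column "b" holds
# 346525346946, which overflows int32.
data = b"a,b\n1,346525346946\n-80456,2\n"

opts = csv.ConvertOptions()
opts.infer_32bit_values = True  # the option added by this change

table = csv.read_csv(io.BytesIO(data), convert_options=opts)
print(table.schema)
assert table.schema.field("a").type == pa.int32()  # stayed 32-bit
assert table.schema.field("b").type == pa.int64()  # widened to 64-bit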
23 changes: 23 additions & 0 deletions python/pyarrow/_csv.pyx
@@ -476,6 +476,13 @@ cdef class ConvertOptions(_Weakrefable):
produce a column of nulls (whose type is selected using
`column_types`, or null by default).
This option is ignored if `include_columns` is empty.
infer_32bit_values: bool, optional (default False)
Whether to infer 32-bit integer columns during type inference.
If false, integer columns are always inferred as 64-bit.
If true, a column is first inferred as 32-bit integers and widened to
64-bit only if a value cannot be represented in 32 bits. When using the
streaming reader, enabling this option can cause parse errors if a
column was inferred as 32-bit from an earlier block and a later block
contains a value that only fits in 64 bits.
"""
# Avoid mistakenly creating attributes
__slots__ = ()
@@ -717,6 +724,22 @@ cdef class ConvertOptions(_Weakrefable):

deref(self.options).timestamp_parsers = move(c_parsers)

@property
def infer_32bit_values(self):
"""
Whether to infer 32-bit integer columns during type inference.
If false, integer columns are always inferred as 64-bit.
If true, a column is first inferred as 32-bit integers and widened to
64-bit only if a value cannot be represented in 32 bits. When using the
streaming reader, enabling this option can cause parse errors if a
column was inferred as 32-bit from an earlier block and a later block
contains a value that only fits in 64 bits.
"""
return deref(self.options).infer_32bit_values

@infer_32bit_values.setter
def infer_32bit_values(self, value):
deref(self.options).infer_32bit_values = value

@staticmethod
cdef ConvertOptions wrap(CCSVConvertOptions options):
out = ConvertOptions()
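To make the streaming-reader caveat from the docstring concrete, a hedged sketch (the small block size, the sample data, and the exact exception type are illustrative assumptions, not part of this change):

import io

import pyarrow as pa
from pyarrow import csv

# Many small rows followed by one value that only fits in 64 bits.
data = b"a\n" + b"1\n" * 4096 + b"346525346946\n"

opts = csv.ConvertOptions()
opts.infer_32bit_values = True
read_opts = csv.ReadOptions(block_size=1024)  # force several blocks

reader = csv.open_csv(io.BytesIO(data), read_options=read_opts,
                      convert_options=opts)
try:
    for batch in reader:
        pass  # "a" was inferred as int32 from the first block
except pa.ArrowInvalid as exc:
    # A later block contains a value that no longer fits the inferred
    # int32 column, which is the parse error the docstring warns about.
    print("conversion failed as warned:", exc)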
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -1616,6 +1616,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
c_bool strings_can_be_null
c_bool quoted_strings_can_be_null
vector[shared_ptr[CTimestampParser]] timestamp_parsers
c_bool infer_32bit_values

c_bool auto_dict_encode
int32_t auto_dict_max_cardinality
19 changes: 19 additions & 0 deletions python/pyarrow/tests/test_csv.py
@@ -599,6 +599,25 @@ def format_msg(msg_format, row, *args):
read_options=read_options,
convert_options=convert_options)

def test_infer_32bit(self):
convert_options = ConvertOptions()

csv = b"a\r\n12"

table = self.read_bytes(csv, convert_options=convert_options)
assert pa.schema([('a', pa.int64())]) == table.schema
assert table.to_pydict() == {'a': [12]}

convert_options.infer_32bit_values = True
table = self.read_bytes(csv, convert_options=convert_options)
assert pa.schema([('a', pa.int32())]) == table.schema
assert table.to_pydict() == {'a': [12]}

csv = b"a\r\n127897896615"
table = self.read_bytes(csv, convert_options=convert_options)
assert pa.schema([('a', pa.int64())]) == table.schema
assert table.to_pydict() == {'a': [127897896615]}


class BaseCSVTableRead(BaseTestCSV):
