18 changes: 18 additions & 0 deletions cpp/src/arrow/csv/column_builder_test.cc
@@ -604,5 +604,23 @@ TEST_F(InferringColumnBuilderTest, SingleChunkBinaryAutoDict) {
{expected_dictionary});
}

TEST_F(InferringColumnBuilderTest, Infer32BitWithInt32) {
auto options = ConvertOptions::Defaults();
options.infer_32bit_values = true;
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"1", "-80456", "3465"}}, options,
{ArrayFromJSON(int32(), R"([1, -80456, 3465])")});
}

TEST_F(InferringColumnBuilderTest, Infer32BitWithInt64) {
auto options = ConvertOptions::Defaults();
options.infer_32bit_values = true;
auto tg = TaskGroup::MakeSerial();

CheckInferred(tg, {{"1", "-80456", "346525346946"}}, options,
{ArrayFromJSON(int64(), R"([1, -80456, 346525346946])")});
}

} // namespace csv
} // namespace arrow
2 changes: 1 addition & 1 deletion cpp/src/arrow/csv/converter.cc
@@ -493,7 +493,7 @@ class PrimitiveConverter : public ConcreteConverter {
builder.UnsafeAppend(value);
return Status::OK();
};
RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
RETURN_NOT_OK(parser.VisitColumn(col_index, std::move(visit)));

std::shared_ptr<Array> res;
RETURN_NOT_OK(builder.Finish(&res));
6 changes: 6 additions & 0 deletions cpp/src/arrow/csv/inference_internal.h
@@ -28,6 +28,7 @@ namespace csv {

enum class InferKind {
Null,
Integer32,
Integer,
Boolean,
Real,
@@ -55,6 +56,9 @@ class InferStatus {

switch (kind_) {
case InferKind::Null:
return SetKind(options_.infer_32bit_values ? InferKind::Integer32
: InferKind::Integer);
case InferKind::Integer32:
return SetKind(InferKind::Integer);
case InferKind::Integer:
return SetKind(InferKind::Boolean);
@@ -111,6 +115,8 @@ class InferStatus {
switch (kind_) {
case InferKind::Null:
return make_converter(null());
case InferKind::Integer32:
return make_converter(int32());
case InferKind::Integer:
return make_converter(int64());
case InferKind::Boolean:
7 changes: 7 additions & 0 deletions cpp/src/arrow/csv/options.h
@@ -127,6 +127,13 @@ struct ARROW_EXPORT ConvertOptions {
/// built-in ISO-8601 parser.
std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;

/// Whether to infer 32-bit integer columns during type inference.
/// If false (the default), integer columns are always inferred as 64-bit.
/// If true, a column is first inferred as 32-bit integers and widened to
/// 64-bit only if a value cannot be represented in 32 bits. When using the
/// streaming reader, enabling this option can cause parse errors if a column
/// was inferred as 32-bit from an earlier block and a later block contains
/// a value that only fits in 64 bits.
bool infer_32bit_values = false;

/// Create conversion options with default values, including conventional
/// values for `null_values`, `true_values` and `false_values`
static ConvertOptions Defaults();
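For context, a minimal sketch of the behaviour described by the comment above, exercised through the Python binding added later in this diff (it assumes a pyarrow build that includes this change; the column names and values are illustrative):

import io

import pyarrow as pa
from pyarrow import csv

# Column "a" holds only values that fit in 32 bits; column "b" holds
# 346525346946, which overflows int32.
data = b"a,b\n1,346525346946\n-80456,2\n"

opts = csv.ConvertOptions()
opts.infer_32bit_values = True  # the option added by this change

table = csv.read_csv(io.BytesIO(data), convert_options=opts)
print(table.schema)
assert table.schema.field("a").type == pa.int32()  # stayed 32-bit
assert table.schema.field("b").type == pa.int64()  # widened to 64-bit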
23 changes: 23 additions & 0 deletions python/pyarrow/_csv.pyx
@@ -476,6 +476,13 @@ cdef class ConvertOptions(_Weakrefable):
produce a column of nulls (whose type is selected using
`column_types`, or null by default).
This option is ignored if `include_columns` is empty.
infer_32bit_values: bool, optional (default False)
Whether to infer 32-bit integer columns during type inference.
If false, integer columns are always inferred as 64-bit.
If true, a column is first inferred as 32-bit integers and widened to
64-bit only if a value cannot be represented in 32 bits. When using the
streaming reader, enabling this option can cause parse errors if a
column was inferred as 32-bit from an earlier block and a later block
contains a value that only fits in 64 bits.
"""
# Avoid mistakenly creating attributes
__slots__ = ()
@@ -717,6 +724,22 @@ cdef class ConvertOptions(_Weakrefable):

deref(self.options).timestamp_parsers = move(c_parsers)

@property
def infer_32bit_values(self):
"""
Whether to infer 32-bit integer columns during type inference.
If false, integer columns are always inferred as 64-bit.
If true, a column is first inferred as 32-bit integers and widened to
64-bit only if a value cannot be represented in 32 bits. When using the
streaming reader, enabling this option can cause parse errors if a
column was inferred as 32-bit from an earlier block and a later block
contains a value that only fits in 64 bits.
"""
return deref(self.options).infer_32bit_values

@infer_32bit_values.setter
def infer_32bit_values(self, value):
deref(self.options).infer_32bit_values = value

@staticmethod
cdef ConvertOptions wrap(CCSVConvertOptions options):
out = ConvertOptions()
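To make the streaming-reader caveat from the docstring concrete, a hedged sketch (the small block size, the sample data, and the exact exception type are illustrative assumptions, not part of this change):

import io

import pyarrow as pa
from pyarrow import csv

# Many small rows followed by one value that only fits in 64 bits.
data = b"a\n" + b"1\n" * 4096 + b"346525346946\n"

opts = csv.ConvertOptions()
opts.infer_32bit_values = True
read_opts = csv.ReadOptions(block_size=1024)  # force several blocks

reader = csv.open_csv(io.BytesIO(data), read_options=read_opts,
                      convert_options=opts)
try:
    for batch in reader:
        pass  # "a" was inferred as int32 from the first block
except pa.ArrowInvalid as exc:
    # A later block contains a value that no longer fits the inferred
    # int32 column, which is the parse error the docstring warns about.
    print("conversion failed as warned:", exc)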
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -1616,6 +1616,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
c_bool strings_can_be_null
c_bool quoted_strings_can_be_null
vector[shared_ptr[CTimestampParser]] timestamp_parsers
c_bool infer_32bit_values

c_bool auto_dict_encode
int32_t auto_dict_max_cardinality
19 changes: 19 additions & 0 deletions python/pyarrow/tests/test_csv.py
@@ -599,6 +599,25 @@ def format_msg(msg_format, row, *args):
read_options=read_options,
convert_options=convert_options)

def test_infer_32bit(self):
convert_options = ConvertOptions()

csv = b"a\r\n12"

table = self.read_bytes(csv, convert_options=convert_options)
assert pa.schema([('a', pa.int64())]) == table.schema
assert table.to_pydict() == {'a': [12]}

convert_options.infer_32bit_values = True
table = self.read_bytes(csv, convert_options=convert_options)
assert pa.schema([('a', pa.int32())]) == table.schema
assert table.to_pydict() == {'a': [12]}

csv = b"a\r\n127897896615"
table = self.read_bytes(csv, convert_options=convert_options)
assert pa.schema([('a', pa.int64())]) == table.schema
assert table.to_pydict() == {'a': [127897896615]}


class BaseCSVTableRead(BaseTestCSV):
