diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ef4e7fde98d..aa9f3ce4234 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1236,7 +1236,7 @@ Status ListBuilder::Append(const int32_t* offsets, int64_t length, Status ListBuilder::AppendNextOffset() { int64_t num_values = value_builder_->length(); - if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits::max())) { + if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { std::stringstream ss; ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," << " have " << num_values; @@ -1252,14 +1252,14 @@ Status ListBuilder::Append(bool is_valid) { } Status ListBuilder::Init(int64_t elements) { - DCHECK_LT(elements, std::numeric_limits::max()); + DCHECK_LE(elements, kListMaximumElements); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status ListBuilder::Resize(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits::max()); + DCHECK_LE(capacity, kListMaximumElements); // one more then requested for offsets RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); @@ -1303,14 +1303,14 @@ BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} Status BinaryBuilder::Init(int64_t elements) { - DCHECK_LT(elements, std::numeric_limits::max()); + DCHECK_LE(elements, kListMaximumElements); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status BinaryBuilder::Resize(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits::max()); + DCHECK_LE(capacity, kListMaximumElements); // one more then requested for offsets RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); @@ -1318,7 +1318,7 @@ Status BinaryBuilder::Resize(int64_t capacity) { Status BinaryBuilder::ReserveData(int64_t elements) { if (value_data_length() + elements > value_data_capacity()) { - if (value_data_length() + elements > std::numeric_limits::max()) { + if (value_data_length() + elements > kBinaryMemoryLimit) { return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary"); } RETURN_NOT_OK(value_data_builder_.Reserve(elements)); @@ -1328,9 +1328,9 @@ Status BinaryBuilder::ReserveData(int64_t elements) { Status BinaryBuilder::AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) { + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " bytes, have " + ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " << num_bytes; return Status::Invalid(ss.str()); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index dabfb750624..cdcee80bedb 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -41,13 +41,16 @@ namespace arrow { class Array; class Decimal128; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + namespace internal { struct ArrayData; } // namespace internal -static constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kMinBuilderCapacity = 1 << 5; /// Base class for all data array builders. // @@ -702,8 +705,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - static constexpr int64_t kMaximumCapacity = std::numeric_limits::max() - 1; - Status AppendNextOffset(); void Reset(); }; diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 4d91e531712..71bf69fc1ed 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -60,8 +60,6 @@ namespace py { using internal::NumPyTypeSize; -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); - // ---------------------------------------------------------------------- // Conversion utilities diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index dff4e1068fc..d448de08be4 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1048,9 +1048,12 @@ def test_bytes_to_binary(self): @pytest.mark.large_memory def test_bytes_exceed_2gb(self): - val = 'x' * (1 << 20) + v1 = b'x' * 100000000 + v2 = b'x' * 147483646 + + # ARROW-2227, hit exactly 2GB on the nose df = pd.DataFrame({ - 'strings': np.array([val] * 4000, dtype=object) + 'strings': [v1] * 20 + [v2] + ['x'] * 20 }) arr = pa.array(df['strings']) assert isinstance(arr, pa.ChunkedArray)