diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 3bc930cc4a8..c3a459b39fc 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -292,8 +292,8 @@ class ARROW_EXPORT ChunkedBinaryBuilder {
  protected:
   Status NextChunk();
 
-  int32_t max_chunk_size_;
-  int32_t chunk_data_size_;
+  int64_t max_chunk_size_;
+  int64_t chunk_data_size_;
 
   std::unique_ptr<BinaryBuilder> builder_;
   std::vector<std::shared_ptr<Array>> chunks_;
diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc
index 42334b91439..57e50a03be1 100644
--- a/cpp/src/parquet/arrow/record_reader.cc
+++ b/cpp/src/parquet/arrow/record_reader.cc
@@ -629,8 +629,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType> {
  public:
  ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
      : TypedRecordReader(descr, pool), builder_(nullptr) {
-    // Maximum of 16MB chunks
-    constexpr int32_t kBinaryChunksize = 1 << 24;
+    // ARROW-4688(wesm): Using 2^31 - 1 byte chunks for now
+    constexpr int32_t kBinaryChunksize = 2147483647;
     DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
     if (descr_->logical_type() == LogicalType::UTF8) {
       builder_.reset(
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 39479e397c4..1abfc70941d 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2091,6 +2091,13 @@ def test_large_table_int32_overflow():
         _write_table(table, f)
 
 
+def _simple_table_roundtrip(table):
+    stream = pa.BufferOutputStream()
+    _write_table(table, stream)
+    buf = stream.getvalue()
+    return _read_table(buf)
+
+
 @pytest.mark.pandas
 @pytest.mark.large_memory
 def test_binary_array_overflow_to_chunked():
@@ -2103,22 +2110,36 @@ def test_binary_array_overflow_to_chunked():
 
     df = pd.DataFrame({'byte_col': values})
     tbl = pa.Table.from_pandas(df, preserve_index=False)
-
-    buf = io.BytesIO()
-    _write_table(tbl, buf)
-    buf.seek(0)
-    read_tbl = _read_table(buf)
-    buf = None
+    read_tbl = _simple_table_roundtrip(tbl)
 
     col0_data = read_tbl[0].data
     assert isinstance(col0_data, pa.ChunkedArray)
 
-    # Split up into 16MB chunks. 128 * 16 = 2048, so 129
-    assert col0_data.num_chunks == 129
+    # Split up into 2GB chunks
+    assert col0_data.num_chunks == 2
 
     assert tbl.equals(read_tbl)
 
 
+@pytest.mark.pandas
+@pytest.mark.large_memory
+def test_list_of_binary_large_cell():
+    # ARROW-4688
+    data = []
+
+    # TODO(wesm): handle chunked children
+    # 2^31 - 1 bytes in a single cell
+    # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
+
+    # A little under 2GB total, in cells each containing approximately 10MB
+    data.extend([[b'x' * 1000000] * 10] * 214)
+
+    arr = pa.array(data)
+    table = pa.Table.from_arrays([arr], ['chunky_cells'])
+    read_table = _simple_table_roundtrip(table)
+    assert table.equals(read_table)
+
+
 @pytest.mark.pandas
 def test_index_column_name_duplicate(tempdir):
     data = {
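
For reference, a minimal standalone sketch of the round trip the new tests exercise, assuming pyarrow and pyarrow.parquet are installed and several GB of RAM are available (the same constraint as the large_memory-marked tests). The helper name, column name, and data sizes below are illustrative and not part of the patch; the point is that a Parquet binary column whose data exceeds 2^31 - 1 bytes is read back as a pa.ChunkedArray rather than a single Array.

import pyarrow as pa
import pyarrow.parquet as pq


def roundtrip(table):
    # Write to an in-memory buffer and read it back, mirroring the
    # _simple_table_roundtrip helper added in the patch.
    stream = pa.BufferOutputStream()
    pq.write_table(table, stream)
    return pq.read_table(pa.BufferReader(stream.getvalue()))


# ~4GB of binary data: 4096 cells of 1MB each, assembled as a chunked column
# so that no single in-memory chunk exceeds the 2GB binary array limit.
chunk = pa.array([b'x' * (1 << 20)] * 1024, type=pa.binary())
table = pa.Table.from_arrays([pa.chunked_array([chunk] * 4)], ['byte_col'])

result = roundtrip(table)
col = result.column(0)
assert isinstance(col, pa.ChunkedArray)
# With kBinaryChunksize = 2^31 - 1, the reader must split this column into
# at least two chunks instead of overflowing a single 32-bit offset array.
assert col.num_chunks >= 2
assert result.equals(table)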