4 changes: 2 additions & 2 deletions cpp/src/arrow/array/builder_binary.h
@@ -292,8 +292,8 @@ class ARROW_EXPORT ChunkedBinaryBuilder {
 protected:
  Status NextChunk();

-  int32_t max_chunk_size_;
-  int32_t chunk_data_size_;
+  int64_t max_chunk_size_;
+  int64_t chunk_data_size_;

  std::unique_ptr<BinaryBuilder> builder_;
  std::vector<std::shared_ptr<Array>> chunks_;
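Note: widening these counters to int64_t lets ChunkedBinaryBuilder track accumulated value bytes past 2^31 without overflow, while each finished chunk remains an ordinary BinaryArray with int32 offsets. A small illustrative sketch, not part of the patch, of how an oversized binary column surfaces on the pyarrow side as a ChunkedArray:

    import pyarrow as pa

    # One logical column stored as several contiguous Arrays. Each BinaryArray
    # keeps int32 value offsets, so a single chunk can hold at most ~2GB of
    # value bytes; anything larger has to be split across chunks like this.
    chunked = pa.chunked_array([pa.array([b'a', b'b']), pa.array([b'c'])])
    assert chunked.num_chunks == 2
    assert len(chunked) == 3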
4 changes: 2 additions & 2 deletions cpp/src/parquet/arrow/record_reader.cc
@@ -629,8 +629,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType> {
 public:
  ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
      : TypedRecordReader<ByteArrayType>(descr, pool), builder_(nullptr) {
-    // Maximum of 16MB chunks
-    constexpr int32_t kBinaryChunksize = 1 << 24;
+    // ARROW-4688(wesm): Using 2^31 - 1 byte chunks for now
+    constexpr int32_t kBinaryChunksize = 2147483647;
    DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
    if (descr_->logical_type() == LogicalType::UTF8) {
      builder_.reset(
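With this change the reader lets each output chunk grow to just under 2^31 - 1 bytes of value data before rolling over, instead of cutting at 16MB. A hypothetical Python sketch of that rollover rule; the real logic lives in the C++ ChunkedBinaryBuilder, and the names here are illustrative only:

    # Hypothetical sketch: keep appending to the current chunk until one more
    # value would push the accumulated value bytes past the limit, then start
    # a new chunk.
    MAX_CHUNK_BYTES = 2**31 - 1

    def chunk_values(values, max_chunk_bytes=MAX_CHUNK_BYTES):
        chunks, current, size = [], [], 0
        for value in values:
            if current and size + len(value) > max_chunk_bytes:
                chunks.append(current)
                current, size = [], 0
            current.append(value)
            size += len(value)
        if current:
            chunks.append(current)
        return chunks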
37 changes: 29 additions & 8 deletions python/pyarrow/tests/test_parquet.py
@@ -2091,6 +2091,13 @@ def test_large_table_int32_overflow():
     _write_table(table, f)
 
 
+def _simple_table_roundtrip(table):
+    stream = pa.BufferOutputStream()
+    _write_table(table, stream)
+    buf = stream.getvalue()
+    return _read_table(buf)
+
+
 @pytest.mark.pandas
 @pytest.mark.large_memory
 def test_binary_array_overflow_to_chunked():
@@ -2103,22 +2110,36 @@ def test_binary_array_overflow_to_chunked():
     df = pd.DataFrame({'byte_col': values})
 
     tbl = pa.Table.from_pandas(df, preserve_index=False)
-
-    buf = io.BytesIO()
-    _write_table(tbl, buf)
-    buf.seek(0)
-    read_tbl = _read_table(buf)
-    buf = None
+    read_tbl = _simple_table_roundtrip(tbl)
 
     col0_data = read_tbl[0].data
     assert isinstance(col0_data, pa.ChunkedArray)
 
-    # Split up into 16MB chunks. 128 * 16 = 2048, so 129
-    assert col0_data.num_chunks == 129
+    # Split up into 2GB chunks
+    assert col0_data.num_chunks == 2
 
     assert tbl.equals(read_tbl)
 
 
+@pytest.mark.pandas
+@pytest.mark.large_memory
+def test_list_of_binary_large_cell():
+    # ARROW-4688
+    data = []
+
+    # TODO(wesm): handle chunked children
+    # 2^31 - 1 bytes in a single cell
+    # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
+
+    # A little under 2GB in total, in 214 cells of approximately 10MB each
+    data.extend([[b'x' * 1000000] * 10] * 214)
+
+    arr = pa.array(data)
+    table = pa.Table.from_arrays([arr], ['chunky_cells'])
+    read_table = _simple_table_roundtrip(table)
+    assert table.equals(read_table)
 
 
 @pytest.mark.pandas
 def test_index_column_name_duplicate(tempdir):
     data = {
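For context, _write_table and _read_table are thin helpers defined in the test module around pyarrow.parquet.write_table and read_table. A minimal sketch of the same in-memory roundtrip using the public API, assuming only that write_table accepts an output stream and read_table a buffer reader:

    import pyarrow as pa
    import pyarrow.parquet as pq

    def roundtrip(table):
        # Write the table to an in-memory buffer, then read it back.
        stream = pa.BufferOutputStream()
        pq.write_table(table, stream)
        return pq.read_table(pa.BufferReader(stream.getvalue()))

    table = pa.Table.from_arrays([pa.array([b'x', b'y'])], ['col'])
    assert roundtrip(table).equals(table)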