4 changes: 2 additions & 2 deletions cpp/src/arrow/array/builder_binary.h
@@ -292,8 +292,8 @@ class ARROW_EXPORT ChunkedBinaryBuilder {
 protected:
  Status NextChunk();

-  int32_t max_chunk_size_;
-  int32_t chunk_data_size_;
+  int64_t max_chunk_size_;
+  int64_t chunk_data_size_;

  std::unique_ptr<BinaryBuilder> builder_;
  std::vector<std::shared_ptr<Array>> chunks_;
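Note: widening these counters to int64_t lets ChunkedBinaryBuilder track accumulated value bytes past 2^31 without overflow, while each finished chunk remains an ordinary BinaryArray with int32 offsets. A small illustrative sketch, not part of the patch, of how an oversized binary column surfaces on the pyarrow side as a ChunkedArray:

    import pyarrow as pa

    # One logical column stored as several contiguous Arrays. Each BinaryArray
    # keeps int32 value offsets, so a single chunk can hold at most ~2GB of
    # value bytes; anything larger has to be split across chunks like this.
    chunked = pa.chunked_array([pa.array([b'a', b'b']), pa.array([b'c'])])
    assert chunked.num_chunks == 2
    assert len(chunked) == 3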
4 changes: 2 additions & 2 deletions cpp/src/parquet/arrow/record_reader.cc
@@ -629,8 +629,8 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType> {
 public:
  ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
      : TypedRecordReader<ByteArrayType>(descr, pool), builder_(nullptr) {
-    // Maximum of 16MB chunks
-    constexpr int32_t kBinaryChunksize = 1 << 24;
+    // ARROW-4688(wesm): Using 2^31 - 1 byte chunks for now
+    constexpr int32_t kBinaryChunksize = 2147483647;
    DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
    if (descr_->logical_type() == LogicalType::UTF8) {
      builder_.reset(
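With this change the reader lets each output chunk grow to just under 2^31 - 1 bytes of value data before rolling over, instead of cutting at 16MB. A hypothetical Python sketch of that rollover rule; the real logic lives in the C++ ChunkedBinaryBuilder, and the names here are illustrative only:

    # Hypothetical sketch: keep appending to the current chunk until one more
    # value would push the accumulated value bytes past the limit, then start
    # a new chunk.
    MAX_CHUNK_BYTES = 2**31 - 1

    def chunk_values(values, max_chunk_bytes=MAX_CHUNK_BYTES):
        chunks, current, size = [], [], 0
        for value in values:
            if current and size + len(value) > max_chunk_bytes:
                chunks.append(current)
                current, size = [], 0
            current.append(value)
            size += len(value)
        if current:
            chunks.append(current)
        return chunks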
37 changes: 29 additions & 8 deletions python/pyarrow/tests/test_parquet.py
@@ -2091,6 +2091,13 @@ def test_large_table_int32_overflow():
     _write_table(table, f)
 
 
+def _simple_table_roundtrip(table):
+    stream = pa.BufferOutputStream()
+    _write_table(table, stream)
+    buf = stream.getvalue()
+    return _read_table(buf)
+
+
 @pytest.mark.pandas
 @pytest.mark.large_memory
 def test_binary_array_overflow_to_chunked():
@@ -2103,22 +2110,36 @@ def test_binary_array_overflow_to_chunked():
     df = pd.DataFrame({'byte_col': values})
 
     tbl = pa.Table.from_pandas(df, preserve_index=False)
-
-    buf = io.BytesIO()
-    _write_table(tbl, buf)
-    buf.seek(0)
-    read_tbl = _read_table(buf)
-    buf = None
+    read_tbl = _simple_table_roundtrip(tbl)
 
     col0_data = read_tbl[0].data
     assert isinstance(col0_data, pa.ChunkedArray)
 
-    # Split up into 16MB chunks. 128 * 16 = 2048, so 129
-    assert col0_data.num_chunks == 129
+    # Split up into 2GB chunks
+    assert col0_data.num_chunks == 2
 
     assert tbl.equals(read_tbl)
 
 
+@pytest.mark.pandas
+@pytest.mark.large_memory
+def test_list_of_binary_large_cell():
+    # ARROW-4688
+    data = []
+
+    # TODO(wesm): handle chunked children
+    # 2^31 - 1 bytes in a single cell
+    # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
+
+    # A little under 2GB in total, in 214 cells of approximately 10MB each
+    data.extend([[b'x' * 1000000] * 10] * 214)
+
+    arr = pa.array(data)
+    table = pa.Table.from_arrays([arr], ['chunky_cells'])
+    read_table = _simple_table_roundtrip(table)
+    assert table.equals(read_table)
 
 
 @pytest.mark.pandas
 def test_index_column_name_duplicate(tempdir):
     data = {
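For context, _write_table and _read_table are thin helpers defined in the test module around pyarrow.parquet.write_table and read_table. A minimal sketch of the same in-memory roundtrip using the public API, assuming only that write_table accepts an output stream and read_table a buffer reader:

    import pyarrow as pa
    import pyarrow.parquet as pq

    def roundtrip(table):
        # Write the table to an in-memory buffer, then read it back.
        stream = pa.BufferOutputStream()
        pq.write_table(table, stream)
        return pq.read_table(pa.BufferReader(stream.getvalue()))

    table = pa.Table.from_arrays([pa.array([b'x', b'y'])], ['col'])
    assert roundtrip(table).equals(table)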