From b88558b5e3dd9f4b119b433c8779e683e1f973a9 Mon Sep 17 00:00:00 2001 From: zanmato Date: Wed, 27 Dec 2023 19:40:43 -0800 Subject: [PATCH 1/8] Give explicit error in ExecBatchBuilder when appending data exceeds offset limit (int32 max) --- cpp/src/arrow/compute/light_array.cc | 38 +++++++++++++++++----------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 66d8477b029..09c48d8e101 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -20,6 +20,7 @@ #include #include "arrow/util/bitmap_ops.h" +#include "arrow/util/macros.h" namespace arrow { namespace compute { @@ -325,8 +326,8 @@ Status ResizableArrayData::ResizeVaryingLengthBuffer() { column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie(); if (!column_metadata.is_fixed_length) { - int min_new_size = static_cast(reinterpret_cast( - buffers_[kFixedLengthBuffer]->data())[num_rows_]); + int32_t min_new_size = + reinterpret_cast(buffers_[kFixedLengthBuffer]->data())[num_rows_]; ARROW_DCHECK(var_len_buf_size_ > 0); if (var_len_buf_size_ < min_new_size) { int new_size = var_len_buf_size_; @@ -465,8 +466,7 @@ void ExecBatchBuilder::Visit(const std::shared_ptr& column, int num_r if (!metadata.is_fixed_length) { const uint8_t* ptr_base = column->buffers[2]->data(); - const uint32_t* offsets = - reinterpret_cast(column->buffers[1]->data()) + column->offset; + const int32_t* offsets = column->GetValues(1); for (int i = 0; i < num_rows; ++i) { uint16_t row_id = row_ids[i]; const uint8_t* field_ptr = ptr_base + offsets[row_id]; @@ -575,18 +575,26 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source // Step 1: calculate target offsets // - uint32_t* offsets = reinterpret_cast(target->mutable_data(1)); - uint32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; - Visit(source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - offsets[num_rows_before + i] = num_bytes; - }); - for (int i = 0; i < num_rows_to_append; ++i) { - uint32_t length = offsets[num_rows_before + i]; - offsets[num_rows_before + i] = sum; - sum += length; + int32_t* offsets = reinterpret_cast(target->mutable_data(1)); + { + int32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; + Visit(source, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + offsets[num_rows_before + i] = num_bytes; + }); + for (int i = 0; i < num_rows_to_append; ++i) { + int32_t length = offsets[num_rows_before + i]; + offsets[num_rows_before + i] = sum; + if (ARROW_PREDICT_FALSE(static_cast(sum) + length > + std::numeric_limits::max())) { + return Status::Invalid("ExecBatchBuilder offset overflow detected for the ", + i + 1, "-th element, last offset ", sum, ", length ", + length); + } + sum += length; + } + offsets[num_rows_before + num_rows_to_append] = sum; } - offsets[num_rows_before + num_rows_to_append] = sum; // Step 2: resize output buffers // From 9797e24ea3cadd751b13875a7ebd6b786e712bb1 Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 28 Dec 2023 00:17:03 -0800 Subject: [PATCH 2/8] Add tests --- cpp/src/arrow/compute/light_array.cc | 21 ++++---- cpp/src/arrow/compute/light_array.h | 2 +- cpp/src/arrow/compute/light_array_test.cc | 58 +++++++++++++++++++++++ cpp/src/arrow/testing/generator.cc | 9 +++- 4 files changed, 78 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 09c48d8e101..c48368f2805 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -19,6 +19,7 @@ #include +#include "arrow/array/builder_binary.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/macros.h" @@ -326,11 +327,11 @@ Status ResizableArrayData::ResizeVaryingLengthBuffer() { column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie(); if (!column_metadata.is_fixed_length) { - int32_t min_new_size = + int64_t min_new_size = reinterpret_cast(buffers_[kFixedLengthBuffer]->data())[num_rows_]; ARROW_DCHECK(var_len_buf_size_ > 0); if (var_len_buf_size_ < min_new_size) { - int new_size = var_len_buf_size_; + int64_t new_size = var_len_buf_size_; while (new_size < min_new_size) { new_size *= 2; } @@ -577,23 +578,23 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source // int32_t* offsets = reinterpret_cast(target->mutable_data(1)); { - int32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; + int64_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; Visit(source, num_rows_to_append, row_ids, [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[num_rows_before + i] = num_bytes; }); for (int i = 0; i < num_rows_to_append; ++i) { int32_t length = offsets[num_rows_before + i]; - offsets[num_rows_before + i] = sum; - if (ARROW_PREDICT_FALSE(static_cast(sum) + length > - std::numeric_limits::max())) { - return Status::Invalid("ExecBatchBuilder offset overflow detected for the ", - i + 1, "-th element, last offset ", sum, ", length ", - length); + offsets[num_rows_before + i] = static_cast(sum); + if (ARROW_PREDICT_FALSE(sum + length > BinaryBuilder::memory_limit())) { + return Status::Invalid("ExecBatchBuilder cannot contain more than ", + BinaryBuilder::memory_limit(), " bytes, current ", sum, + ", appending ", num_rows_before + i + 1, + "-th element of length ", length); } sum += length; } - offsets[num_rows_before + num_rows_to_append] = sum; + offsets[num_rows_before + num_rows_to_append] = static_cast(sum); } // Step 2: resize output buffers diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h index 84aa86d64bb..67de71bf56c 100644 --- a/cpp/src/arrow/compute/light_array.h +++ b/cpp/src/arrow/compute/light_array.h @@ -353,7 +353,7 @@ class ARROW_EXPORT ResizableArrayData { MemoryPool* pool_; int num_rows_; int num_rows_allocated_; - int var_len_buf_size_; + int64_t var_len_buf_size_; static constexpr int kMaxBuffers = 3; std::shared_ptr buffers_[kMaxBuffers]; }; diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index d50e9675517..6c23a138e1b 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/array/builder_binary.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -407,6 +408,63 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) { ASSERT_EQ(0, pool->bytes_allocated()); } +TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + constexpr auto eight_mb = 8 * 1024 * 1024; + std::string str_8mb(eight_mb, 'a'); + std::string str_8mb_minus_1(eight_mb - 1, 'b'); + std::string str_8mb_minus_2(eight_mb - 2, + 'b'); // BaseBinaryBuilder::memory_limit() + std::shared_ptr values_8mb = ConstantArrayGenerator::String(1, str_8mb); + std::shared_ptr values_8mb_minus_1 = + ConstantArrayGenerator::String(1, str_8mb_minus_1); + std::shared_ptr values_8mb_minus_2 = + ConstantArrayGenerator::String(1, str_8mb_minus_2); + + ExecBatch batch_8mb({values_8mb}, 1); + ExecBatch batch_8mb_minus_1({values_8mb_minus_1}, 1); + ExecBatch batch_8mb_minus_2({values_8mb_minus_2}, 1); + + auto num_rows = std::numeric_limits::max() / eight_mb; + StringBuilder values_ref_builder; + ASSERT_OK(values_ref_builder.Reserve(num_rows + 1)); + for (int i = 0; i < num_rows; i++) { + ASSERT_OK(values_ref_builder.Append(str_8mb)); + } + ASSERT_OK(values_ref_builder.Append(str_8mb_minus_2)); + ASSERT_OK_AND_ASSIGN(auto values_ref, values_ref_builder.Finish()); + auto values_ref_1 = values_ref->Slice(0, num_rows); + ExecBatch batch_ref_1({values_ref_1}, num_rows); + ExecBatch batch_ref_2({values_ref}, num_rows + 1); + + std::vector first_set_row_ids(num_rows, 0); + std::vector second_set_row_ids(1, 0); + + { + ExecBatchBuilder builder; + ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, first_set_row_ids.data(), + /*num_cols=*/1)); + ASSERT_RAISES(Invalid, builder.AppendSelected(pool, batch_8mb_minus_1, 1, + second_set_row_ids.data(), + /*num_cols=*/1)); + } + + { + ExecBatchBuilder builder; + ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, first_set_row_ids.data(), + /*num_cols=*/1)); + ASSERT_OK(builder.AppendSelected(pool, batch_8mb_minus_2, 1, + second_set_row_ids.data(), + /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(batch_ref_2, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + + ASSERT_EQ(0, pool->bytes_allocated()); +} + TEST(KeyColumnArray, FromExecBatch) { ExecBatch batch = JSONToExecBatch({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc index 36c88c20efe..5ea6a541e89 100644 --- a/cpp/src/arrow/testing/generator.cc +++ b/cpp/src/arrow/testing/generator.cc @@ -38,6 +38,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/string.h" @@ -103,7 +104,13 @@ std::shared_ptr ConstantArrayGenerator::Float64(int64_t size, std::shared_ptr ConstantArrayGenerator::String(int64_t size, std::string value) { - return ConstantArray(size, value); + using BuilderType = typename TypeTraits::BuilderType; + auto type = TypeTraits::type_singleton(); + auto builder_fn = [&](BuilderType* builder) { + DCHECK_OK(builder->Append(std::string_view(value.data()))); + }; + return ArrayFromBuilderVisitor(type, value.size() * size, size, builder_fn) + .ValueOrDie(); } std::shared_ptr ConstantArrayGenerator::Zeroes( From eb91fce42680aef3d3806800997bbbdab4585335 Mon Sep 17 00:00:00 2001 From: "Rossi(Ruoxi) Sun" Date: Thu, 18 Jan 2024 01:27:55 +0800 Subject: [PATCH 3/8] Update cpp/src/arrow/compute/light_array.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/compute/light_array.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index c48368f2805..8a2d3902458 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -327,8 +327,7 @@ Status ResizableArrayData::ResizeVaryingLengthBuffer() { column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie(); if (!column_metadata.is_fixed_length) { - int64_t min_new_size = - reinterpret_cast(buffers_[kFixedLengthBuffer]->data())[num_rows_]; + int64_t min_new_size = buffers_[kFixedLengthBuffer]->data_as()[num_rows_]; ARROW_DCHECK(var_len_buf_size_ > 0); if (var_len_buf_size_ < min_new_size) { int64_t new_size = var_len_buf_size_; From a421d8e911b9d34b750de076ff7f7ac8171cba16 Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 18 Jan 2024 15:16:44 +0800 Subject: [PATCH 4/8] Address comments --- cpp/src/arrow/compute/light_array.cc | 38 +++++++-------- cpp/src/arrow/compute/light_array_test.cc | 58 ++++++++++++----------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 8a2d3902458..ad05eb26dbe 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -19,8 +19,8 @@ #include -#include "arrow/array/builder_binary.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/macros.h" namespace arrow { @@ -470,7 +470,7 @@ void ExecBatchBuilder::Visit(const std::shared_ptr& column, int num_r for (int i = 0; i < num_rows; ++i) { uint16_t row_id = row_ids[i]; const uint8_t* field_ptr = ptr_base + offsets[row_id]; - uint32_t field_length = offsets[row_id + 1] - offsets[row_id]; + int32_t field_length = offsets[row_id + 1] - offsets[row_id]; process_value_fn(i, field_ptr, field_length); } } else { @@ -480,7 +480,7 @@ void ExecBatchBuilder::Visit(const std::shared_ptr& column, int num_r const uint8_t* field_ptr = column->buffers[1]->data() + (column->offset + row_id) * static_cast(metadata.fixed_length); - process_value_fn(i, field_ptr, metadata.fixed_length); + process_value_fn(i, field_ptr, static_cast(metadata.fixed_length)); } } } @@ -576,25 +576,23 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source // Step 1: calculate target offsets // int32_t* offsets = reinterpret_cast(target->mutable_data(1)); - { - int64_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; - Visit(source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - offsets[num_rows_before + i] = num_bytes; - }); - for (int i = 0; i < num_rows_to_append; ++i) { - int32_t length = offsets[num_rows_before + i]; - offsets[num_rows_before + i] = static_cast(sum); - if (ARROW_PREDICT_FALSE(sum + length > BinaryBuilder::memory_limit())) { - return Status::Invalid("ExecBatchBuilder cannot contain more than ", - BinaryBuilder::memory_limit(), " bytes, current ", sum, - ", appending ", num_rows_before + i + 1, - "-th element of length ", length); - } - sum += length; + int32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; + Visit(source, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + offsets[num_rows_before + i] = num_bytes; + }); + for (int i = 0; i < num_rows_to_append; ++i) { + int32_t length = offsets[num_rows_before + i]; + offsets[num_rows_before + i] = sum; + int32_t new_sum_maybe_overflow = 0; + if (ARROW_PREDICT_FALSE(internal::AddWithOverflow(sum, length, &new_sum_maybe_overflow))) { + return Status::Invalid("Overflow detected in ExecBatchBuilder when appending ", + num_rows_before + i + 1, "-th element of length ", length, + " bytes to current length ", sum, " bytes"); } - offsets[num_rows_before + num_rows_to_append] = static_cast(sum); + sum = new_sum_maybe_overflow; } + offsets[num_rows_before + num_rows_to_append] = sum; // Step 2: resize output buffers // diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 6c23a138e1b..2befa2b7839 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,7 +20,6 @@ #include #include -#include "arrow/array/builder_binary.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -409,56 +408,59 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) { } TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) { + // Corresponds to GH-39332. std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); constexpr auto eight_mb = 8 * 1024 * 1024; + constexpr auto eight_mb_minus_one = eight_mb - 1; + // String of size 8mb to repetitively fill the heading multiple of 8mbs of an array + // of int32_max bytes. std::string str_8mb(eight_mb, 'a'); - std::string str_8mb_minus_1(eight_mb - 1, 'b'); - std::string str_8mb_minus_2(eight_mb - 2, - 'b'); // BaseBinaryBuilder::memory_limit() + // String of size (8mb - 1) to be the last element of an array of int32_max bytes. + std::string str_8mb_minus_1(eight_mb_minus_one, 'b'); std::shared_ptr values_8mb = ConstantArrayGenerator::String(1, str_8mb); std::shared_ptr values_8mb_minus_1 = ConstantArrayGenerator::String(1, str_8mb_minus_1); - std::shared_ptr values_8mb_minus_2 = - ConstantArrayGenerator::String(1, str_8mb_minus_2); ExecBatch batch_8mb({values_8mb}, 1); ExecBatch batch_8mb_minus_1({values_8mb_minus_1}, 1); - ExecBatch batch_8mb_minus_2({values_8mb_minus_2}, 1); auto num_rows = std::numeric_limits::max() / eight_mb; - StringBuilder values_ref_builder; - ASSERT_OK(values_ref_builder.Reserve(num_rows + 1)); - for (int i = 0; i < num_rows; i++) { - ASSERT_OK(values_ref_builder.Append(str_8mb)); - } - ASSERT_OK(values_ref_builder.Append(str_8mb_minus_2)); - ASSERT_OK_AND_ASSIGN(auto values_ref, values_ref_builder.Finish()); - auto values_ref_1 = values_ref->Slice(0, num_rows); - ExecBatch batch_ref_1({values_ref_1}, num_rows); - ExecBatch batch_ref_2({values_ref}, num_rows + 1); - - std::vector first_set_row_ids(num_rows, 0); - std::vector second_set_row_ids(1, 0); + std::vector body_row_ids(num_rows, 0); + std::vector tail_row_id(1, 0); { + // Building an array of (int32_max + 1) = (8mb * num_rows + 8mb) bytes should raise an + // error of overflow. ExecBatchBuilder builder; - ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, first_set_row_ids.data(), + ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, body_row_ids.data(), /*num_cols=*/1)); - ASSERT_RAISES(Invalid, builder.AppendSelected(pool, batch_8mb_minus_1, 1, - second_set_row_ids.data(), - /*num_cols=*/1)); + std::stringstream ss; + ss << "Invalid: Overflow detected in ExecBatchBuilder when appending " << num_rows + 1 + << "-th element of length " << eight_mb << " bytes to current length " + << eight_mb * num_rows << " bytes"; + ASSERT_RAISES_WITH_MESSAGE( + Invalid, ss.str(), + builder.AppendSelected(pool, batch_8mb, 1, tail_row_id.data(), + /*num_cols=*/1)); } { + // Building an array of int32_max = (8mb * num_rows + 8mb - 1) bytes should succeed. ExecBatchBuilder builder; - ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, first_set_row_ids.data(), + ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, body_row_ids.data(), /*num_cols=*/1)); - ASSERT_OK(builder.AppendSelected(pool, batch_8mb_minus_2, 1, - second_set_row_ids.data(), + ASSERT_OK(builder.AppendSelected(pool, batch_8mb_minus_1, 1, tail_row_id.data(), /*num_cols=*/1)); ExecBatch built = builder.Flush(); - ASSERT_EQ(batch_ref_2, built); + auto datum = built[0]; + ASSERT_TRUE(datum.is_array()); + auto array = datum.array_as(); + ASSERT_EQ(array->length(), num_rows + 1); + for (int i = 0; i < num_rows; ++i) { + ASSERT_EQ(array->GetString(i), str_8mb); + } + ASSERT_EQ(array->GetString(num_rows), str_8mb_minus_1); ASSERT_NE(0, pool->bytes_allocated()); } From f9b7ff8cd66ebfc753be7b6a88b1fc917844e910 Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 18 Jan 2024 15:23:48 +0800 Subject: [PATCH 5/8] Address comment --- cpp/src/arrow/compute/light_array.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index ad05eb26dbe..e950977f24a 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -511,14 +511,14 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source break; case 1: Visit(source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { target->mutable_data(1)[num_rows_before + i] = *ptr; }); break; case 2: Visit( source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { reinterpret_cast(target->mutable_data(1))[num_rows_before + i] = *reinterpret_cast(ptr); }); @@ -526,7 +526,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source case 4: Visit( source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { reinterpret_cast(target->mutable_data(1))[num_rows_before + i] = *reinterpret_cast(ptr); }); @@ -534,7 +534,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source case 8: Visit( source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { reinterpret_cast(target->mutable_data(1))[num_rows_before + i] = *reinterpret_cast(ptr); }); @@ -544,7 +544,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source num_rows_to_append - NumRowsToSkip(source, num_rows_to_append, row_ids, sizeof(uint64_t)); Visit(source, num_rows_to_process, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { uint64_t* dst = reinterpret_cast( target->mutable_data(1) + static_cast(num_bytes) * (num_rows_before + i)); @@ -558,7 +558,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source if (num_rows_to_append > num_rows_to_process) { Visit(source, num_rows_to_append - num_rows_to_process, row_ids + num_rows_to_process, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { uint64_t* dst = reinterpret_cast( target->mutable_data(1) + static_cast(num_bytes) * @@ -578,7 +578,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source int32_t* offsets = reinterpret_cast(target->mutable_data(1)); int32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before]; Visit(source, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { offsets[num_rows_before + i] = num_bytes; }); for (int i = 0; i < num_rows_to_append; ++i) { @@ -604,7 +604,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source num_rows_to_append - NumRowsToSkip(source, num_rows_to_append, row_ids, sizeof(uint64_t)); Visit(source, num_rows_to_process, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { uint64_t* dst = reinterpret_cast(target->mutable_data(2) + offsets[num_rows_before + i]); const uint64_t* src = reinterpret_cast(ptr); @@ -614,7 +614,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source } }); Visit(source, num_rows_to_append - num_rows_to_process, row_ids + num_rows_to_process, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + [&](int i, const uint8_t* ptr, int32_t num_bytes) { uint64_t* dst = reinterpret_cast( target->mutable_data(2) + offsets[num_rows_before + num_rows_to_process + i]); From c0db9bd93713d049aa17c2147915206c21fdf97e Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 18 Jan 2024 15:37:10 +0800 Subject: [PATCH 6/8] Fix lint --- cpp/src/arrow/compute/light_array.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index e950977f24a..87169eef57e 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -585,7 +585,8 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source int32_t length = offsets[num_rows_before + i]; offsets[num_rows_before + i] = sum; int32_t new_sum_maybe_overflow = 0; - if (ARROW_PREDICT_FALSE(internal::AddWithOverflow(sum, length, &new_sum_maybe_overflow))) { + if (ARROW_PREDICT_FALSE( + internal::AddWithOverflow(sum, length, &new_sum_maybe_overflow))) { return Status::Invalid("Overflow detected in ExecBatchBuilder when appending ", num_rows_before + i + 1, "-th element of length ", length, " bytes to current length ", sum, " bytes"); From c1ae59115051ef9653362078215bea3b789302ba Mon Sep 17 00:00:00 2001 From: zanmato Date: Thu, 18 Jan 2024 17:18:44 +0800 Subject: [PATCH 7/8] Fix compiler error --- cpp/src/arrow/compute/light_array.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 87169eef57e..b225e04b05c 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -586,7 +586,7 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source offsets[num_rows_before + i] = sum; int32_t new_sum_maybe_overflow = 0; if (ARROW_PREDICT_FALSE( - internal::AddWithOverflow(sum, length, &new_sum_maybe_overflow))) { + arrow::internal::AddWithOverflow(sum, length, &new_sum_maybe_overflow))) { return Status::Invalid("Overflow detected in ExecBatchBuilder when appending ", num_rows_before + i + 1, "-th element of length ", length, " bytes to current length ", sum, " bytes"); From 34d2d41d9d599e53c1d5fcd215bc1dd959786299 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 18 Jan 2024 10:54:17 +0100 Subject: [PATCH 8/8] Skip test on 32-bit systems --- cpp/src/arrow/compute/light_array_test.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 2befa2b7839..ecc5f3ad379 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -408,7 +408,11 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) { } TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) { - // Corresponds to GH-39332. + // GH-39332: check appending variable-length data past 2GB. + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; + } + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); constexpr auto eight_mb = 8 * 1024 * 1024;