diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 5c247a6dc66..e593cf7e6c4 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -473,6 +473,70 @@ class TestStringBuilder : public TestBuilder { CheckStringArray(*result_, strings, is_valid, reps); } + void TestExtendCurrent() { + std::vector strings = {"", "bbbb", "aaaaa", "", "ccc"}; + std::vector is_valid = {1, 1, 1, 0, 1}; + + int N = static_cast(strings.size()); + int reps = 10; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (!is_valid[i]) { + ASSERT_OK(builder_->AppendNull()); + } else if (strings[i].length() > 3) { + ASSERT_OK(builder_->Append(strings[i].substr(0, 3))); + ASSERT_OK(builder_->ExtendCurrent(strings[i].substr(3))); + } else { + ASSERT_OK(builder_->Append(strings[i])); + } + } + } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 12, result_->value_data()->size()); + + CheckStringArray(*result_, strings, is_valid, reps); + } + + void TestExtendCurrentUnsafe() { + std::vector strings = {"", "bbbb", "aaaaa", "", "ccc"}; + std::vector is_valid = {1, 1, 1, 0, 1}; + + int N = static_cast(strings.size()); + int reps = 13; + int64_t total_length = 0; + for (const auto& s : strings) { + total_length += static_cast(s.size()); + } + + ASSERT_OK(builder_->Reserve(N * reps)); + ASSERT_OK(builder_->ReserveData(total_length * reps)); + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (!is_valid[i]) { + builder_->UnsafeAppendNull(); + } else if (strings[i].length() > 3) { + builder_->UnsafeAppend(strings[i].substr(0, 3)); + builder_->UnsafeExtendCurrent(strings[i].substr(3)); + } else { + builder_->UnsafeAppend(strings[i]); + } + } + } + ASSERT_EQ(builder_->value_data_length(), total_length * reps); + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 12, result_->value_data()->size()); + + CheckStringArray(*result_, strings, is_valid, reps); + } + void TestVectorAppend() { std::vector strings = {"", "bb", "a", "", "ccc"}; std::vector valid_bytes = {1, 1, 1, 0, 1}; @@ -608,6 +672,12 @@ TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); } TYPED_TEST(TestStringBuilder, TestScalarAppendUnsafe) { this->TestScalarAppendUnsafe(); } +TYPED_TEST(TestStringBuilder, TestExtendCurrent) { this->TestExtendCurrent(); } + +TYPED_TEST(TestStringBuilder, TestExtendCurrentUnsafe) { + this->TestExtendCurrentUnsafe(); +} + TYPED_TEST(TestStringBuilder, TestVectorAppend) { this->TestVectorAppend(); } TYPED_TEST(TestStringBuilder, TestAppendCStringsWithValidBytes) { diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index bc49c7d6787..a60031258ad 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -77,6 +77,23 @@ class BaseBinaryBuilder : public ArrayBuilder { return Append(value.data(), static_cast(value.size())); } + /// Extend the last appended value by appending more data at the end + /// + /// Unlike Append, this does not create a new offset. + Status ExtendCurrent(const uint8_t* value, offset_type length) { + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + return Status::OK(); + } + + Status ExtendCurrent(util::string_view value) { + return ExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); ARROW_RETURN_NOT_OK(Reserve(length)); @@ -133,12 +150,28 @@ class BaseBinaryBuilder : public ArrayBuilder { UnsafeAppend(value.data(), static_cast(value.size())); } + /// Like ExtendCurrent, but do not check capacity + void UnsafeExtendCurrent(const uint8_t* value, offset_type length) { + value_data_builder_.UnsafeAppend(value, length); + } + + void UnsafeExtendCurrent(util::string_view value) { + UnsafeExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + void UnsafeAppendNull() { const int64_t num_bytes = value_data_builder_.length(); offsets_builder_.UnsafeAppend(static_cast(num_bytes)); UnsafeAppendToBitmap(false); } + void UnsafeAppendEmptyValue() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(true); + } + /// \brief Append a sequence of strings in one shot. /// /// \param[in] values a vector of strings diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index f74bb245d77..0f94baaedfc 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -210,8 +210,9 @@ Status Function::Validate() const { if (arity_.is_varargs && arg_count == arity_.num_args + 1) { return Status::OK(); } - return Status::Invalid("In function '", name_, - "': ", "number of argument names != function arity"); + return Status::Invalid( + "In function '", name_, + "': ", "number of argument names for function documentation != function arity"); } return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 6d5c837f514..913c4dacf56 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -1193,15 +1193,15 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) { } // similar to GenerateTypeAgnosticPrimitive, but for variable types -template