diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index a97bf134604..682baab208d 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -397,6 +397,32 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) { } } +void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) { + std::unique_ptr builder; + auto null_scalar = MakeNullScalar(scalar->type); + ASSERT_OK(MakeBuilder(pool, scalar->type, &builder)); + ASSERT_OK(builder->AppendScalar(*scalar)); + ASSERT_OK(builder->AppendScalar(*scalar)); + ASSERT_OK(builder->AppendScalar(*null_scalar)); + ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); + ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2)); + ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); + + std::shared_ptr out; + FinishAndCheckPadding(builder.get(), &out); + ASSERT_OK(out->ValidateFull()); + ASSERT_EQ(out->length(), 9); + ASSERT_EQ(out->null_count(), 4); + for (const auto index : {0, 1, 3, 5, 6}) { + ASSERT_FALSE(out->IsNull(index)); + ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index)); + AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true); + } + for (const auto index : {2, 4, 7, 8}) { + ASSERT_TRUE(out->IsNull(index)); + } +} + TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_OK_AND_ASSIGN(auto null_array, MakeArrayFromScalar(NullScalar(), 5)); ASSERT_OK(null_array->ValidateFull()); @@ -447,6 +473,10 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_EQ(array->null_count(), 0); } } + + for (auto scalar : scalars) { + AssertAppendScalar(pool_, scalar); + } } TEST_F(TestArray, TestMakeArrayFromDictionaryScalar) { @@ -481,6 +511,8 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) { ASSERT_OK_AND_ASSIGN(auto item, array->GetScalar(i)); ASSERT_TRUE(item->Equals(scalar)); } + + AssertAppendScalar(pool_, std::make_shared(scalar)); } TEST_F(TestArray, ValidateBuffersPrimitive) { diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index b92cc285894..c892e3d664b 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -24,8 +24,11 @@ #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { @@ -92,6 +95,162 @@ Status ArrayBuilder::Advance(int64_t elements) { return null_bitmap_builder_.Advance(elements); } +namespace { +struct AppendScalarImpl { + template + enable_if_t::value || is_decimal_type::value || + is_fixed_size_binary_type::value, + Status> + Visit(const T&) { + auto builder = internal::checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + builder->UnsafeAppend(scalar->value); + } else { + builder->UnsafeAppendNull(); + } + } + } + return Status::OK(); + } + + template + enable_if_base_binary Visit(const T&) { + int64_t data_size = 0; + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + data_size += scalar->value->size(); + } + } + + auto builder = internal::checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); + RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size)); + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + builder->UnsafeAppend(util::string_view{*scalar->value}); + } else { + builder->UnsafeAppendNull(); + } + } + } + return Status::OK(); + } + + template + enable_if_list_like Visit(const T&) { + auto builder = internal::checked_cast::BuilderType*>(builder_); + int64_t num_children = 0; + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if (!(*scalar)->is_valid) continue; + num_children += + internal::checked_cast(**scalar).value->length(); + } + RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_)); + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if ((*scalar)->is_valid) { + RETURN_NOT_OK(builder->Append()); + const Array& list = + *internal::checked_cast(**scalar).value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } + } + } + return Status::OK(); + } + + Status Visit(const StructType& type) { + auto* builder = internal::checked_cast(builder_); + auto count = n_repeats_ * (scalars_end_ - scalars_begin_); + RETURN_NOT_OK(builder->Reserve(count)); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count)); + } + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) { + const auto& scalar = internal::checked_cast(**s); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + if (!scalar.is_valid || !scalar.value[field_index]) { + RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); + } else { + RETURN_NOT_OK(builder->field_builder(field_index) + ->AppendScalar(*scalar.value[field_index])); + } + } + RETURN_NOT_OK(builder->Append(scalar.is_valid)); + } + } + return Status::OK(); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("AppendScalar for type ", type); + } + + Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); } + + const std::shared_ptr* scalars_begin_; + const std::shared_ptr* scalars_end_; + int64_t n_repeats_; + ArrayBuilder* builder_; +}; +} // namespace + +Status ArrayBuilder::AppendScalar(const Scalar& scalar) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; + return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert(); +} + +Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; + return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert(); +} + +Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { + if (scalars.empty()) return Status::OK(); + const auto ty = type(); + for (const auto& scalar : scalars) { + if (!scalar->type->Equals(ty)) { + return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), + " to builder for type ", type()->ToString()); + } + } + return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(), + /*n_repeats=*/1, this} + .Convert(); +} + Status ArrayBuilder::Finish(std::shared_ptr* out) { std::shared_ptr internal_data; RETURN_NOT_OK(FinishInternal(&internal_data)); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 15c726241b5..8e60c306796 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -116,6 +116,11 @@ class ARROW_EXPORT ArrayBuilder { /// This method is useful when appending null values to a parent nested type. virtual Status AppendEmptyValues(int64_t length) = 0; + /// \brief Append a value from a scalar + Status AppendScalar(const Scalar& scalar); + Status AppendScalar(const Scalar& scalar, int64_t n_repeats); + Status AppendScalars(const ScalarVector& scalars); + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index c1c664a1249..7653eeca5c4 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -467,6 +467,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status Append(const Buffer& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(util::string_view(s)); + return Status::OK(); + } + + Status Append(const std::shared_ptr& s) { return Append(*s); } + template Status Append(const std::array& value) { ARROW_RETURN_NOT_OK(Reserve(1)); @@ -502,6 +510,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); } + + void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 40d6ce1ba9a..455cb3df7b1 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -29,6 +29,7 @@ #include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/array/data.h" #include "arrow/array/util.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc index 71fad394d00..33371d55c6d 100644 --- a/cpp/src/arrow/testing/generator.cc +++ b/cpp/src/arrow/testing/generator.cc @@ -95,88 +95,16 @@ std::shared_ptr ConstantArrayGenerator::String(int64_t size, return ConstantArray(size, value); } -struct ScalarVectorToArrayImpl { - template ::BuilderType, - typename ScalarType = typename TypeTraits::ScalarType> - Status UseBuilder(const AppendScalar& append) { - BuilderType builder(type_, default_memory_pool()); - for (const auto& s : scalars_) { - if (s->is_valid) { - RETURN_NOT_OK(append(internal::checked_cast(*s), &builder)); - } else { - RETURN_NOT_OK(builder.AppendNull()); - } - } - return builder.FinishInternal(&data_); - } - - struct AppendValue { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - return builder->Append(s.value); - } - }; - - struct AppendBuffer { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - const Buffer& buffer = *s.value; - return builder->Append(util::string_view{buffer}); - } - }; - - template - enable_if_primitive_ctype Visit(const T&) { - return UseBuilder(AppendValue{}); - } - - template - enable_if_has_string_view Visit(const T&) { - return UseBuilder(AppendBuffer{}); - } - - Status Visit(const StructType& type) { - data_ = ArrayData::Make(type_, static_cast(scalars_.size()), - {/*null_bitmap=*/nullptr}); - data_->child_data.resize(type_->num_fields()); - - ScalarVector field_scalars(scalars_.size()); - - for (int field_index = 0; field_index < type.num_fields(); ++field_index) { - for (size_t i = 0; i < scalars_.size(); ++i) { - field_scalars[i] = - internal::checked_cast(scalars_[i].get())->value[field_index]; - } - - ARROW_ASSIGN_OR_RAISE(data_->child_data[field_index], - ScalarVectorToArrayImpl{}.Convert(field_scalars)); - } - return Status::OK(); - } - - Status Visit(const DataType& type) { - return Status::NotImplemented("ScalarVectorToArray for type ", type); - } - - Result> Convert(const ScalarVector& scalars) && { - if (scalars.size() == 0) { - return Status::NotImplemented("ScalarVectorToArray with no scalars"); - } - scalars_ = std::move(scalars); - type_ = scalars_[0]->type; - RETURN_NOT_OK(VisitTypeInline(*type_, this)); - return std::move(data_); - } - - std::shared_ptr type_; - ScalarVector scalars_; - std::shared_ptr data_; -}; - Result> ScalarVectorToArray(const ScalarVector& scalars) { - ARROW_ASSIGN_OR_RAISE(auto data, ScalarVectorToArrayImpl{}.Convert(scalars)); - return MakeArray(std::move(data)); + if (scalars.empty()) { + return Status::NotImplemented("ScalarVectorToArray with no scalars"); + } + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), scalars[0]->type, &builder)); + RETURN_NOT_OK(builder->AppendScalars(scalars)); + std::shared_ptr out; + RETURN_NOT_OK(builder->Finish(&out)); + return out; } } // namespace arrow