From 30053ec09ba0c7c48982f96cd08cfda035a62156 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 22 Jun 2021 17:53:39 -0400 Subject: [PATCH 1/9] ARROW-11932: [C++] Implement ArrayBuilder::AppendScalar --- cpp/src/arrow/array/array_test.cc | 33 ++++++++ cpp/src/arrow/array/builder_base.cc | 116 ++++++++++++++++++++++++++++ cpp/src/arrow/array/builder_base.h | 4 + cpp/src/arrow/array/builder_dict.h | 1 + cpp/src/arrow/testing/generator.cc | 90 +++------------------ 5 files changed, 163 insertions(+), 81 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index a97bf134604..97431a736e6 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -397,6 +397,33 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) { } } +void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) { + std::unique_ptr builder; + auto null_scalar = MakeNullScalar(scalar->type); + ASSERT_OK(MakeBuilder(pool, scalar->type, &builder)); + ASSERT_OK(builder->AppendScalar(*scalar)); + ASSERT_OK(builder->AppendScalar(*scalar)); + ASSERT_OK(builder->AppendScalar(*null_scalar)); + ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); + + std::shared_ptr out; + FinishAndCheckPadding(builder.get(), &out); + ASSERT_OK(out->ValidateFull()); + ASSERT_EQ(out->length(), 5); + ASSERT_EQ(out->null_count(), 2); + ASSERT_FALSE(out->IsNull(0)); + ASSERT_FALSE(out->IsNull(1)); + ASSERT_TRUE(out->IsNull(2)); + ASSERT_FALSE(out->IsNull(3)); + ASSERT_TRUE(out->IsNull(4)); + ASSERT_OK_AND_ASSIGN(auto scalar0, out->GetScalar(0)); + ASSERT_OK_AND_ASSIGN(auto scalar1, out->GetScalar(1)); + ASSERT_OK_AND_ASSIGN(auto scalar3, out->GetScalar(3)); + AssertScalarsEqual(*scalar, *scalar0, /*verbose=*/true); + AssertScalarsEqual(*scalar, *scalar1, /*verbose=*/true); + AssertScalarsEqual(*scalar, *scalar3, /*verbose=*/true); +} + TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_OK_AND_ASSIGN(auto null_array, MakeArrayFromScalar(NullScalar(), 5)); ASSERT_OK(null_array->ValidateFull()); @@ -447,6 +474,10 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_EQ(array->null_count(), 0); } } + + for (auto scalar : scalars) { + AssertAppendScalar(pool_, scalar); + } } TEST_F(TestArray, TestMakeArrayFromDictionaryScalar) { @@ -481,6 +512,8 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) { ASSERT_OK_AND_ASSIGN(auto item, array->GetScalar(i)); ASSERT_TRUE(item->Equals(scalar)); } + + AssertAppendScalar(pool_, std::make_shared(scalar)); } TEST_F(TestArray, ValidateBuffersPrimitive) { diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index b92cc285894..59202241111 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -24,8 +24,11 @@ #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { @@ -92,6 +95,119 @@ Status ArrayBuilder::Advance(int64_t elements) { return null_bitmap_builder_.Advance(elements); } +struct AppendScalarImpl { + template ::BuilderType, + typename ScalarType = typename TypeTraits::ScalarType> + Status UseBuilder(const AppendScalar& append) { + for (const auto scalar : scalars_) { + if (scalar->is_valid) { + RETURN_NOT_OK(append(internal::checked_cast(*scalar), + static_cast(builder_))); + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } + } + return Status::OK(); + } + + struct AppendValue { + template + Status operator()(const ScalarType& s, BuilderType* builder) const { + return builder->Append(s.value); + } + }; + + struct AppendBuffer { + template + Status operator()(const ScalarType& s, BuilderType* builder) const { + const Buffer& buffer = *s.value; + return builder->Append(util::string_view{buffer}); + } + }; + + struct AppendList { + template + Status operator()(const ScalarType& s, BuilderType* builder) const { + RETURN_NOT_OK(builder->Append()); + const Array& list = *s.value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + return Status::OK(); + } + }; + + template + enable_if_has_c_type Visit(const T&) { + return UseBuilder(AppendValue{}); + } + + template + enable_if_has_string_view Visit(const T&) { + return UseBuilder(AppendBuffer{}); + } + + template + enable_if_decimal Visit(const T&) { + return UseBuilder(AppendValue{}); + } + + template + enable_if_list_like Visit(const T&) { + return UseBuilder(AppendList{}); + } + + Status Visit(const StructType& type) { + auto* builder = static_cast(builder_); + for (const auto s : scalars_) { + const auto& scalar = internal::checked_cast(*s); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + if (!scalar.is_valid || !scalar.value[field_index]) { + RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); + } else { + RETURN_NOT_OK(builder->field_builder(field_index) + ->AppendScalar(*scalar.value[field_index])); + } + } + RETURN_NOT_OK(builder->Append(scalar.is_valid)); + } + return Status::OK(); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("AppendScalar for type ", type); + } + + Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); } + + std::vector scalars_; + ArrayBuilder* builder_; +}; + +Status ArrayBuilder::AppendScalar(const Scalar& scalar) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + return AppendScalarImpl{{&scalar}, this}.Convert(); +} + +Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { + if (scalars.empty()) return Status::OK(); + std::vector refs; + refs.reserve(scalars.size()); + for (const auto& scalar : scalars) { + if (!scalar->type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), + " to builder for type ", type()->ToString()); + } + refs.push_back(scalar.get()); + } + return AppendScalarImpl{refs, this}.Convert(); +} + Status ArrayBuilder::Finish(std::shared_ptr* out) { std::shared_ptr internal_data; RETURN_NOT_OK(FinishInternal(&internal_data)); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 15c726241b5..e0ac4d68e1a 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -116,6 +116,10 @@ class ARROW_EXPORT ArrayBuilder { /// This method is useful when appending null values to a parent nested type. virtual Status AppendEmptyValues(int64_t length) = 0; + /// \brief Append a value from a scalar + virtual Status AppendScalar(const Scalar& scalar); + virtual Status AppendScalars(const ScalarVector& scalars); + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 40d6ce1ba9a..455cb3df7b1 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -29,6 +29,7 @@ #include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/array/data.h" #include "arrow/array/util.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc index 71fad394d00..33371d55c6d 100644 --- a/cpp/src/arrow/testing/generator.cc +++ b/cpp/src/arrow/testing/generator.cc @@ -95,88 +95,16 @@ std::shared_ptr ConstantArrayGenerator::String(int64_t size, return ConstantArray(size, value); } -struct ScalarVectorToArrayImpl { - template ::BuilderType, - typename ScalarType = typename TypeTraits::ScalarType> - Status UseBuilder(const AppendScalar& append) { - BuilderType builder(type_, default_memory_pool()); - for (const auto& s : scalars_) { - if (s->is_valid) { - RETURN_NOT_OK(append(internal::checked_cast(*s), &builder)); - } else { - RETURN_NOT_OK(builder.AppendNull()); - } - } - return builder.FinishInternal(&data_); - } - - struct AppendValue { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - return builder->Append(s.value); - } - }; - - struct AppendBuffer { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - const Buffer& buffer = *s.value; - return builder->Append(util::string_view{buffer}); - } - }; - - template - enable_if_primitive_ctype Visit(const T&) { - return UseBuilder(AppendValue{}); - } - - template - enable_if_has_string_view Visit(const T&) { - return UseBuilder(AppendBuffer{}); - } - - Status Visit(const StructType& type) { - data_ = ArrayData::Make(type_, static_cast(scalars_.size()), - {/*null_bitmap=*/nullptr}); - data_->child_data.resize(type_->num_fields()); - - ScalarVector field_scalars(scalars_.size()); - - for (int field_index = 0; field_index < type.num_fields(); ++field_index) { - for (size_t i = 0; i < scalars_.size(); ++i) { - field_scalars[i] = - internal::checked_cast(scalars_[i].get())->value[field_index]; - } - - ARROW_ASSIGN_OR_RAISE(data_->child_data[field_index], - ScalarVectorToArrayImpl{}.Convert(field_scalars)); - } - return Status::OK(); - } - - Status Visit(const DataType& type) { - return Status::NotImplemented("ScalarVectorToArray for type ", type); - } - - Result> Convert(const ScalarVector& scalars) && { - if (scalars.size() == 0) { - return Status::NotImplemented("ScalarVectorToArray with no scalars"); - } - scalars_ = std::move(scalars); - type_ = scalars_[0]->type; - RETURN_NOT_OK(VisitTypeInline(*type_, this)); - return std::move(data_); - } - - std::shared_ptr type_; - ScalarVector scalars_; - std::shared_ptr data_; -}; - Result> ScalarVectorToArray(const ScalarVector& scalars) { - ARROW_ASSIGN_OR_RAISE(auto data, ScalarVectorToArrayImpl{}.Convert(scalars)); - return MakeArray(std::move(data)); + if (scalars.empty()) { + return Status::NotImplemented("ScalarVectorToArray with no scalars"); + } + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), scalars[0]->type, &builder)); + RETURN_NOT_OK(builder->AppendScalars(scalars)); + std::shared_ptr out; + RETURN_NOT_OK(builder->Finish(&out)); + return out; } } // namespace arrow From 333d80992d77fadb3535a51dfb74abb990d68c27 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 24 Jun 2021 11:45:26 -0400 Subject: [PATCH 2/9] ARROW-11932: [C++] Add ArrayBuilder::AppendScalar(const Scalar&, int64_t) --- cpp/src/arrow/array/array_test.cc | 25 +++++++-------- cpp/src/arrow/array/builder_base.cc | 49 ++++++++++++++++++----------- cpp/src/arrow/array/builder_base.h | 5 +-- 3 files changed, 46 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 97431a736e6..682baab208d 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -405,23 +405,22 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) ASSERT_OK(builder->AppendScalar(*scalar)); ASSERT_OK(builder->AppendScalar(*null_scalar)); ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); + ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2)); + ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); std::shared_ptr out; FinishAndCheckPadding(builder.get(), &out); ASSERT_OK(out->ValidateFull()); - ASSERT_EQ(out->length(), 5); - ASSERT_EQ(out->null_count(), 2); - ASSERT_FALSE(out->IsNull(0)); - ASSERT_FALSE(out->IsNull(1)); - ASSERT_TRUE(out->IsNull(2)); - ASSERT_FALSE(out->IsNull(3)); - ASSERT_TRUE(out->IsNull(4)); - ASSERT_OK_AND_ASSIGN(auto scalar0, out->GetScalar(0)); - ASSERT_OK_AND_ASSIGN(auto scalar1, out->GetScalar(1)); - ASSERT_OK_AND_ASSIGN(auto scalar3, out->GetScalar(3)); - AssertScalarsEqual(*scalar, *scalar0, /*verbose=*/true); - AssertScalarsEqual(*scalar, *scalar1, /*verbose=*/true); - AssertScalarsEqual(*scalar, *scalar3, /*verbose=*/true); + ASSERT_EQ(out->length(), 9); + ASSERT_EQ(out->null_count(), 4); + for (const auto index : {0, 1, 3, 5, 6}) { + ASSERT_FALSE(out->IsNull(index)); + ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index)); + AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true); + } + for (const auto index : {2, 4, 7, 8}) { + ASSERT_TRUE(out->IsNull(index)); + } } TEST_F(TestArray, TestMakeArrayFromScalar) { diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 59202241111..8dbad29008a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -100,12 +100,14 @@ struct AppendScalarImpl { typename BuilderType = typename TypeTraits::BuilderType, typename ScalarType = typename TypeTraits::ScalarType> Status UseBuilder(const AppendScalar& append) { - for (const auto scalar : scalars_) { - if (scalar->is_valid) { - RETURN_NOT_OK(append(internal::checked_cast(*scalar), - static_cast(builder_))); - } else { - RETURN_NOT_OK(builder_->AppendNull()); + for (int64_t i = 0; i < n_repeats_; i++) { + for (const auto scalar : scalars_) { + if (scalar->is_valid) { + RETURN_NOT_OK(append(internal::checked_cast(*scalar), + static_cast(builder_))); + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } } } return Status::OK(); @@ -160,18 +162,20 @@ struct AppendScalarImpl { } Status Visit(const StructType& type) { - auto* builder = static_cast(builder_); - for (const auto s : scalars_) { - const auto& scalar = internal::checked_cast(*s); - for (int field_index = 0; field_index < type.num_fields(); ++field_index) { - if (!scalar.is_valid || !scalar.value[field_index]) { - RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); - } else { - RETURN_NOT_OK(builder->field_builder(field_index) - ->AppendScalar(*scalar.value[field_index])); + for (int64_t i = 0; i < n_repeats_; i++) { + auto* builder = static_cast(builder_); + for (const auto s : scalars_) { + const auto& scalar = internal::checked_cast(*s); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + if (!scalar.is_valid || !scalar.value[field_index]) { + RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); + } else { + RETURN_NOT_OK(builder->field_builder(field_index) + ->AppendScalar(*scalar.value[field_index])); + } } + RETURN_NOT_OK(builder->Append(scalar.is_valid)); } - RETURN_NOT_OK(builder->Append(scalar.is_valid)); } return Status::OK(); } @@ -183,6 +187,7 @@ struct AppendScalarImpl { Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); } std::vector scalars_; + int64_t n_repeats_; ArrayBuilder* builder_; }; @@ -191,7 +196,15 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar) { return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), " to builder for type ", type()->ToString()); } - return AppendScalarImpl{{&scalar}, this}.Convert(); + return AppendScalarImpl{{&scalar}, /*n_repeats=*/1, this}.Convert(); +} + +Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + return AppendScalarImpl{{&scalar}, n_repeats, this}.Convert(); } Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { @@ -205,7 +218,7 @@ Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { } refs.push_back(scalar.get()); } - return AppendScalarImpl{refs, this}.Convert(); + return AppendScalarImpl{refs, /*n_repeats=*/1, this}.Convert(); } Status ArrayBuilder::Finish(std::shared_ptr* out) { diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index e0ac4d68e1a..8e60c306796 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -117,8 +117,9 @@ class ARROW_EXPORT ArrayBuilder { virtual Status AppendEmptyValues(int64_t length) = 0; /// \brief Append a value from a scalar - virtual Status AppendScalar(const Scalar& scalar); - virtual Status AppendScalars(const ScalarVector& scalars); + Status AppendScalar(const Scalar& scalar); + Status AppendScalar(const Scalar& scalar, int64_t n_repeats); + Status AppendScalars(const ScalarVector& scalars); /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use From f0bd23f7cfec9c7ba5ad55bfba554e2f2420e25f Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 28 Jun 2021 10:29:41 -0400 Subject: [PATCH 3/9] ARROW-11932: [C++] Reserve() in AppendScalar --- cpp/src/arrow/array/builder_base.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 8dbad29008a..584c244d989 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -95,6 +95,7 @@ Status ArrayBuilder::Advance(int64_t elements) { return null_bitmap_builder_.Advance(elements); } +namespace { struct AppendScalarImpl { template ::BuilderType, @@ -190,6 +191,7 @@ struct AppendScalarImpl { int64_t n_repeats_; ArrayBuilder* builder_; }; +} // namespace Status ArrayBuilder::AppendScalar(const Scalar& scalar) { if (!scalar.type->Equals(type())) { @@ -204,11 +206,13 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), " to builder for type ", type()->ToString()); } + RETURN_NOT_OK(Reserve(n_repeats)); return AppendScalarImpl{{&scalar}, n_repeats, this}.Convert(); } Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { if (scalars.empty()) return Status::OK(); + RETURN_NOT_OK(Reserve(scalars.size())); std::vector refs; refs.reserve(scalars.size()); for (const auto& scalar : scalars) { From 8450a4c0cac0383d47d5d07da30496a5aecb6005 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 28 Jun 2021 15:44:55 -0400 Subject: [PATCH 4/9] ARROW-11932: [C++] Don't require an allocation in AppendScalar --- cpp/src/arrow/array/builder_base.cc | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 584c244d989..b127dd15415 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -102,9 +102,10 @@ struct AppendScalarImpl { typename ScalarType = typename TypeTraits::ScalarType> Status UseBuilder(const AppendScalar& append) { for (int64_t i = 0; i < n_repeats_; i++) { - for (const auto scalar : scalars_) { - if (scalar->is_valid) { - RETURN_NOT_OK(append(internal::checked_cast(*scalar), + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if ((*scalar)->is_valid) { + RETURN_NOT_OK(append(internal::checked_cast(**scalar), static_cast(builder_))); } else { RETURN_NOT_OK(builder_->AppendNull()); @@ -165,8 +166,8 @@ struct AppendScalarImpl { Status Visit(const StructType& type) { for (int64_t i = 0; i < n_repeats_; i++) { auto* builder = static_cast(builder_); - for (const auto s : scalars_) { - const auto& scalar = internal::checked_cast(*s); + for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) { + const auto& scalar = internal::checked_cast(**s); for (int field_index = 0; field_index < type.num_fields(); ++field_index) { if (!scalar.is_valid || !scalar.value[field_index]) { RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); @@ -185,9 +186,10 @@ struct AppendScalarImpl { return Status::NotImplemented("AppendScalar for type ", type); } - Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); } + Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); } - std::vector scalars_; + const std::shared_ptr* scalars_begin_; + const std::shared_ptr* scalars_end_; int64_t n_repeats_; ArrayBuilder* builder_; }; @@ -198,7 +200,8 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar) { return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), " to builder for type ", type()->ToString()); } - return AppendScalarImpl{{&scalar}, /*n_repeats=*/1, this}.Convert(); + std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; + return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert(); } Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { @@ -207,22 +210,21 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { " to builder for type ", type()->ToString()); } RETURN_NOT_OK(Reserve(n_repeats)); - return AppendScalarImpl{{&scalar}, n_repeats, this}.Convert(); + std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; + return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert(); } Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { if (scalars.empty()) return Status::OK(); - RETURN_NOT_OK(Reserve(scalars.size())); - std::vector refs; - refs.reserve(scalars.size()); + const auto ty = type(); for (const auto& scalar : scalars) { - if (!scalar->type->Equals(type())) { + if (!scalar->type->Equals(ty)) { return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), " to builder for type ", type()->ToString()); } - refs.push_back(scalar.get()); } - return AppendScalarImpl{refs, /*n_repeats=*/1, this}.Convert(); + return AppendScalarImpl{&*scalars.cbegin(), &*scalars.cend(), /*n_repeats=*/1, this} + .Convert(); } Status ArrayBuilder::Finish(std::shared_ptr* out) { From 3a7037cff5eeec3174c71692a5782f81d567b83a Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 28 Jun 2021 17:04:50 -0400 Subject: [PATCH 5/9] ARROW-11932: [C++] Fix test on MSVC --- cpp/src/arrow/array/builder_base.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index b127dd15415..ab04e099904 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -223,7 +223,8 @@ Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { " to builder for type ", type()->ToString()); } } - return AppendScalarImpl{&*scalars.cbegin(), &*scalars.cend(), /*n_repeats=*/1, this} + return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(), + /*n_repeats=*/1, this} .Convert(); } From af2d7f34804772b0b28e092f6509a0654cf986c7 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 29 Jun 2021 11:04:21 -0400 Subject: [PATCH 6/9] Update cpp/src/arrow/array/builder_base.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/array/builder_base.cc | 50 +++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index ab04e099904..a61f92f0fc2 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -144,20 +144,52 @@ struct AppendScalarImpl { }; template - enable_if_has_c_type Visit(const T&) { - return UseBuilder(AppendValue{}); - } + enable_if_fixed_width Visit(const T&) { + auto builder = checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_))); - template - enable_if_has_string_view Visit(const T&) { - return UseBuilder(AppendBuffer{}); + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + builder->UnsafeAppend(scalar->value); + } else { + builder->UnsafeAppendNull(); + } + } + } + return Status::OK(); } template - enable_if_decimal Visit(const T&) { - return UseBuilder(AppendValue{}); - } + enable_if_base_binary Visit(const T&) { + int64_t data_size = 0; + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + data_size += scalar->value->size(); + } + } + + auto builder = checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_))); + RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size)); + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + builder->UnsafeAppend(util::string_view{*scalar->value}); + } else { + builder->UnsafeAppendNull(); + } + } + } + return Status::OK(); + } template enable_if_list_like Visit(const T&) { return UseBuilder(AppendList{}); From bb62115ed4b66e3e799c507841defd5d57a60045 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 29 Jun 2021 11:31:21 -0400 Subject: [PATCH 7/9] ARROW-11932: [C++] Reserve in more cases --- cpp/src/arrow/array/builder_base.cc | 110 +++++++++++++--------------- 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index a61f92f0fc2..d5d3b4e1288 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -97,61 +97,16 @@ Status ArrayBuilder::Advance(int64_t elements) { namespace { struct AppendScalarImpl { - template ::BuilderType, - typename ScalarType = typename TypeTraits::ScalarType> - Status UseBuilder(const AppendScalar& append) { - for (int64_t i = 0; i < n_repeats_; i++) { - for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; - scalar++) { - if ((*scalar)->is_valid) { - RETURN_NOT_OK(append(internal::checked_cast(**scalar), - static_cast(builder_))); - } else { - RETURN_NOT_OK(builder_->AppendNull()); - } - } - } - return Status::OK(); - } - - struct AppendValue { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - return builder->Append(s.value); - } - }; - - struct AppendBuffer { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - const Buffer& buffer = *s.value; - return builder->Append(util::string_view{buffer}); - } - }; - - struct AppendList { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - RETURN_NOT_OK(builder->Append()); - const Array& list = *s.value; - for (int64_t i = 0; i < list.length(); i++) { - ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); - RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); - } - return Status::OK(); - } - }; - template - enable_if_fixed_width Visit(const T&) { - auto builder = checked_cast::BuilderType*>(builder_); - RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_))); + enable_if_t::value || is_decimal_type::value, Status> Visit(const T&) { + auto builder = internal::checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); for (int64_t i = 0; i < n_repeats_; i++) { for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; raw++) { - auto scalar = checked_cast::ScalarType*>(raw->get()); + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); if (scalar->is_valid) { builder->UnsafeAppend(scalar->value); } else { @@ -163,24 +118,26 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_has_string_view Visit(const T&) { int64_t data_size = 0; for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; - raw++) { - auto scalar = checked_cast::ScalarType*>(raw->get()); + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); if (scalar->is_valid) { data_size += scalar->value->size(); } } - auto builder = checked_cast::BuilderType*>(builder_); - RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_))); + auto builder = internal::checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size)); for (int64_t i = 0; i < n_repeats_; i++) { for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; raw++) { - auto scalar = checked_cast::ScalarType*>(raw->get()); + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); if (scalar->is_valid) { builder->UnsafeAppend(util::string_view{*scalar->value}); } else { @@ -190,14 +147,50 @@ struct AppendScalarImpl { } return Status::OK(); } + + struct AppendList { + template + Status operator()(const ScalarType& s, BuilderType* builder) const { + RETURN_NOT_OK(builder->Append()); + const Array& list = *s.value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + return Status::OK(); + } + }; + template enable_if_list_like Visit(const T&) { - return UseBuilder(AppendList{}); + auto builder = internal::checked_cast::BuilderType*>(builder_); + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if ((*scalar)->is_valid) { + RETURN_NOT_OK(builder->Append()); + const Array& list = + *internal::checked_cast(**scalar).value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } + } + } + return Status::OK(); } Status Visit(const StructType& type) { + auto* builder = internal::checked_cast(builder_); + auto count = n_repeats_ * (scalars_end_ - scalars_begin_); + RETURN_NOT_OK(builder->Reserve(count)); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count)); + } for (int64_t i = 0; i < n_repeats_; i++) { - auto* builder = static_cast(builder_); for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) { const auto& scalar = internal::checked_cast(**s); for (int field_index = 0; field_index < type.num_fields(); ++field_index) { @@ -241,7 +234,6 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), " to builder for type ", type()->ToString()); } - RETURN_NOT_OK(Reserve(n_repeats)); std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert(); } From 4a41940d240d3e0ed37ec0398e04023b03d0b799 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 29 Jun 2021 14:29:44 -0400 Subject: [PATCH 8/9] ARROW-11932: [C++] Handle fixed size binary with other fixed size types --- cpp/src/arrow/array/builder_base.cc | 7 +++++-- cpp/src/arrow/array/builder_binary.h | 12 ++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index d5d3b4e1288..fc1c3a5057a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -98,7 +98,10 @@ Status ArrayBuilder::Advance(int64_t elements) { namespace { struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value, Status> Visit(const T&) { + enable_if_t::value || is_decimal_type::value || + is_fixed_size_binary_type::value, + Status> + Visit(const T&) { auto builder = internal::checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -118,7 +121,7 @@ struct AppendScalarImpl { } template - enable_if_has_string_view Visit(const T&) { + enable_if_base_binary Visit(const T&) { int64_t data_size = 0; for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; raw++) { diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index c1c664a1249..7653eeca5c4 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -467,6 +467,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status Append(const Buffer& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(util::string_view(s)); + return Status::OK(); + } + + Status Append(const std::shared_ptr& s) { return Append(*s); } + template Status Append(const std::array& value) { ARROW_RETURN_NOT_OK(Reserve(1)); @@ -502,6 +510,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); } + + void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); From 2bc15af32daefa053339e41cba2e068a583d1290 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 29 Jun 2021 14:37:13 -0400 Subject: [PATCH 9/9] ARROW-11932: [C++] Clean up implementation for lists --- cpp/src/arrow/array/builder_base.cc | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index fc1c3a5057a..c892e3d664b 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -151,22 +151,18 @@ struct AppendScalarImpl { return Status::OK(); } - struct AppendList { - template - Status operator()(const ScalarType& s, BuilderType* builder) const { - RETURN_NOT_OK(builder->Append()); - const Array& list = *s.value; - for (int64_t i = 0; i < list.length(); i++) { - ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); - RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); - } - return Status::OK(); - } - }; - template enable_if_list_like Visit(const T&) { auto builder = internal::checked_cast::BuilderType*>(builder_); + int64_t num_children = 0; + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if (!(*scalar)->is_valid) continue; + num_children += + internal::checked_cast(**scalar).value->length(); + } + RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_)); + for (int64_t i = 0; i < n_repeats_; i++) { for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; scalar++) {