From 9371743ab2ae29b7fb68780ced2ab5dccbcdd575 Mon Sep 17 00:00:00 2001 From: ZMZ Date: Wed, 9 Jun 2021 10:55:34 +0800 Subject: [PATCH 1/5] add support for take implementation on dense union type --- .../arrow/compute/kernels/vector_selection.cc | 61 +++++++++++++++++++ .../compute/kernels/vector_selection_test.cc | 52 ++++++++++++++++ cpp/src/arrow/type_traits.h | 2 + 3 files changed, 115 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 6376ae10404..50f0be6424c 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -1668,6 +1668,66 @@ struct ListImpl : public Selection, Type> { } }; +struct DenseUnionImpl : public Selection { + using Base = Selection; + LIFT_BASE_MEMBERS(); + + typename TypeTraits::ValueOffsetBuilderType value_offset_builder; + typename TypeTraits::ChildIdBuilderType child_id_builder; + std::vector type_codes; + + DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, + Datum* out) + : Base(ctx, batch, output_length, out), + value_offset_builder(ctx->memory_pool()), + child_id_builder(ctx->memory_pool()) { + DenseUnionArray typed_values(this->values); + type_codes = typed_values.union_type()->type_codes(); + } + + template + Status GenerateOutput() { + DenseUnionArray typed_values(this->values); + Adapter adapter(this); + RETURN_NOT_OK(adapter.Generate( + [&](int64_t index) { + auto child_id = typed_values.child_id(index); + child_id_builder.UnsafeAppend(type_codes[child_id]); + auto value_offset = typed_values.value_offset(index); + value_offset_builder.UnsafeAppend(value_offset); + return Status::OK(); + }, + // TODO: not able to handle null case + VisitNoop)); + return Status::OK(); + } + + Status Init() override { + RETURN_NOT_OK(child_id_builder.Reserve(output_length)); + RETURN_NOT_OK(value_offset_builder.Reserve(output_length)); + return Status::OK(); + } + + Status Finish() override { + std::shared_ptr child_ids; + std::shared_ptr value_offsets; + RETURN_NOT_OK(child_id_builder.Finish(&child_ids)); + RETURN_NOT_OK(value_offset_builder.Finish(&value_offsets)); + + DenseUnionArray typed_values(this->values); + auto num_fields = typed_values.num_fields(); + ArrayVector child_arrays; + child_arrays.reserve(num_fields); + BufferVector buffers = {nullptr, checked_cast(*child_ids).values(), + checked_cast(*value_offsets).values()}; + *out = ArrayData(typed_values.type(), child_ids->length(), std::move(buffers), 0); + for (int i = 0; i < typed_values.num_fields(); i++) { + out->child_data.push_back(typed_values.field(i)->data()); + } + return Status::OK(); + } +}; + struct FSLImpl : public Selection { Int64Builder child_index_builder; @@ -2170,6 +2230,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) { {InputType::Array(Type::LIST), TakeExec>}, {InputType::Array(Type::LARGE_LIST), TakeExec>}, {InputType::Array(Type::FIXED_SIZE_LIST), TakeExec}, + {InputType::Array(Type::DENSE_UNION), TakeExec}, {InputType::Array(Type::STRUCT), TakeExec}, // TODO: Reuse ListType kernel for MAP {InputType::Array(Type::MAP), TakeExec>}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index f428da0fe35..f53efbb6ddc 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -1279,6 +1279,58 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) { struct_type, R"([{"a": 1}, {"a": 2, "b": "hello"}])", "[0, 1, 0]"); } +class TestTakeKernelWithDenseUnion : public TestTakeKernelTyped { + public: + void AssertTake(const std::shared_ptr& type, const std::string& values, + const std::string& indices, const std::string& type_codes, + const std::string& value_offsets) { + auto union_array = + std::static_pointer_cast(ArrayFromJSON(type, values)); + ArrayVector children; + children.reserve(type->num_fields()); + std::vector field_names; + field_names.reserve(type->num_fields()); + for (int i = 0; i < type->num_fields(); i++) { + children.push_back(union_array->field(i)); + field_names.push_back(type->field(i)->name()); + } + ASSIGN_OR_ABORT( + std::shared_ptr expected, + DenseUnionArray::Make(*ArrayFromJSON(int8(), type_codes), + *ArrayFromJSON(int32(), value_offsets), children, + field_names, union_array->union_type()->type_codes())); + + std::shared_ptr actual; + + for (auto index_type : {int8(), uint32()}) { + ASSERT_OK(TakeJSON(type, values, index_type, indices, &actual)); + ASSERT_OK(actual->Validate()); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + } + } +}; + +TEST_F(TestTakeKernelWithDenseUnion, TakeDenseUnion) { + auto union_type = dense_union({field("a", int32()), field("b", utf8())}, {2, 5}); + auto union_json = R"([ + null, + [2, 222], + [5, "hello"], + [5, "eh"], + null, + [2, 111] + ])"; + CheckTake(union_type, union_json, "[]", "[]"); + AssertTake(union_type, union_json, "[3, 1, 3, 1, 3]", "[5, 2, 5, 2, 5]", + "[1, 1, 1, 1, 1]"); + AssertTake(union_type, union_json, "[4, 2, 1]", "[2, 5, 2]", "[2, 0, 1]"); + CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", union_json); + AssertTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", "[2, 2, 5, 5, 2, 2]", + "[0, 1, 0, 1, 2, 3]"); + AssertTake(union_type, union_json, "[0, 2, 2, 2, 2, 2, 2]", "[2, 5, 5, 5, 5, 5, 5]", + "[0, 0, 0, 0, 0, 0, 0]"); +} + class TestTakeKernelWithUnion : public TestTakeKernelTyped {}; // TODO: Restore Union take functionality diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 86664bbb162..8205108c087 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -435,6 +435,8 @@ struct TypeTraits { using ArrayType = DenseUnionArray; using BuilderType = DenseUnionBuilder; using ScalarType = DenseUnionScalar; + using ValueOffsetBuilderType = Int32Builder; + using ChildIdBuilderType = Int8Builder; constexpr static bool is_parameter_free = false; }; From dd359f9c05011fc4624eb82f7ce4412634e50278 Mon Sep 17 00:00:00 2001 From: Zimo Zhang Date: Wed, 7 Jul 2021 22:32:39 +0800 Subject: [PATCH 2/5] update compute take implementation for dense union --- .../arrow/compute/kernels/vector_selection.cc | 39 ++++++--- .../compute/kernels/vector_selection_test.cc | 85 ++++--------------- cpp/src/arrow/type_traits.h | 2 - 3 files changed, 43 insertions(+), 83 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 50f0be6424c..44fd7c5836b 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -1672,9 +1672,10 @@ struct DenseUnionImpl : public Selection { using Base = Selection; LIFT_BASE_MEMBERS(); - typename TypeTraits::ValueOffsetBuilderType value_offset_builder; - typename TypeTraits::ChildIdBuilderType child_id_builder; + Int32Builder value_offset_builder; + Int8Builder child_id_builder; std::vector type_codes; + std::vector> child_indices_builders; DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, Datum* out) @@ -1683,6 +1684,11 @@ struct DenseUnionImpl : public Selection { child_id_builder(ctx->memory_pool()) { DenseUnionArray typed_values(this->values); type_codes = typed_values.union_type()->type_codes(); + child_indices_builders.reserve(type_codes.size()); + for (size_t i = 0; i < type_codes.size(); i++) { + child_indices_builders.push_back( + std::make_shared(ctx->memory_pool())); + } } template @@ -1691,14 +1697,22 @@ struct DenseUnionImpl : public Selection { Adapter adapter(this); RETURN_NOT_OK(adapter.Generate( [&](int64_t index) { - auto child_id = typed_values.child_id(index); + int8_t child_id = typed_values.child_id(index); child_id_builder.UnsafeAppend(type_codes[child_id]); - auto value_offset = typed_values.value_offset(index); - value_offset_builder.UnsafeAppend(value_offset); + int32_t value_offset = typed_values.value_offset(index); + value_offset_builder.UnsafeAppend(child_indices_builders[child_id]->length()); + RETURN_NOT_OK(child_indices_builders[child_id]->Reserve(1)); + child_indices_builders[child_id]->UnsafeAppend(value_offset); return Status::OK(); }, - // TODO: not able to handle null case - VisitNoop)); + [&]() { + int8_t child_id = 0; + child_id_builder.UnsafeAppend(type_codes[child_id]); + value_offset_builder.UnsafeAppend(child_indices_builders[child_id]->length()); + RETURN_NOT_OK(child_indices_builders[child_id]->Reserve(1)); + child_indices_builders[child_id]->UnsafeAppendNull(); + return Status::OK(); + })); return Status::OK(); } @@ -1716,13 +1730,15 @@ struct DenseUnionImpl : public Selection { DenseUnionArray typed_values(this->values); auto num_fields = typed_values.num_fields(); - ArrayVector child_arrays; - child_arrays.reserve(num_fields); BufferVector buffers = {nullptr, checked_cast(*child_ids).values(), checked_cast(*value_offsets).values()}; *out = ArrayData(typed_values.type(), child_ids->length(), std::move(buffers), 0); - for (int i = 0; i < typed_values.num_fields(); i++) { - out->child_data.push_back(typed_values.field(i)->data()); + for (auto i = 0; i < num_fields; i++) { + std::shared_ptr child_indices_array; + RETURN_NOT_OK(child_indices_builders[i]->Finish(&child_indices_array)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr child_array, + Take(*typed_values.field(i), *child_indices_array)); + out->child_data.push_back(child_array->data()); } return Status::OK(); } @@ -2201,6 +2217,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) { {InputType::Array(Type::LIST), FilterExec>}, {InputType::Array(Type::LARGE_LIST), FilterExec>}, {InputType::Array(Type::FIXED_SIZE_LIST), FilterExec}, + {InputType::Array(Type::DENSE_UNION), FilterExec}, {InputType::Array(Type::STRUCT), StructFilter}, // TODO: Reuse ListType kernel for MAP {InputType::Array(Type::MAP), FilterExec>}, diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index f53efbb6ddc..116b24fd659 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -607,10 +607,10 @@ TEST_F(TestFilterKernelWithStruct, FilterStruct) { class TestFilterKernelWithUnion : public TestFilterKernel {}; -TEST_F(TestFilterKernelWithUnion, DISABLED_FilterUnion) { - for (auto union_ : UnionTypeFactories()) { - auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}); - auto union_json = R"([ +// TODO: Restore Sparse Union take functionality +TEST_F(TestFilterKernelWithUnion, FilterUnion) { + auto union_type = dense_union({field("a", int32()), field("b", utf8())}, {2, 5}); + auto union_json = R"([ null, [2, 222], [5, "hello"], @@ -618,20 +618,19 @@ TEST_F(TestFilterKernelWithUnion, DISABLED_FilterUnion) { null, [2, 111] ])"; - this->AssertFilter(union_type, union_json, "[0, 0, 0, 0, 0, 0]", "[]"); - this->AssertFilter(union_type, union_json, "[0, 1, 1, null, 0, 1]", R"([ + this->AssertFilter(union_type, union_json, "[0, 0, 0, 0, 0, 0]", "[]"); + this->AssertFilter(union_type, union_json, "[0, 1, 1, null, 0, 1]", R"([ [2, 222], [5, "hello"], null, [2, 111] ])"); - this->AssertFilter(union_type, union_json, "[1, 0, 1, 0, 1, 0]", R"([ + this->AssertFilter(union_type, union_json, "[1, 0, 1, 0, 1, 0]", R"([ null, [5, "hello"], null ])"); - this->AssertFilter(union_type, union_json, "[1, 1, 1, 1, 1, 1]", union_json); - } + this->AssertFilter(union_type, union_json, "[1, 1, 1, 1, 1, 1]", union_json); } class TestFilterKernelWithRecordBatch : public TestFilterKernel { @@ -1279,38 +1278,10 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) { struct_type, R"([{"a": 1}, {"a": 2, "b": "hello"}])", "[0, 1, 0]"); } -class TestTakeKernelWithDenseUnion : public TestTakeKernelTyped { - public: - void AssertTake(const std::shared_ptr& type, const std::string& values, - const std::string& indices, const std::string& type_codes, - const std::string& value_offsets) { - auto union_array = - std::static_pointer_cast(ArrayFromJSON(type, values)); - ArrayVector children; - children.reserve(type->num_fields()); - std::vector field_names; - field_names.reserve(type->num_fields()); - for (int i = 0; i < type->num_fields(); i++) { - children.push_back(union_array->field(i)); - field_names.push_back(type->field(i)->name()); - } - ASSIGN_OR_ABORT( - std::shared_ptr expected, - DenseUnionArray::Make(*ArrayFromJSON(int8(), type_codes), - *ArrayFromJSON(int32(), value_offsets), children, - field_names, union_array->union_type()->type_codes())); - - std::shared_ptr actual; - - for (auto index_type : {int8(), uint32()}) { - ASSERT_OK(TakeJSON(type, values, index_type, indices, &actual)); - ASSERT_OK(actual->Validate()); - AssertArraysEqual(*expected, *actual, /*verbose=*/true); - } - } -}; +class TestTakeKernelWithUnion : public TestTakeKernelTyped {}; -TEST_F(TestTakeKernelWithDenseUnion, TakeDenseUnion) { +// TODO: Restore Sparse Union take functionality +TEST_F(TestTakeKernelWithUnion, TakeUnion) { auto union_type = dense_union({field("a", int32()), field("b", utf8())}, {2, 5}); auto union_json = R"([ null, @@ -1321,45 +1292,20 @@ TEST_F(TestTakeKernelWithDenseUnion, TakeDenseUnion) { [2, 111] ])"; CheckTake(union_type, union_json, "[]", "[]"); - AssertTake(union_type, union_json, "[3, 1, 3, 1, 3]", "[5, 2, 5, 2, 5]", - "[1, 1, 1, 1, 1]"); - AssertTake(union_type, union_json, "[4, 2, 1]", "[2, 5, 2]", "[2, 0, 1]"); - CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", union_json); - AssertTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", "[2, 2, 5, 5, 2, 2]", - "[0, 1, 0, 1, 2, 3]"); - AssertTake(union_type, union_json, "[0, 2, 2, 2, 2, 2, 2]", "[2, 5, 5, 5, 5, 5, 5]", - "[0, 0, 0, 0, 0, 0, 0]"); -} - -class TestTakeKernelWithUnion : public TestTakeKernelTyped {}; - -// TODO: Restore Union take functionality -TEST_F(TestTakeKernelWithUnion, DISABLED_TakeUnion) { - for (auto union_ : UnionTypeFactories()) { - auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}); - auto union_json = R"([ - null, - [2, 222], - [5, "hello"], - [5, "eh"], - null, - [2, 111] - ])"; - CheckTake(union_type, union_json, "[]", "[]"); - CheckTake(union_type, union_json, "[3, 1, 3, 1, 3]", R"([ + CheckTake(union_type, union_json, "[3, 1, 3, 1, 3]", R"([ [5, "eh"], [2, 222], [5, "eh"], [2, 222], [5, "eh"] ])"); - CheckTake(union_type, union_json, "[4, 2, 1]", R"([ + CheckTake(union_type, union_json, "[4, 2, 1]", R"([ null, [5, "hello"], [2, 222] ])"); - CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", union_json); - CheckTake(union_type, union_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ + CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", union_json); + CheckTake(union_type, union_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ null, [5, "hello"], [5, "hello"], @@ -1368,7 +1314,6 @@ TEST_F(TestTakeKernelWithUnion, DISABLED_TakeUnion) { [5, "hello"], [5, "hello"] ])"); - } } class TestPermutationsWithTake : public TestBase { diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 8205108c087..86664bbb162 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -435,8 +435,6 @@ struct TypeTraits { using ArrayType = DenseUnionArray; using BuilderType = DenseUnionBuilder; using ScalarType = DenseUnionScalar; - using ValueOffsetBuilderType = Int32Builder; - using ChildIdBuilderType = Int8Builder; constexpr static bool is_parameter_free = false; }; From a97374043d1f191882935784eca0d07fde45aadb Mon Sep 17 00:00:00 2001 From: Zimo Zhang Date: Thu, 8 Jul 2021 10:01:48 +0800 Subject: [PATCH 3/5] 1. use typed buffer builder for child id and value offset buffers; 2. update variable declarations --- cpp/src/arrow/array/builder_base.h | 1 + .../arrow/compute/kernels/vector_selection.cc | 62 +++++++++---------- .../compute/kernels/vector_selection_test.cc | 10 ++- 3 files changed, 35 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 15c726241b5..5f447bbad11 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -51,6 +51,7 @@ class ARROW_EXPORT ArrayBuilder { explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {} virtual ~ArrayBuilder() = default; + ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder); /// For nested types. Since the objects are owned by this class instance, we /// skip shared pointers and just return a raw pointer diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 44fd7c5836b..40696df1c76 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -1672,22 +1672,20 @@ struct DenseUnionImpl : public Selection { using Base = Selection; LIFT_BASE_MEMBERS(); - Int32Builder value_offset_builder; - Int8Builder child_id_builder; - std::vector type_codes; - std::vector> child_indices_builders; + TypedBufferBuilder value_offset_buffer_builder_; + TypedBufferBuilder child_id_buffer_builder_; + std::vector type_codes_; + std::vector child_indices_builders_; DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length, Datum* out) : Base(ctx, batch, output_length, out), - value_offset_builder(ctx->memory_pool()), - child_id_builder(ctx->memory_pool()) { - DenseUnionArray typed_values(this->values); - type_codes = typed_values.union_type()->type_codes(); - child_indices_builders.reserve(type_codes.size()); - for (size_t i = 0; i < type_codes.size(); i++) { - child_indices_builders.push_back( - std::make_shared(ctx->memory_pool())); + value_offset_buffer_builder_(ctx->memory_pool()), + child_id_buffer_builder_(ctx->memory_pool()), + type_codes_(checked_cast(*this->values->type).type_codes()), + child_indices_builders_(type_codes_.size()) { + for (auto& child_indices_builder : child_indices_builders_) { + child_indices_builder = Int32Builder(ctx->memory_pool()); } } @@ -1698,44 +1696,44 @@ struct DenseUnionImpl : public Selection { RETURN_NOT_OK(adapter.Generate( [&](int64_t index) { int8_t child_id = typed_values.child_id(index); - child_id_builder.UnsafeAppend(type_codes[child_id]); + child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]); int32_t value_offset = typed_values.value_offset(index); - value_offset_builder.UnsafeAppend(child_indices_builders[child_id]->length()); - RETURN_NOT_OK(child_indices_builders[child_id]->Reserve(1)); - child_indices_builders[child_id]->UnsafeAppend(value_offset); + value_offset_buffer_builder_.UnsafeAppend( + child_indices_builders_[child_id].length()); + RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1)); + child_indices_builders_[child_id].UnsafeAppend(value_offset); return Status::OK(); }, [&]() { int8_t child_id = 0; - child_id_builder.UnsafeAppend(type_codes[child_id]); - value_offset_builder.UnsafeAppend(child_indices_builders[child_id]->length()); - RETURN_NOT_OK(child_indices_builders[child_id]->Reserve(1)); - child_indices_builders[child_id]->UnsafeAppendNull(); + child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]); + value_offset_buffer_builder_.UnsafeAppend( + child_indices_builders_[child_id].length()); + RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1)); + child_indices_builders_[child_id].UnsafeAppendNull(); return Status::OK(); })); return Status::OK(); } Status Init() override { - RETURN_NOT_OK(child_id_builder.Reserve(output_length)); - RETURN_NOT_OK(value_offset_builder.Reserve(output_length)); + RETURN_NOT_OK(child_id_buffer_builder_.Reserve(output_length)); + RETURN_NOT_OK(value_offset_buffer_builder_.Reserve(output_length)); return Status::OK(); } Status Finish() override { - std::shared_ptr child_ids; - std::shared_ptr value_offsets; - RETURN_NOT_OK(child_id_builder.Finish(&child_ids)); - RETURN_NOT_OK(value_offset_builder.Finish(&value_offsets)); - + ARROW_ASSIGN_OR_RAISE(auto child_ids_buffer, child_id_buffer_builder_.Finish()); + ARROW_ASSIGN_OR_RAISE(auto value_offsets_buffer, + value_offset_buffer_builder_.Finish()); DenseUnionArray typed_values(this->values); auto num_fields = typed_values.num_fields(); - BufferVector buffers = {nullptr, checked_cast(*child_ids).values(), - checked_cast(*value_offsets).values()}; - *out = ArrayData(typed_values.type(), child_ids->length(), std::move(buffers), 0); + BufferVector buffers = {nullptr, child_ids_buffer, value_offsets_buffer}; + *out = + ArrayData(typed_values.type(), child_ids_buffer->size(), std::move(buffers), 0); for (auto i = 0; i < num_fields; i++) { - std::shared_ptr child_indices_array; - RETURN_NOT_OK(child_indices_builders[i]->Finish(&child_indices_array)); + ARROW_ASSIGN_OR_RAISE(auto child_indices_array, + child_indices_builders_[i].Finish()); ARROW_ASSIGN_OR_RAISE(std::shared_ptr child_array, Take(*typed_values.field(i), *child_indices_array)); out->child_data.push_back(child_array->data()); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index 116b24fd659..1f35f0f6233 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -607,15 +607,14 @@ TEST_F(TestFilterKernelWithStruct, FilterStruct) { class TestFilterKernelWithUnion : public TestFilterKernel {}; -// TODO: Restore Sparse Union take functionality TEST_F(TestFilterKernelWithUnion, FilterUnion) { auto union_type = dense_union({field("a", int32()), field("b", utf8())}, {2, 5}); auto union_json = R"([ - null, + [2, null], [2, 222], [5, "hello"], [5, "eh"], - null, + [2, null], [2, 111] ])"; this->AssertFilter(union_type, union_json, "[0, 0, 0, 0, 0, 0]", "[]"); @@ -1280,15 +1279,14 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) { class TestTakeKernelWithUnion : public TestTakeKernelTyped {}; -// TODO: Restore Sparse Union take functionality TEST_F(TestTakeKernelWithUnion, TakeUnion) { auto union_type = dense_union({field("a", int32()), field("b", utf8())}, {2, 5}); auto union_json = R"([ - null, + [2, null], [2, 222], [5, "hello"], [5, "eh"], - null, + [2, null], [2, 111] ])"; CheckTake(union_type, union_json, "[]", "[]"); From 49d44e88d1eefe621489e2f1b02bc66d45c92919 Mon Sep 17 00:00:00 2001 From: Zimo Zhang Date: Fri, 9 Jul 2021 09:32:14 +0800 Subject: [PATCH 4/5] update case with null --- .../arrow/compute/kernels/vector_selection.cc | 7 ++-- .../compute/kernels/vector_selection_test.cc | 34 +++++++++++-------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 40696df1c76..f46e432bdcc 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -1728,9 +1728,10 @@ struct DenseUnionImpl : public Selection { value_offset_buffer_builder_.Finish()); DenseUnionArray typed_values(this->values); auto num_fields = typed_values.num_fields(); - BufferVector buffers = {nullptr, child_ids_buffer, value_offsets_buffer}; - *out = - ArrayData(typed_values.type(), child_ids_buffer->size(), std::move(buffers), 0); + auto num_rows = child_ids_buffer->size(); + BufferVector buffers{nullptr, std::move(child_ids_buffer), + std::move(value_offsets_buffer)}; + *out = ArrayData(typed_values.type(), num_rows, std::move(buffers), 0); for (auto i = 0; i < num_fields; i++) { ARROW_ASSIGN_OR_RAISE(auto child_indices_array, child_indices_builders_[i].Finish()); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index 1f35f0f6233..e367d888d00 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -615,21 +615,23 @@ TEST_F(TestFilterKernelWithUnion, FilterUnion) { [5, "hello"], [5, "eh"], [2, null], - [2, 111] + [2, 111], + [5, null] ])"; - this->AssertFilter(union_type, union_json, "[0, 0, 0, 0, 0, 0]", "[]"); - this->AssertFilter(union_type, union_json, "[0, 1, 1, null, 0, 1]", R"([ + this->AssertFilter(union_type, union_json, "[0, 0, 0, 0, 0, 0, 0]", "[]"); + this->AssertFilter(union_type, union_json, "[0, 1, 1, null, 0, 1, 1]", R"([ [2, 222], [5, "hello"], - null, - [2, 111] + [2, null], + [2, 111], + [5, null] ])"); - this->AssertFilter(union_type, union_json, "[1, 0, 1, 0, 1, 0]", R"([ - null, + this->AssertFilter(union_type, union_json, "[1, 0, 1, 0, 1, 0, 0]", R"([ + [2, null], [5, "hello"], - null + [2, null] ])"); - this->AssertFilter(union_type, union_json, "[1, 1, 1, 1, 1, 1]", union_json); + this->AssertFilter(union_type, union_json, "[1, 1, 1, 1, 1, 1, 1]", union_json); } class TestFilterKernelWithRecordBatch : public TestFilterKernel { @@ -1287,7 +1289,8 @@ TEST_F(TestTakeKernelWithUnion, TakeUnion) { [5, "hello"], [5, "eh"], [2, null], - [2, 111] + [2, 111], + [5, null] ])"; CheckTake(union_type, union_json, "[]", "[]"); CheckTake(union_type, union_json, "[3, 1, 3, 1, 3]", R"([ @@ -1297,14 +1300,15 @@ TEST_F(TestTakeKernelWithUnion, TakeUnion) { [2, 222], [5, "eh"] ])"); - CheckTake(union_type, union_json, "[4, 2, 1]", R"([ - null, + CheckTake(union_type, union_json, "[4, 2, 1, 6]", R"([ + [2, null], [5, "hello"], - [2, 222] + [2, 222], + [5, null] ])"); - CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5]", union_json); + CheckTake(union_type, union_json, "[0, 1, 2, 3, 4, 5, 6]", union_json); CheckTake(union_type, union_json, "[0, 2, 2, 2, 2, 2, 2]", R"([ - null, + [2, null], [5, "hello"], [5, "hello"], [5, "hello"], From b26cf65ba16fd231c8c30ba5cc0d3a2d648282ed Mon Sep 17 00:00:00 2001 From: Zimo Zhang Date: Sat, 10 Jul 2021 09:12:07 +0800 Subject: [PATCH 5/5] use explicit cast with builder append --- cpp/src/arrow/compute/kernels/vector_selection.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index f46e432bdcc..5845a7ee2d0 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -1699,7 +1699,7 @@ struct DenseUnionImpl : public Selection { child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]); int32_t value_offset = typed_values.value_offset(index); value_offset_buffer_builder_.UnsafeAppend( - child_indices_builders_[child_id].length()); + static_cast(child_indices_builders_[child_id].length())); RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1)); child_indices_builders_[child_id].UnsafeAppend(value_offset); return Status::OK(); @@ -1708,7 +1708,7 @@ struct DenseUnionImpl : public Selection { int8_t child_id = 0; child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]); value_offset_buffer_builder_.UnsafeAppend( - child_indices_builders_[child_id].length()); + static_cast(child_indices_builders_[child_id].length())); RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1)); child_indices_builders_[child_id].UnsafeAppendNull(); return Status::OK();