From 099c6458c62f98ea48cef3e5518393861f76058a Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 2 Dec 2021 21:55:04 +0100 Subject: [PATCH 1/6] Support concatenation of UnionArrays --- cpp/src/arrow/array/array_nested.cc | 4 -- cpp/src/arrow/array/concatenate.cc | 82 ++++++++++++++++++++++++- cpp/src/arrow/array/concatenate_test.cc | 72 ++++++++++++++++++---- cpp/src/arrow/testing/random_test.cc | 3 - 4 files changed, 142 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index c80273e9adf..bb0361d58f7 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -708,10 +708,6 @@ DenseUnionArray::DenseUnionArray(std::shared_ptr type, int64_t length, Result> DenseUnionArray::Make( const Array& type_ids, const Array& value_offsets, ArrayVector children, std::vector field_names, std::vector type_codes) { - if (value_offsets.length() == 0) { - return Status::Invalid("UnionArray offsets must have non-zero length"); - } - if (value_offsets.type_id() != Type::INT32) { return Status::TypeError("UnionArray offsets must be signed int32"); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 0631fe6915d..65af44c20f7 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -26,6 +26,7 @@ #include #include "arrow/array/array_base.h" +#include "arrow/array/builder_primitive.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -349,7 +350,86 @@ class ConcatenateImpl { } Status Visit(const UnionType& u) { - return Status::NotImplemented("concatenation of ", u); + // This implementation assumes that all input arrays are valid union arrays + // with same number of variants. + + // Concatenate the type buffers. + ARROW_ASSIGN_OR_RAISE(auto type_buffers, Buffers(1, sizeof(int8_t))); + RETURN_NOT_OK(ConcatenateBuffers(type_buffers, pool_).Value(&out_->buffers[1])); + + // Concatenate the child data. For sparse unions the child data is sliced + // based on the offset and length of the array data. For dense unions the + // child data is not sliced because this makes constructing the concatenated + // offsets buffer more simple. We could however choose to modify this and + // slice the child arrays and reflect this in the concatenated offsets + // buffer. + switch (u.mode()) { + case UnionMode::SPARSE: { + for (int i = 0; i < u.num_fields(); i++) { + ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i)); + RETURN_NOT_OK( + ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + } + break; + } + case UnionMode::DENSE: { + for (int i = 0; i < u.num_fields(); i++) { + ArrayDataVector child_data(in_.size()); + for (size_t j = 0; j < in_.size(); j++) { + child_data[j] = in_[j]->child_data[i]; + } + RETURN_NOT_OK( + ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + } + break; + } + } + + // Concatenate offsets buffers for dense union arrays. + if (u.mode() == UnionMode::DENSE) { + // The number of offset values is equal to the number of type_ids in the + // concatenated type buffers. + TypedBufferBuilder builder; + RETURN_NOT_OK(builder.Reserve(out_->length)); + + // Initialize a vector for child array lengths. These are updated during + // iteration over the input arrays to track the concatenated child array + // lengths. These lengths are used as offsets for the concatenated offsets + // buffer. + std::vector offset_map(u.num_fields()); + + // Iterate over all input arrays. + for (size_t i = 0; i < in_.size(); i++) { + // Get sliced type ids and offsets. + auto type_ids = in_[i]->GetValues(1); + auto offset_values = in_[i]->GetValues(2); + + // Iterate over all elements in the type buffer and append the updated + // offset to the concatenated offsets buffer. + for (auto j = 0; j < in_[i]->length; j++) { + int32_t offset; + if (internal::AddWithOverflow(offset_map[type_ids[j]], offset_values[j], + &offset)) { + return Status::Invalid("Offset value overflow when concatenating arrays"); + } + RETURN_NOT_OK(builder.Append(offset)); + } + + // Increment the offsets in the offset map for the next iteration. + for (int j = 0; j < u.num_fields(); j++) { + int32_t length; + if (internal::AddWithOverflow(offset_map[j], in_[i]->child_data[j]->length, + &length)) { + return Status::Invalid("Offset value overflow when concatenating arrays"); + } + offset_map[j] = length; + } + } + + ARROW_ASSIGN_OR_RAISE(out_->buffers[2], builder.Finish()); + } + + return Status::OK(); } Status Visit(const ExtensionType& e) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 218a40ea066..96a5eb00fc4 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -364,27 +364,77 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { AssertArraysEqual(*expected, *concat_actual); } -TEST_F(ConcatenateTest, DISABLED_UnionType) { +TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); + auto foo = this->GeneratePrimitive(size, 0); auto bar = this->GeneratePrimitive(size, null_probability); auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, null_probability); + auto type_ids = rng_.Numeric(size, 0, 2, 0); ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz})); }); // dense mode - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, null_probability); - auto value_offsets = rng_.Numeric(size, 0, size, 0); - ASSERT_OK_AND_ASSIGN( - *out, DenseUnionArray::Make(*type_ids, *value_offsets, {foo, bar, baz})); + Check([this](int32_t size, double null_probabilities, std::shared_ptr* out) { + *out = rng_.ArrayOf(dense_union({ + field("a", uint32()), + field("b", boolean()), + field("c", int8()), + }), + size, null_probabilities); }); } +TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { + auto type_ids = ArrayFromJSON(int8(), "[0]"); + auto offsets = ArrayFromJSON(int32(), "[2147483646]"); + auto child_array = ArrayFromJSON(uint8(), "[0, 1]"); + ASSERT_OK_AND_ASSIGN(auto array, + DenseUnionArray::Make(*type_ids, *offsets, {child_array})); + ArrayVector arrays({array, array}); + ASSERT_RAISES(Invalid, Concatenate(arrays, default_memory_pool())); +} + +TEST_F(ConcatenateTest, DenseUnionType) { + auto type_ids = ArrayFromJSON(int8(), "[0, 0, 1, 1, 0, 1]"); + auto offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2]"); + auto child_one = ArrayFromJSON(boolean(), "[true, true, false]"); + auto child_two = ArrayFromJSON(int8(), "[1, 2, 3]"); + ASSERT_OK_AND_ASSIGN( + auto array, DenseUnionArray::Make(*type_ids, *offsets, {child_one, child_two})); + ASSERT_OK(array->ValidateFull()); + + ArrayVector arrays({array, array}); + ASSERT_OK_AND_ASSIGN(auto concat_arrays, Concatenate(arrays, default_memory_pool())); + ASSERT_OK(concat_arrays->ValidateFull()); + + auto type_ids_expected = ArrayFromJSON(int8(), "[0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1]"); + auto offsets_expected = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 3, 4, 3, 4, 5, 5]"); + auto child_one_expected = + ArrayFromJSON(boolean(), "[true, true, false, true, true, false]"); + auto child_two_expected = ArrayFromJSON(int8(), "[1, 2, 3, 1, 2, 3]"); + ASSERT_OK_AND_ASSIGN(auto expected, + DenseUnionArray::Make(*type_ids_expected, *offsets_expected, + {child_one_expected, child_two_expected})); + ASSERT_OK(expected->ValidateFull()); + AssertArraysEqual(*expected, *concat_arrays); + + ArrayVector sliced_arrays({array->Slice(1, 4), array}); + ASSERT_OK_AND_ASSIGN(auto concat_sliced_arrays, + Concatenate(sliced_arrays, default_memory_pool())); + ASSERT_OK(concat_sliced_arrays->ValidateFull()); + + auto type_ids_sliced = ArrayFromJSON(int8(), "[0, 1, 1, 0, 0, 0, 1, 1, 0, 1]"); + auto offsets_sliced = ArrayFromJSON(int32(), "[1, 0, 1, 2, 3, 4, 3, 4, 5, 5]"); + auto child_one_sliced = + ArrayFromJSON(boolean(), "[true, true, false, true, true, false]"); + auto child_two_sliced = ArrayFromJSON(int8(), "[1, 2, 3, 1, 2, 3]"); + ASSERT_OK_AND_ASSIGN(auto expected_sliced, + DenseUnionArray::Make(*type_ids_sliced, *offsets_sliced, + {child_one_sliced, child_two_sliced})); + ASSERT_OK(expected_sliced->ValidateFull()); + AssertArraysEqual(*expected_sliced, *concat_sliced_arrays); +} + TEST_F(ConcatenateTest, OffsetOverflow) { auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); fake_long->data()->GetMutableValues(1)[1] = diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 6b20d821d48..261998d4590 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -66,9 +66,6 @@ TEST_P(RandomArrayTest, GenerateBatch) { TEST_P(RandomArrayTest, GenerateZeroLengthArray) { auto field = GetField(); - if (field->type()->id() == Type::type::DENSE_UNION) { - GTEST_SKIP() << "Cannot generate zero-length dense union arrays"; - } auto array = GenerateArray(*field, 0, 0xDEADBEEF); AssertTypeEqual(field->type(), array->type()); ASSERT_EQ(0, array->length()); From 15204d4ed5f762b4f1bbd92747ee70aada93b7cd Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 3 Dec 2021 11:18:26 +0100 Subject: [PATCH 2/6] Handle child array length overflow --- cpp/src/arrow/array/concatenate.cc | 12 ++++++++---- cpp/src/arrow/array/concatenate_test.cc | 12 ++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 65af44c20f7..f45c0abe8a9 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -417,12 +417,16 @@ class ConcatenateImpl { // Increment the offsets in the offset map for the next iteration. for (int j = 0; j < u.num_fields(); j++) { - int32_t length; - if (internal::AddWithOverflow(offset_map[j], in_[i]->child_data[j]->length, - &length)) { + int64_t length; + if (internal::AddWithOverflow(static_cast(offset_map[j]), + in_[i]->child_data[j]->length, &length)) { return Status::Invalid("Offset value overflow when concatenating arrays"); } - offset_map[j] = length; + // Make sure we can safely downcast to int32_t. + if (length > std::numeric_limits::max()) { + return Status::Invalid("Length overflow when concatenating arrays"); + } + offset_map[j] = static_cast(length); } } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 96a5eb00fc4..0d765b10bfe 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -385,6 +385,7 @@ TEST_F(ConcatenateTest, UnionType) { } TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { + // Offset overflow auto type_ids = ArrayFromJSON(int8(), "[0]"); auto offsets = ArrayFromJSON(int32(), "[2147483646]"); auto child_array = ArrayFromJSON(uint8(), "[0, 1]"); @@ -392,6 +393,17 @@ TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { DenseUnionArray::Make(*type_ids, *offsets, {child_array})); ArrayVector arrays({array, array}); ASSERT_RAISES(Invalid, Concatenate(arrays, default_memory_pool())); + + // Length overflow + auto type_ids_ok = ArrayFromJSON(int8(), "[0]"); + auto offsets_ok = ArrayFromJSON(int32(), "[0]"); + auto child_array_overflow = + this->rng_.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); + ASSERT_OK_AND_ASSIGN( + auto array_overflow, + DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow})); + ArrayVector arrays_overflow({array_overflow, array_overflow}); + ASSERT_RAISES(Invalid, Concatenate(arrays_overflow, default_memory_pool())); } TEST_F(ConcatenateTest, DenseUnionType) { From 4473426bda99166d7a5c01c10be945d387fd3c65 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 6 Dec 2021 10:12:14 +0100 Subject: [PATCH 3/6] Use ArrayFromJSON for dense union concatenate test This makes it more readable. Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/array/concatenate_test.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0d765b10bfe..8db087775bf 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -435,16 +435,18 @@ TEST_F(ConcatenateTest, DenseUnionType) { Concatenate(sliced_arrays, default_memory_pool())); ASSERT_OK(concat_sliced_arrays->ValidateFull()); - auto type_ids_sliced = ArrayFromJSON(int8(), "[0, 1, 1, 0, 0, 0, 1, 1, 0, 1]"); - auto offsets_sliced = ArrayFromJSON(int32(), "[1, 0, 1, 2, 3, 4, 3, 4, 5, 5]"); - auto child_one_sliced = - ArrayFromJSON(boolean(), "[true, true, false, true, true, false]"); - auto child_two_sliced = ArrayFromJSON(int8(), "[1, 2, 3, 1, 2, 3]"); - ASSERT_OK_AND_ASSIGN(auto expected_sliced, - DenseUnionArray::Make(*type_ids_sliced, *offsets_sliced, - {child_one_sliced, child_two_sliced})); - ASSERT_OK(expected_sliced->ValidateFull()); - AssertArraysEqual(*expected_sliced, *concat_sliced_arrays); + AssertArraysEqual(*ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + [0, true], + [1, 1], + [1, 2], + [0, true], + [0, false], + [0, true], + [1, 3], + [1, 1], + [0, true], + [1, 3] + ])"), *concat_sliced_arrays); } TEST_F(ConcatenateTest, OffsetOverflow) { From 57ce777aafce0df0350741bf38aa95a6d3e6f39d Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 6 Dec 2021 10:13:22 +0100 Subject: [PATCH 4/6] Use ArrayOf for sparse union array concatenate test --- cpp/src/arrow/array/concatenate_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 8db087775bf..3e0c4cc9393 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -367,20 +367,20 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, 0); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, 0); - ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz})); + *out = rng_.ArrayOf(sparse_union({ + field("a", float64()), + field("b", boolean()), + }), + size, null_probability); }); // dense mode - Check([this](int32_t size, double null_probabilities, std::shared_ptr* out) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = rng_.ArrayOf(dense_union({ field("a", uint32()), field("b", boolean()), field("c", int8()), }), - size, null_probabilities); + size, null_probability); }); } From cff97cb43ba87263d3a3756dd0e63ae889b826c5 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 6 Dec 2021 10:39:34 +0100 Subject: [PATCH 5/6] Add more tests for concatenation of dense union arrays --- cpp/src/arrow/array/concatenate_test.cc | 106 ++++++++++++++++++------ 1 file changed, 80 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 3e0c4cc9393..c2d398bce4e 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -407,46 +407,100 @@ TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { } TEST_F(ConcatenateTest, DenseUnionType) { - auto type_ids = ArrayFromJSON(int8(), "[0, 0, 1, 1, 0, 1]"); - auto offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2]"); - auto child_one = ArrayFromJSON(boolean(), "[true, true, false]"); - auto child_two = ArrayFromJSON(int8(), "[1, 2, 3]"); - ASSERT_OK_AND_ASSIGN( - auto array, DenseUnionArray::Make(*type_ids, *offsets, {child_one, child_two})); + auto array = ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3] + ])"); ASSERT_OK(array->ValidateFull()); - ArrayVector arrays({array, array}); - ASSERT_OK_AND_ASSIGN(auto concat_arrays, Concatenate(arrays, default_memory_pool())); + // Test concatenation of an unsliced array. + ASSERT_OK_AND_ASSIGN(auto concat_arrays, + Concatenate({array, array}, default_memory_pool())); ASSERT_OK(concat_arrays->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3], + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3] + ])"), + *concat_arrays); - auto type_ids_expected = ArrayFromJSON(int8(), "[0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1]"); - auto offsets_expected = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 3, 4, 3, 4, 5, 5]"); - auto child_one_expected = - ArrayFromJSON(boolean(), "[true, true, false, true, true, false]"); - auto child_two_expected = ArrayFromJSON(int8(), "[1, 2, 3, 1, 2, 3]"); - ASSERT_OK_AND_ASSIGN(auto expected, - DenseUnionArray::Make(*type_ids_expected, *offsets_expected, - {child_one_expected, child_two_expected})); - ASSERT_OK(expected->ValidateFull()); - AssertArraysEqual(*expected, *concat_arrays); - - ArrayVector sliced_arrays({array->Slice(1, 4), array}); + // Test concatenation of a sliced array with an unsliced array. ASSERT_OK_AND_ASSIGN(auto concat_sliced_arrays, - Concatenate(sliced_arrays, default_memory_pool())); + Concatenate({array->Slice(1, 4), array}, default_memory_pool())); ASSERT_OK(concat_sliced_arrays->ValidateFull()); - - AssertArraysEqual(*ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ [0, true], [1, 1], [1, 2], + [0, false], + [0, true], [0, true], + [1, 1], + [1, 2], [0, false], + [1, 3] + ])"), + *concat_sliced_arrays); + + // Test concatenation of an unsliced array, but whose children are sliced. + auto type_ids = ArrayFromJSON(int8(), "[1, 1, 0, 0, 0]"); + auto offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2]"); + auto child_one = + ArrayFromJSON(boolean(), "[false, true, true, true, false, false, false]"); + auto child_two = ArrayFromJSON(int8(), "[0, 1, 1, 0, 0, 0, 0]"); + ASSERT_OK_AND_ASSIGN( + auto array_sliced_children, + DenseUnionArray::Make(*type_ids, *offsets, + {child_one->Slice(1, 3), child_two->Slice(1, 2)})); + ASSERT_OK(array_sliced_children->ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto concat_sliced_children, + Concatenate({array_sliced_children, array_sliced_children}, default_memory_pool())); + ASSERT_OK(concat_sliced_children->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("0", boolean()), field("1", int8())}), R"([ + [1, 1], + [1, 1], + [0, true], + [0, true], [0, true], - [1, 3], + [1, 1], [1, 1], [0, true], - [1, 3] - ])"), *concat_sliced_arrays); + [0, true], + [0, true] + ])"), + *concat_sliced_children); + + // Test concatenation of a sliced array, whose children also have an offset. + ASSERT_OK_AND_ASSIGN(auto concat_sliced_array_sliced_children, + Concatenate({array_sliced_children->Slice(1, 1), + array_sliced_children->Slice(2, 3)}, + default_memory_pool())); + ASSERT_OK(concat_sliced_array_sliced_children->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("0", boolean()), field("1", int8())}), R"([ + [1, 1], + [0, true], + [0, true], + [0, true] + ])"), + *concat_sliced_array_sliced_children); } TEST_F(ConcatenateTest, OffsetOverflow) { From a269df9c402d6dcc7e1a618dbfdda70849702585 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 6 Dec 2021 10:53:42 +0100 Subject: [PATCH 6/6] Handle concatenation of dense union arrays with type codes other than 0..n --- cpp/src/arrow/array/concatenate.cc | 4 ++-- cpp/src/arrow/array/concatenate_test.cc | 26 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f45c0abe8a9..f7720bd9dd0 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -408,8 +408,8 @@ class ConcatenateImpl { // offset to the concatenated offsets buffer. for (auto j = 0; j < in_[i]->length; j++) { int32_t offset; - if (internal::AddWithOverflow(offset_map[type_ids[j]], offset_values[j], - &offset)) { + if (internal::AddWithOverflow(offset_map[u.child_ids()[type_ids[j]]], + offset_values[j], &offset)) { return Status::Invalid("Offset value overflow when concatenating arrays"); } RETURN_NOT_OK(builder.Append(offset)); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index c2d398bce4e..cf385da44a2 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -501,6 +501,32 @@ TEST_F(ConcatenateTest, DenseUnionType) { [0, true] ])"), *concat_sliced_array_sliced_children); + + // Test concatenation of dense union array with types codes other than 0..n + auto array_type_codes = + ArrayFromJSON(dense_union({field("", int32()), field("", utf8())}, {2, 5}), R"([ + [2, 42], + [5, "Hello world!"], + [2, 42] + ])"); + ASSERT_OK(array_type_codes->ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto concat_array_type_codes, + Concatenate({array_type_codes, array_type_codes, array_type_codes->Slice(1, 1)}, + default_memory_pool())); + ASSERT_OK(concat_array_type_codes->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("", int32()), field("", utf8())}, {2, 5}), + R"([ + [2, 42], + [5, "Hello world!"], + [2, 42], + [2, 42], + [5, "Hello world!"], + [2, 42], + [5, "Hello world!"] + ])"), + *concat_array_type_codes); } TEST_F(ConcatenateTest, OffsetOverflow) {