diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index c80273e9adf..bb0361d58f7 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -708,10 +708,6 @@ DenseUnionArray::DenseUnionArray(std::shared_ptr type, int64_t length, Result> DenseUnionArray::Make( const Array& type_ids, const Array& value_offsets, ArrayVector children, std::vector field_names, std::vector type_codes) { - if (value_offsets.length() == 0) { - return Status::Invalid("UnionArray offsets must have non-zero length"); - } - if (value_offsets.type_id() != Type::INT32) { return Status::TypeError("UnionArray offsets must be signed int32"); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 0631fe6915d..f7720bd9dd0 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -26,6 +26,7 @@ #include #include "arrow/array/array_base.h" +#include "arrow/array/builder_primitive.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -349,7 +350,90 @@ class ConcatenateImpl { } Status Visit(const UnionType& u) { - return Status::NotImplemented("concatenation of ", u); + // This implementation assumes that all input arrays are valid union arrays + // with same number of variants. + + // Concatenate the type buffers. + ARROW_ASSIGN_OR_RAISE(auto type_buffers, Buffers(1, sizeof(int8_t))); + RETURN_NOT_OK(ConcatenateBuffers(type_buffers, pool_).Value(&out_->buffers[1])); + + // Concatenate the child data. For sparse unions the child data is sliced + // based on the offset and length of the array data. For dense unions the + // child data is not sliced because this makes constructing the concatenated + // offsets buffer more simple. We could however choose to modify this and + // slice the child arrays and reflect this in the concatenated offsets + // buffer. + switch (u.mode()) { + case UnionMode::SPARSE: { + for (int i = 0; i < u.num_fields(); i++) { + ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i)); + RETURN_NOT_OK( + ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + } + break; + } + case UnionMode::DENSE: { + for (int i = 0; i < u.num_fields(); i++) { + ArrayDataVector child_data(in_.size()); + for (size_t j = 0; j < in_.size(); j++) { + child_data[j] = in_[j]->child_data[i]; + } + RETURN_NOT_OK( + ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + } + break; + } + } + + // Concatenate offsets buffers for dense union arrays. + if (u.mode() == UnionMode::DENSE) { + // The number of offset values is equal to the number of type_ids in the + // concatenated type buffers. + TypedBufferBuilder builder; + RETURN_NOT_OK(builder.Reserve(out_->length)); + + // Initialize a vector for child array lengths. These are updated during + // iteration over the input arrays to track the concatenated child array + // lengths. These lengths are used as offsets for the concatenated offsets + // buffer. + std::vector offset_map(u.num_fields()); + + // Iterate over all input arrays. + for (size_t i = 0; i < in_.size(); i++) { + // Get sliced type ids and offsets. + auto type_ids = in_[i]->GetValues(1); + auto offset_values = in_[i]->GetValues(2); + + // Iterate over all elements in the type buffer and append the updated + // offset to the concatenated offsets buffer. + for (auto j = 0; j < in_[i]->length; j++) { + int32_t offset; + if (internal::AddWithOverflow(offset_map[u.child_ids()[type_ids[j]]], + offset_values[j], &offset)) { + return Status::Invalid("Offset value overflow when concatenating arrays"); + } + RETURN_NOT_OK(builder.Append(offset)); + } + + // Increment the offsets in the offset map for the next iteration. + for (int j = 0; j < u.num_fields(); j++) { + int64_t length; + if (internal::AddWithOverflow(static_cast(offset_map[j]), + in_[i]->child_data[j]->length, &length)) { + return Status::Invalid("Offset value overflow when concatenating arrays"); + } + // Make sure we can safely downcast to int32_t. + if (length > std::numeric_limits::max()) { + return Status::Invalid("Length overflow when concatenating arrays"); + } + offset_map[j] = static_cast(length); + } + } + + ARROW_ASSIGN_OR_RAISE(out_->buffers[2], builder.Finish()); + } + + return Status::OK(); } Status Visit(const ExtensionType& e) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 218a40ea066..cf385da44a2 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -364,27 +364,171 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { AssertArraysEqual(*expected, *concat_actual); } -TEST_F(ConcatenateTest, DISABLED_UnionType) { +TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, null_probability); - ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz})); + *out = rng_.ArrayOf(sparse_union({ + field("a", float64()), + field("b", boolean()), + }), + size, null_probability); }); // dense mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, null_probability); - auto value_offsets = rng_.Numeric(size, 0, size, 0); - ASSERT_OK_AND_ASSIGN( - *out, DenseUnionArray::Make(*type_ids, *value_offsets, {foo, bar, baz})); + *out = rng_.ArrayOf(dense_union({ + field("a", uint32()), + field("b", boolean()), + field("c", int8()), + }), + size, null_probability); }); } +TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { + // Offset overflow + auto type_ids = ArrayFromJSON(int8(), "[0]"); + auto offsets = ArrayFromJSON(int32(), "[2147483646]"); + auto child_array = ArrayFromJSON(uint8(), "[0, 1]"); + ASSERT_OK_AND_ASSIGN(auto array, + DenseUnionArray::Make(*type_ids, *offsets, {child_array})); + ArrayVector arrays({array, array}); + ASSERT_RAISES(Invalid, Concatenate(arrays, default_memory_pool())); + + // Length overflow + auto type_ids_ok = ArrayFromJSON(int8(), "[0]"); + auto offsets_ok = ArrayFromJSON(int32(), "[0]"); + auto child_array_overflow = + this->rng_.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); + ASSERT_OK_AND_ASSIGN( + auto array_overflow, + DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow})); + ArrayVector arrays_overflow({array_overflow, array_overflow}); + ASSERT_RAISES(Invalid, Concatenate(arrays_overflow, default_memory_pool())); +} + +TEST_F(ConcatenateTest, DenseUnionType) { + auto array = ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3] + ])"); + ASSERT_OK(array->ValidateFull()); + + // Test concatenation of an unsliced array. + ASSERT_OK_AND_ASSIGN(auto concat_arrays, + Concatenate({array, array}, default_memory_pool())); + ASSERT_OK(concat_arrays->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3], + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3] + ])"), + *concat_arrays); + + // Test concatenation of a sliced array with an unsliced array. + ASSERT_OK_AND_ASSIGN(auto concat_sliced_arrays, + Concatenate({array->Slice(1, 4), array}, default_memory_pool())); + ASSERT_OK(concat_sliced_arrays->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([ + [0, true], + [1, 1], + [1, 2], + [0, false], + [0, true], + [0, true], + [1, 1], + [1, 2], + [0, false], + [1, 3] + ])"), + *concat_sliced_arrays); + + // Test concatenation of an unsliced array, but whose children are sliced. + auto type_ids = ArrayFromJSON(int8(), "[1, 1, 0, 0, 0]"); + auto offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2]"); + auto child_one = + ArrayFromJSON(boolean(), "[false, true, true, true, false, false, false]"); + auto child_two = ArrayFromJSON(int8(), "[0, 1, 1, 0, 0, 0, 0]"); + ASSERT_OK_AND_ASSIGN( + auto array_sliced_children, + DenseUnionArray::Make(*type_ids, *offsets, + {child_one->Slice(1, 3), child_two->Slice(1, 2)})); + ASSERT_OK(array_sliced_children->ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto concat_sliced_children, + Concatenate({array_sliced_children, array_sliced_children}, default_memory_pool())); + ASSERT_OK(concat_sliced_children->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("0", boolean()), field("1", int8())}), R"([ + [1, 1], + [1, 1], + [0, true], + [0, true], + [0, true], + [1, 1], + [1, 1], + [0, true], + [0, true], + [0, true] + ])"), + *concat_sliced_children); + + // Test concatenation of a sliced array, whose children also have an offset. + ASSERT_OK_AND_ASSIGN(auto concat_sliced_array_sliced_children, + Concatenate({array_sliced_children->Slice(1, 1), + array_sliced_children->Slice(2, 3)}, + default_memory_pool())); + ASSERT_OK(concat_sliced_array_sliced_children->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("0", boolean()), field("1", int8())}), R"([ + [1, 1], + [0, true], + [0, true], + [0, true] + ])"), + *concat_sliced_array_sliced_children); + + // Test concatenation of dense union array with types codes other than 0..n + auto array_type_codes = + ArrayFromJSON(dense_union({field("", int32()), field("", utf8())}, {2, 5}), R"([ + [2, 42], + [5, "Hello world!"], + [2, 42] + ])"); + ASSERT_OK(array_type_codes->ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto concat_array_type_codes, + Concatenate({array_type_codes, array_type_codes, array_type_codes->Slice(1, 1)}, + default_memory_pool())); + ASSERT_OK(concat_array_type_codes->ValidateFull()); + AssertArraysEqual( + *ArrayFromJSON(dense_union({field("", int32()), field("", utf8())}, {2, 5}), + R"([ + [2, 42], + [5, "Hello world!"], + [2, 42], + [2, 42], + [5, "Hello world!"], + [2, 42], + [5, "Hello world!"] + ])"), + *concat_array_type_codes); +} + TEST_F(ConcatenateTest, OffsetOverflow) { auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); fake_long->data()->GetMutableValues(1)[1] = diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 6b20d821d48..261998d4590 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -66,9 +66,6 @@ TEST_P(RandomArrayTest, GenerateBatch) { TEST_P(RandomArrayTest, GenerateZeroLengthArray) { auto field = GetField(); - if (field->type()->id() == Type::type::DENSE_UNION) { - GTEST_SKIP() << "Cannot generate zero-length dense union arrays"; - } auto array = GenerateArray(*field, 0, 0xDEADBEEF); AssertTypeEqual(field->type(), array->type()); ASSERT_EQ(0, array->length());