Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions cpp/src/arrow/array/array_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -708,10 +708,6 @@ DenseUnionArray::DenseUnionArray(std::shared_ptr<DataType> type, int64_t length,
Result<std::shared_ptr<Array>> DenseUnionArray::Make(
const Array& type_ids, const Array& value_offsets, ArrayVector children,
std::vector<std::string> field_names, std::vector<type_code_t> type_codes) {
if (value_offsets.length() == 0) {
return Status::Invalid("UnionArray offsets must have non-zero length");
}

if (value_offsets.type_id() != Type::INT32) {
return Status::TypeError("UnionArray offsets must be signed int32");
}
Expand Down
86 changes: 85 additions & 1 deletion cpp/src/arrow/array/concatenate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
Expand Down Expand Up @@ -349,7 +350,90 @@ class ConcatenateImpl {
}

Status Visit(const UnionType& u) {
return Status::NotImplemented("concatenation of ", u);
// This implementation assumes that all input arrays are valid union arrays
// with same number of variants.

// Concatenate the type buffers.
ARROW_ASSIGN_OR_RAISE(auto type_buffers, Buffers(1, sizeof(int8_t)));
RETURN_NOT_OK(ConcatenateBuffers(type_buffers, pool_).Value(&out_->buffers[1]));

// Concatenate the child data. For sparse unions the child data is sliced
// based on the offset and length of the array data. For dense unions the
// child data is not sliced because this makes constructing the concatenated
// offsets buffer more simple. We could however choose to modify this and
// slice the child arrays and reflect this in the concatenated offsets
// buffer.
switch (u.mode()) {
case UnionMode::SPARSE: {
for (int i = 0; i < u.num_fields(); i++) {
ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
RETURN_NOT_OK(
ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
}
break;
}
case UnionMode::DENSE: {
for (int i = 0; i < u.num_fields(); i++) {
ArrayDataVector child_data(in_.size());
for (size_t j = 0; j < in_.size(); j++) {
child_data[j] = in_[j]->child_data[i];
}
RETURN_NOT_OK(
ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
}
break;
}
}

// Concatenate offsets buffers for dense union arrays.
if (u.mode() == UnionMode::DENSE) {
// The number of offset values is equal to the number of type_ids in the
// concatenated type buffers.
TypedBufferBuilder<int32_t> builder;
RETURN_NOT_OK(builder.Reserve(out_->length));

// Initialize a vector for child array lengths. These are updated during
// iteration over the input arrays to track the concatenated child array
// lengths. These lengths are used as offsets for the concatenated offsets
// buffer.
std::vector<int32_t> offset_map(u.num_fields());

// Iterate over all input arrays.
for (size_t i = 0; i < in_.size(); i++) {
// Get sliced type ids and offsets.
auto type_ids = in_[i]->GetValues<int8_t>(1);
auto offset_values = in_[i]->GetValues<int32_t>(2);

// Iterate over all elements in the type buffer and append the updated
// offset to the concatenated offsets buffer.
for (auto j = 0; j < in_[i]->length; j++) {
int32_t offset;
if (internal::AddWithOverflow(offset_map[u.child_ids()[type_ids[j]]],
offset_values[j], &offset)) {
return Status::Invalid("Offset value overflow when concatenating arrays");
}
RETURN_NOT_OK(builder.Append(offset));
}

// Increment the offsets in the offset map for the next iteration.
for (int j = 0; j < u.num_fields(); j++) {
int64_t length;
if (internal::AddWithOverflow(static_cast<int64_t>(offset_map[j]),
in_[i]->child_data[j]->length, &length)) {
return Status::Invalid("Offset value overflow when concatenating arrays");
}
// Make sure we can safely downcast to int32_t.
if (length > std::numeric_limits<int32_t>::max()) {
return Status::Invalid("Length overflow when concatenating arrays");
}
offset_map[j] = static_cast<int32_t>(length);
}
}

ARROW_ASSIGN_OR_RAISE(out_->buffers[2], builder.Finish());
}

return Status::OK();
}

Status Visit(const ExtensionType& e) {
Expand Down
170 changes: 157 additions & 13 deletions cpp/src/arrow/array/concatenate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -364,27 +364,171 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) {
AssertArraysEqual(*expected, *concat_actual);
}

TEST_F(ConcatenateTest, DISABLED_UnionType) {
TEST_F(ConcatenateTest, UnionType) {
// sparse mode
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
auto foo = this->GeneratePrimitive<Int8Type>(size, null_probability);
auto bar = this->GeneratePrimitive<DoubleType>(size, null_probability);
auto baz = this->GeneratePrimitive<BooleanType>(size, null_probability);
auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, null_probability);
ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz}));
*out = rng_.ArrayOf(sparse_union({
field("a", float64()),
field("b", boolean()),
}),
size, null_probability);
});
// dense mode
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
auto foo = this->GeneratePrimitive<Int8Type>(size, null_probability);
auto bar = this->GeneratePrimitive<DoubleType>(size, null_probability);
auto baz = this->GeneratePrimitive<BooleanType>(size, null_probability);
auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, null_probability);
auto value_offsets = rng_.Numeric<Int32Type>(size, 0, size, 0);
ASSERT_OK_AND_ASSIGN(
*out, DenseUnionArray::Make(*type_ids, *value_offsets, {foo, bar, baz}));
*out = rng_.ArrayOf(dense_union({
field("a", uint32()),
field("b", boolean()),
field("c", int8()),
}),
size, null_probability);
});
}

TEST_F(ConcatenateTest, DenseUnionTypeOverflow) {
// Offset overflow
auto type_ids = ArrayFromJSON(int8(), "[0]");
auto offsets = ArrayFromJSON(int32(), "[2147483646]");
auto child_array = ArrayFromJSON(uint8(), "[0, 1]");
ASSERT_OK_AND_ASSIGN(auto array,
DenseUnionArray::Make(*type_ids, *offsets, {child_array}));
ArrayVector arrays({array, array});
ASSERT_RAISES(Invalid, Concatenate(arrays, default_memory_pool()));

// Length overflow
auto type_ids_ok = ArrayFromJSON(int8(), "[0]");
auto offsets_ok = ArrayFromJSON(int32(), "[0]");
auto child_array_overflow =
this->rng_.ArrayOf(null(), std::numeric_limits<int32_t>::max() - 1, 0.0);
ASSERT_OK_AND_ASSIGN(
auto array_overflow,
DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow}));
ArrayVector arrays_overflow({array_overflow, array_overflow});
ASSERT_RAISES(Invalid, Concatenate(arrays_overflow, default_memory_pool()));
}

TEST_F(ConcatenateTest, DenseUnionType) {
auto array = ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([
[0, true],
[0, true],
[1, 1],
[1, 2],
[0, false],
[1, 3]
])");
ASSERT_OK(array->ValidateFull());

// Test concatenation of an unsliced array.
ASSERT_OK_AND_ASSIGN(auto concat_arrays,
Concatenate({array, array}, default_memory_pool()));
ASSERT_OK(concat_arrays->ValidateFull());
AssertArraysEqual(
*ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([
[0, true],
[0, true],
[1, 1],
[1, 2],
[0, false],
[1, 3],
[0, true],
[0, true],
[1, 1],
[1, 2],
[0, false],
[1, 3]
])"),
*concat_arrays);

// Test concatenation of a sliced array with an unsliced array.
ASSERT_OK_AND_ASSIGN(auto concat_sliced_arrays,
Concatenate({array->Slice(1, 4), array}, default_memory_pool()));
ASSERT_OK(concat_sliced_arrays->ValidateFull());
AssertArraysEqual(
*ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([
[0, true],
[1, 1],
[1, 2],
[0, false],
[0, true],
[0, true],
[1, 1],
[1, 2],
[0, false],
[1, 3]
])"),
*concat_sliced_arrays);

// Test concatenation of an unsliced array, but whose children are sliced.
auto type_ids = ArrayFromJSON(int8(), "[1, 1, 0, 0, 0]");
auto offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2]");
auto child_one =
ArrayFromJSON(boolean(), "[false, true, true, true, false, false, false]");
auto child_two = ArrayFromJSON(int8(), "[0, 1, 1, 0, 0, 0, 0]");
ASSERT_OK_AND_ASSIGN(
auto array_sliced_children,
DenseUnionArray::Make(*type_ids, *offsets,
{child_one->Slice(1, 3), child_two->Slice(1, 2)}));
ASSERT_OK(array_sliced_children->ValidateFull());
ASSERT_OK_AND_ASSIGN(
auto concat_sliced_children,
Concatenate({array_sliced_children, array_sliced_children}, default_memory_pool()));
ASSERT_OK(concat_sliced_children->ValidateFull());
AssertArraysEqual(
*ArrayFromJSON(dense_union({field("0", boolean()), field("1", int8())}), R"([
[1, 1],
[1, 1],
[0, true],
[0, true],
[0, true],
[1, 1],
[1, 1],
[0, true],
[0, true],
[0, true]
])"),
*concat_sliced_children);

// Test concatenation of a sliced array, whose children also have an offset.
ASSERT_OK_AND_ASSIGN(auto concat_sliced_array_sliced_children,
Concatenate({array_sliced_children->Slice(1, 1),
array_sliced_children->Slice(2, 3)},
default_memory_pool()));
ASSERT_OK(concat_sliced_array_sliced_children->ValidateFull());
AssertArraysEqual(
*ArrayFromJSON(dense_union({field("0", boolean()), field("1", int8())}), R"([
[1, 1],
[0, true],
[0, true],
[0, true]
])"),
*concat_sliced_array_sliced_children);

// Test concatenation of dense union array with types codes other than 0..n
auto array_type_codes =
ArrayFromJSON(dense_union({field("", int32()), field("", utf8())}, {2, 5}), R"([
[2, 42],
[5, "Hello world!"],
[2, 42]
])");
ASSERT_OK(array_type_codes->ValidateFull());
ASSERT_OK_AND_ASSIGN(
auto concat_array_type_codes,
Concatenate({array_type_codes, array_type_codes, array_type_codes->Slice(1, 1)},
default_memory_pool()));
ASSERT_OK(concat_array_type_codes->ValidateFull());
AssertArraysEqual(
*ArrayFromJSON(dense_union({field("", int32()), field("", utf8())}, {2, 5}),
R"([
[2, 42],
[5, "Hello world!"],
[2, 42],
[2, 42],
[5, "Hello world!"],
[2, 42],
[5, "Hello world!"]
])"),
*concat_array_type_codes);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also test concatenation of an array 1) which is not sliced, but whose children are sliced/have an offset? 2) which is sliced, whose children additionally have an offset?


TEST_F(ConcatenateTest, OffsetOverflow) {
auto fake_long = ArrayFromJSON(utf8(), "[\"\"]");
fake_long->data()->GetMutableValues<int32_t>(1)[1] =
Expand Down
3 changes: 0 additions & 3 deletions cpp/src/arrow/testing/random_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,6 @@ TEST_P(RandomArrayTest, GenerateBatch) {

TEST_P(RandomArrayTest, GenerateZeroLengthArray) {
auto field = GetField();
if (field->type()->id() == Type::type::DENSE_UNION) {
GTEST_SKIP() << "Cannot generate zero-length dense union arrays";
}
auto array = GenerateArray(*field, 0, 0xDEADBEEF);
AssertTypeEqual(field->type(), array->type());
ASSERT_EQ(0, array->length());
Expand Down