From d20055ad979da4bc38b2037d91ccb115e732785c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 17 Jun 2019 14:36:03 -0400 Subject: [PATCH 01/16] minor refactors, adding some asserts --- cpp/src/arrow/builder.h | 1 + cpp/src/arrow/compare.cc | 42 +++++++------------------------------ cpp/src/arrow/ipc/reader.cc | 5 ++--- cpp/src/arrow/type.cc | 3 +++ 4 files changed, 14 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 56c3e2b3716..223ed5819f1 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -27,6 +27,7 @@ #include "arrow/array/builder_nested.h" // IWYU pragma: export #include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/array/builder_time.h" // IWYU pragma: export +#include "arrow/array/builder_union.h" // IWYU pragma: export #include "arrow/status.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index e1525a4f4d6..0480827c524 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -234,26 +234,15 @@ class RangeEqualsVisitor { const auto& left_type = checked_cast(*left.type()); // Define a mapping from the type id to child number - uint8_t max_code = 0; - const std::vector& type_codes = left_type.type_codes(); + uint8_t type_id_to_child_num[std::numeric_limits::max()] = {0}; for (size_t i = 0; i < type_codes.size(); ++i) { - const uint8_t code = type_codes[i]; - if (code > max_code) { - max_code = code; - } - } - - // Store mapping in a vector for constant time lookups - std::vector type_id_to_child_num(max_code + 1); - for (uint8_t i = 0; i < static_cast(type_codes.size()); ++i) { type_id_to_child_num[type_codes[i]] = i; } const uint8_t* left_ids = left.raw_type_ids(); const uint8_t* right_ids = right.raw_type_ids(); - uint8_t id, child_num; for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { if (left.IsNull(i) != right.IsNull(o_i)) { @@ -264,8 +253,7 @@ class RangeEqualsVisitor { return false; } - id = left_ids[i]; - child_num = type_id_to_child_num[id]; + auto child_num = type_id_to_child_num[left_ids[i]]; // TODO(wesm): really we should be comparing stretches of non-null data // rather than looking at one value at a time. @@ -773,30 +761,16 @@ class TypeEqualsVisitor { Status Visit(const UnionType& left) { const auto& right = checked_cast(right_); - if (left.mode() != right.mode() || - left.type_codes().size() != right.type_codes().size()) { + if (left.mode() != right.mode() || left.type_codes() != right.type_codes()) { result_ = false; return Status::OK(); } - const std::vector& left_codes = left.type_codes(); - const std::vector& right_codes = right.type_codes(); - - for (size_t i = 0; i < left_codes.size(); ++i) { - if (left_codes[i] != right_codes[i]) { - result_ = false; - return Status::OK(); - } - } - - for (int i = 0; i < left.num_children(); ++i) { - if (!left.child(i)->Equals(right_.child(i), check_metadata_)) { - result_ = false; - return Status::OK(); - } - } - - result_ = true; + result_ = std::equal( + left.children().begin(), left.children().end(), right.children().begin(), + [this](const std::shared_ptr& l, const std::shared_ptr& r) { + return l->Equals(r, check_metadata_); + }); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 002379e3779..8d3d5d1f251 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -290,12 +290,11 @@ class ArrayLoader { RETURN_NOT_OK(LoadCommon()); if (out_->length > 0) { - RETURN_NOT_OK(GetBuffer(context_->buffer_index, &out_->buffers[1])); + RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1])); if (type.mode() == UnionMode::DENSE) { - RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &out_->buffers[2])); + RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[2])); } } - context_->buffer_index += type.mode() == UnionMode::DENSE ? 2 : 1; return LoadChildren(type.children()); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 2dbd31a7dca..20289f0f88f 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -273,6 +273,9 @@ std::string DurationType::ToString() const { UnionType::UnionType(const std::vector>& fields, const std::vector& type_codes, UnionMode::type mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { + DCHECK_LE(fields.size(), type_codes.size()) << "union field with unknown type id"; + DCHECK_GE(fields.size(), type_codes.size()) + << "type id provided without corresponding union field"; children_ = fields; } From 7902d12c823a767c91b0cf2a7482517e82b6c858 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 17 Jun 2019 14:36:54 -0400 Subject: [PATCH 02/16] first pass at updating DenseUnionBuilder NB: serialize.cc is probably broken --- cpp/src/arrow/array-union-test.cc | 56 +++++++++++++++++++++++++++ cpp/src/arrow/array.cc | 1 + cpp/src/arrow/array.h | 7 ++-- cpp/src/arrow/array/builder_nested.cc | 2 +- cpp/src/arrow/array/builder_nested.h | 2 +- cpp/src/arrow/array/builder_union.cc | 34 +++++++++++----- cpp/src/arrow/array/builder_union.h | 21 +++++----- cpp/src/arrow/compare.cc | 2 +- cpp/src/arrow/python/serialize.cc | 12 +----- cpp/src/arrow/type.cc | 6 +++ cpp/src/arrow/type.h | 2 + 11 files changed, 109 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index 86cbeae6d78..30c4bf54d6b 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -188,4 +188,60 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { type_codes); } +TEST(UnionArrayBuilders, DenseUnionBuilder) { + enum { I8 = 8, STR = 13, DBL = 7 }; + + auto i8_builder = std::make_shared(); + auto str_builder = std::make_shared(); + auto dbl_builder = std::make_shared(); + DenseUnionBuilder union_builder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}, UnionMode::DENSE)); + + ASSERT_OK(union_builder.Append(I8)); + ASSERT_OK(i8_builder->Append(33)); + + ASSERT_OK(union_builder.Append(STR)); + ASSERT_OK(str_builder->Append("abc")); + + ASSERT_OK(union_builder.Append(DBL)); + ASSERT_OK(dbl_builder->Append(1.0)); + + ASSERT_OK(union_builder.Append(DBL)); + ASSERT_OK(dbl_builder->Append(-1.0)); + + ASSERT_OK(union_builder.Append(STR)); + ASSERT_OK(str_builder->Append("")); + + ASSERT_OK(union_builder.Append(I8)); + ASSERT_OK(i8_builder->Append(10)); + + ASSERT_OK(union_builder.Append(STR)); + ASSERT_OK(str_builder->Append("def")); + + ASSERT_OK(union_builder.Append(I8)); + ASSERT_OK(i8_builder->Append(-10)); + + ASSERT_OK(union_builder.Append(DBL)); + ASSERT_OK(dbl_builder->Append(0.5)); + + std::shared_ptr actual; + ASSERT_OK(union_builder.Finish(&actual)); + + auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); + + std::shared_ptr expected_types; + ArrayFromVector({I8, STR, DBL, DBL, STR, I8, STR, I8, DBL}, &expected_types); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index bc38559b91b..5f76f083968 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -606,6 +606,7 @@ void UnionArray::SetData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::UNION); ARROW_CHECK_EQ(data->buffers.size(), 3); + union_type_ = checked_cast(data_->type.get()); auto type_ids = data_->buffers[1]; auto value_offsets = data_->buffers[2]; diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 256bbdc6c67..599a6ea62af 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -1032,9 +1032,9 @@ class ARROW_EXPORT UnionArray : public Array { const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode::type mode() const { - return internal::checked_cast(*type()).mode(); - } + const UnionType* union_type() const { return union_type_; } + + UnionMode::type mode() const { return union_type_->mode(); } // Return the given field as an individual array. // For sparse unions, the returned array has its offset, length and null @@ -1047,6 +1047,7 @@ class ARROW_EXPORT UnionArray : public Array { const type_id_t* raw_type_ids_; const int32_t* raw_value_offsets_; + const UnionType* union_type_; // For caching boxed child data mutable std::vector> boxed_fields_; diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 0bf91719dfe..3194df9922b 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -300,7 +300,7 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr* out) { // Struct StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders) + std::vector> field_builders) : ArrayBuilder(type, pool) { children_ = std::move(field_builders); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index de031459181..8742f2b6e24 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -218,7 +218,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders); + std::vector> field_builders); Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index f51b7d7f020..b95bcef4280 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -19,36 +19,50 @@ #include +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" namespace arrow { +using internal::checked_cast; + DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool, + std::vector> children, const std::shared_ptr& type) - : ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) {} + : ArrayBuilder(type, pool), + union_type_(checked_cast(type.get())), + types_builder_(pool), + offsets_builder_(pool), + type_id_to_child_num_(union_type_->max_type_code() + 1, -1) { + DCHECK_EQ(union_type_->mode(), UnionMode::DENSE); + int child_i = 0; + for (auto type_id : union_type_->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } + children_ = std::move(children); +} Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr types; + std::shared_ptr types, offsets, null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(types_builder_.Finish(&types)); - std::shared_ptr offsets; RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - std::shared_ptr null_bitmap; - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - - std::vector> fields; std::vector> child_data(children_.size()); - std::vector type_ids; for (size_t i = 0; i < children_.size(); ++i) { std::shared_ptr data; RETURN_NOT_OK(children_[i]->FinishInternal(&data)); child_data[i] = data; - fields.push_back(field(field_names_[i], children_[i]->type())); - type_ids.push_back(static_cast(i)); } // If the type has not been specified in the constructor, infer it if (!type_) { + std::vector> fields; + std::vector type_ids; + for (size_t i = 0; i < children_.size(); ++i) { + fields.push_back(field(field_names_[i], children_[i]->type())); + type_ids.push_back(static_cast(i)); + } type_ = union_(fields, type_ids, UnionMode::DENSE); } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index aac2e54f9a2..e5880784848 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -45,7 +45,8 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { /// Use this constructor to incrementally build the union array along /// with types, offsets, and null bitmap. explicit DenseUnionBuilder(MemoryPool* pool, - const std::shared_ptr& type = NULLPTR); + std::vector> children, + const std::shared_ptr& type); Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -54,22 +55,20 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { } Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(types_builder_.Reserve(length)); - ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length)); - ARROW_RETURN_NOT_OK(Reserve(length)); - for (int64_t i = 0; i < length; ++i) { - types_builder_.UnsafeAppend(0); - offsets_builder_.UnsafeAppend(0); - } + ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0)); + ARROW_RETURN_NOT_OK(offsets_builder_.Append(length, 0)); return AppendToBitmap(length, false); } /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. + /// /// \param[in] type index of the child the value will be appended - /// \param[in] offset offset of the value in that child - Status Append(int8_t type, int32_t offset) { + /// The corresponding child builder must be appended to independently + /// after this method is called. + Status Append(int8_t type) { ARROW_RETURN_NOT_OK(types_builder_.Append(type)); + auto offset = static_cast(children_[type_id_to_child_num_[type]]->length()); ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); return AppendToBitmap(true); } @@ -98,9 +97,11 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { } private: + const UnionType* union_type_; TypedBufferBuilder types_builder_; TypedBufferBuilder offsets_builder_; std::vector field_names_; + std::vector type_id_to_child_num_; }; } // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 0480827c524..113466c56eb 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -235,7 +235,7 @@ class RangeEqualsVisitor { // Define a mapping from the type id to child number const std::vector& type_codes = left_type.type_codes(); - uint8_t type_id_to_child_num[std::numeric_limits::max()] = {0}; + std::vector type_id_to_child_num(left.union_type()->max_type_code() + 1, 0); for (size_t i = 0; i < type_codes.size(); ++i) { type_id_to_child_num[type_codes[i]] = i; } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 57843943775..6ac5611b1f1 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -71,20 +71,12 @@ class SequenceBuilder { types_(::arrow::int8(), pool), offsets_(::arrow::int32(), pool), type_map_(PythonType::NUM_PYTHON_TYPES, -1) { - builder_.reset(new DenseUnionBuilder(pool)); + builder_.reset(new DenseUnionBuilder(pool, {}, nullptr)); } // Appending a none to the sequence Status AppendNone() { return builder_->AppendNull(); } - template - Status Update(BuilderType* child_builder, int8_t tag) { - int32_t offset32 = -1; - RETURN_NOT_OK(internal::CastSize(child_builder->length(), &offset32)); - DCHECK_GE(offset32, 0); - return builder_->Append(tag, offset32); - } - template Status CreateAndUpdate(std::shared_ptr* child_builder, int8_t tag, MakeBuilderFn make_builder) { @@ -95,7 +87,7 @@ class SequenceBuilder { convert << static_cast(tag); type_map_[tag] = builder_->AppendChild(*child_builder, convert.str()); } - return Update(child_builder->get(), type_map_[tag]); + return builder_->Append(type_map_[tag]); } template diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 20289f0f88f..55154ba9690 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -287,6 +287,12 @@ DataTypeLayout UnionType::layout() const { } } +uint8_t UnionType::max_type_code() const { + return type_codes_.size() == 0 + ? 0 + : *std::max_element(type_codes_.begin(), type_codes_.end()); +} + std::string UnionType::ToString() const { std::stringstream s; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 16f486f45f1..10c1f90b58f 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -686,6 +686,8 @@ class ARROW_EXPORT UnionType : public NestedType { const std::vector& type_codes() const { return type_codes_; } + uint8_t max_type_code() const; + UnionMode::type mode() const { return mode_; } private: From 351905dd98deed6551b3e1052c7007566c8823aa Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 18 Jun 2019 14:58:44 -0400 Subject: [PATCH 03/16] add test for lazily typed union builder --- cpp/src/arrow/array-union-test.cc | 112 ++++++++++++++++++++------- cpp/src/arrow/array/builder_union.cc | 12 +-- cpp/src/arrow/array/builder_union.h | 13 +++- 3 files changed, 103 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index 30c4bf54d6b..8efeae22341 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -188,59 +188,115 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { type_codes); } -TEST(UnionArrayBuilders, DenseUnionBuilder) { - enum { I8 = 8, STR = 13, DBL = 7 }; +// segfault in test_serialization.py: trying to roundtrip {(): 2} +// this is a dict -> list>>, +// union +// >> +// This breaks when trying to deserialize the *values* array, which has null offsets for +// some reason + +class DenseUnionBuilderTest : public ::testing::Test { + public: + void AppendInt(int8_t i) { + ASSERT_OK(union_builder->Append(I8)); + ASSERT_OK(i8_builder->Append(i)); + } + + void AppendString(const std::string& str) { + ASSERT_OK(union_builder->Append(STR)); + ASSERT_OK(str_builder->Append(str)); + } + + void AppendDouble(double dbl) { + ASSERT_OK(union_builder->Append(DBL)); + ASSERT_OK(dbl_builder->Append(dbl)); + } - auto i8_builder = std::make_shared(); - auto str_builder = std::make_shared(); - auto dbl_builder = std::make_shared(); - DenseUnionBuilder union_builder( + uint8_t I8 = 8, STR = 13, DBL = 7; + + std::shared_ptr i8_builder = std::make_shared(); + std::shared_ptr str_builder = std::make_shared(); + std::shared_ptr dbl_builder = std::make_shared(); + std::shared_ptr union_builder; +}; + +TEST_F(DenseUnionBuilderTest, Basics) { + union_builder.reset(new DenseUnionBuilder( default_memory_pool(), {i8_builder, str_builder, dbl_builder}, union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, - {I8, STR, DBL}, UnionMode::DENSE)); + {I8, STR, DBL}, UnionMode::DENSE))); + + AppendInt(33); + AppendString("abc"); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendString(""); + AppendInt(10); + AppendString("def"); + AppendInt(-10); + AppendDouble(0.5); - ASSERT_OK(union_builder.Append(I8)); - ASSERT_OK(i8_builder->Append(33)); + std::shared_ptr actual; + ASSERT_OK(union_builder->Finish(&actual)); - ASSERT_OK(union_builder.Append(STR)); - ASSERT_OK(str_builder->Append("abc")); + auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); - ASSERT_OK(union_builder.Append(DBL)); - ASSERT_OK(dbl_builder->Append(1.0)); + std::shared_ptr expected_types; + ArrayFromVector({I8, STR, DBL, DBL, STR, I8, STR, I8, DBL}, + &expected_types); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); - ASSERT_OK(union_builder.Append(DBL)); - ASSERT_OK(dbl_builder->Append(-1.0)); + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); - ASSERT_OK(union_builder.Append(STR)); - ASSERT_OK(str_builder->Append("")); + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} - ASSERT_OK(union_builder.Append(I8)); - ASSERT_OK(i8_builder->Append(10)); +TEST_F(DenseUnionBuilderTest, InferredType) { + union_builder.reset(new DenseUnionBuilder(default_memory_pool(), {}, nullptr)); - ASSERT_OK(union_builder.Append(STR)); - ASSERT_OK(str_builder->Append("def")); + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(33); + AppendInt(10); - ASSERT_OK(union_builder.Append(I8)); - ASSERT_OK(i8_builder->Append(-10)); + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendString(""); + AppendString("def"); + AppendInt(-10); - ASSERT_OK(union_builder.Append(DBL)); - ASSERT_OK(dbl_builder->Append(0.5)); + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendDouble(0.5); std::shared_ptr actual; - ASSERT_OK(union_builder.Finish(&actual)); + ASSERT_OK(union_builder->Finish(&actual)); auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); std::shared_ptr expected_types; - ArrayFromVector({I8, STR, DBL, DBL, STR, I8, STR, I8, DBL}, &expected_types); - auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); + ArrayFromVector({I8, I8, STR, STR, STR, I8, DBL, DBL, DBL}, + &expected_types); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]"); std::shared_ptr expected; ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, {expected_i8, expected_str, expected_dbl}, {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); ASSERT_ARRAYS_EQUAL(*expected, *actual); } diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index b95bcef4280..89ba1f5f198 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -33,11 +33,13 @@ DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool, union_type_(checked_cast(type.get())), types_builder_(pool), offsets_builder_(pool), - type_id_to_child_num_(union_type_->max_type_code() + 1, -1) { - DCHECK_EQ(union_type_->mode(), UnionMode::DENSE); - int child_i = 0; - for (auto type_id : union_type_->type_codes()) { - type_id_to_child_num_[type_id] = child_i++; + type_id_to_child_num_(union_type_ ? union_type_->max_type_code() + 1 : 0, -1) { + if (union_type_) { + DCHECK_EQ(union_type_->mode(), UnionMode::DENSE); + int child_i = 0; + for (auto type_id : union_type_->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } } children_ = std::move(children); } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index e5880784848..9a6f8fd29ae 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -93,7 +93,18 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { const std::string& field_name = "") { children_.push_back(child); field_names_.push_back(field_name); - return static_cast(children_.size() - 1); + auto child_num = static_cast(children_.size() - 1); + // search for an available type_id + // FIXME(bkietz) far from optimal + auto max_type = static_cast(type_id_to_child_num_.size()); + for (int8_t type = 0; type < max_type; ++type) { + if (type_id_to_child_num_[type] == -1) { + type_id_to_child_num_[type] = child_num; + return type; + } + } + type_id_to_child_num_.push_back(child_num); + return max_type; } private: From 8d4f36dcf9a4fb4c8674407566efee126cec62b9 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 18 Jun 2019 15:33:06 -0400 Subject: [PATCH 04/16] add tests for building lists where the value builder has mutable type --- cpp/src/arrow/array-dict-test.cc | 45 +++++++++++++++++++++++++++++++ cpp/src/arrow/array-union-test.cc | 29 ++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 16d8aac4d59..34a5e211d0d 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -948,4 +948,49 @@ TEST(TestDictionary, TransposeNulls) { AssertArraysEqual(*expected, *out); } +TEST(TestDictionary, ListOfDictionary) { + std::unique_ptr root_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())), + &root_builder)); + auto list_builder = checked_cast(root_builder.get()); + auto dict_builder = + checked_cast*>(list_builder->value_builder()); + + ASSERT_OK(list_builder->Append()); + std::vector expected; + for (char a : "abc") { + for (char d : "def") { + for (char g : "ghi") { + for (char j : "jkl") { + for (char m : "mno") { + for (char p : "pqr") { + if ((static_cast(a) + d + g + j + m + p) % 16 == 0) { + ASSERT_OK(list_builder->Append()); + } + // 3**6 distinct strings; too large for int8 + char str[6] = {a, d, g, j, m, p}; + ASSERT_OK(dict_builder->Append(str)); + expected.push_back(str); + } + } + } + } + } + } + std::shared_ptr expected_dict; + ArrayFromVector(expected, &expected_dict); + + std::shared_ptr array; + ASSERT_OK(root_builder->Finish(&array)); + ASSERT_OK(ValidateArray(*array)); + + auto expected_type = list(dictionary(int16(), utf8())); + ASSERT_EQ(array->type()->ToString(), expected_type->ToString()); + + auto list_array = checked_cast(array.get()); + auto actual_dict = + checked_cast(*list_array->values()).dictionary(); + ASSERT_ARRAYS_EQUAL(*expected_dict, *actual_dict); +} + } // namespace arrow diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index 8efeae22341..0c5520730ae 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -300,4 +300,33 @@ TEST_F(DenseUnionBuilderTest, InferredType) { ASSERT_ARRAYS_EQUAL(*expected, *actual); } +TEST_F(DenseUnionBuilderTest, ListOfInferredType) { + union_builder.reset(new DenseUnionBuilder(default_memory_pool(), {}, nullptr)); + ListBuilder list_builder(default_memory_pool(), union_builder); + + ASSERT_OK(list_builder.Append()); + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(10); + + ASSERT_OK(list_builder.Append()); + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendInt(-10); + + ASSERT_OK(list_builder.Append()); + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(0.5); + + std::shared_ptr actual; + ASSERT_OK(list_builder.Finish(&actual)); + + auto expected_type = + list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}, UnionMode::DENSE)); + ASSERT_EQ(expected_type->ToString(), actual->type()->ToString()); +} + } // namespace arrow From 6245c82f11e9831dd0252d983a1f8706773efd25 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 18 Jun 2019 16:08:52 -0400 Subject: [PATCH 05/16] add SparseUnionBuilder and MakeBuilder case --- cpp/src/arrow/array-union-test.cc | 113 +++++++++++++++++++++++++++ cpp/src/arrow/array/builder_union.cc | 45 +++++++++++ cpp/src/arrow/array/builder_union.h | 89 +++++++++++++++++++++ cpp/src/arrow/builder.cc | 24 +++++- cpp/src/arrow/ipc/json-simple.cc | 75 ++++++++++++++++++ 5 files changed, 343 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index 0c5520730ae..cc5374e6b12 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -329,4 +329,117 @@ TEST_F(DenseUnionBuilderTest, ListOfInferredType) { ASSERT_EQ(expected_type->ToString(), actual->type()->ToString()); } +class SparseUnionBuilderTest : public ::testing::Test { + public: + void AppendInt(int8_t i) { + ASSERT_OK(union_builder->Append(I8)); + ASSERT_OK(i8_builder->Append(i)); + ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(dbl_builder->AppendNull()); + } + + void AppendString(const std::string& str) { + ASSERT_OK(union_builder->Append(STR)); + ASSERT_OK(i8_builder->AppendNull()); + ASSERT_OK(str_builder->Append(str)); + ASSERT_OK(dbl_builder->AppendNull()); + } + + void AppendDouble(double dbl) { + ASSERT_OK(union_builder->Append(DBL)); + ASSERT_OK(i8_builder->AppendNull()); + ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(dbl_builder->Append(dbl)); + } + + uint8_t I8 = 8, STR = 13, DBL = 7; + + std::shared_ptr i8_builder = std::make_shared(); + std::shared_ptr str_builder = std::make_shared(); + std::shared_ptr dbl_builder = std::make_shared(); + std::shared_ptr union_builder; +}; + +TEST_F(SparseUnionBuilderTest, Basics) { + union_builder.reset(new SparseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}, UnionMode::SPARSE))); + + AppendInt(33); + AppendString("abc"); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendString(""); + AppendInt(10); + AppendString("def"); + AppendInt(-10); + AppendDouble(0.5); + + std::shared_ptr actual; + ASSERT_OK(union_builder->Finish(&actual)); + + auto expected_i8 = + ArrayFromJSON(int8(), "[33, null, null, null, null, 10, null, -10, null]"); + auto expected_str = + ArrayFromJSON(utf8(), R"([null, "abc", null, null, "", null, "def", null, null])"); + auto expected_dbl = + ArrayFromJSON(float64(), "[null, null, 1.0, -1.0, null, null, null, null, 0.5]"); + + std::shared_ptr expected_types; + ArrayFromVector({I8, STR, DBL, DBL, STR, I8, STR, I8, DBL}, + &expected_types); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} + +TEST_F(SparseUnionBuilderTest, InferredType) { + union_builder.reset(new SparseUnionBuilder(default_memory_pool(), {}, nullptr)); + + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(33); + AppendInt(10); + + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendString(""); + AppendString("def"); + AppendInt(-10); + + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendDouble(0.5); + + std::shared_ptr actual; + ASSERT_OK(union_builder->Finish(&actual)); + + auto expected_i8 = + ArrayFromJSON(int8(), "[33, 10, null, null, null, -10, null, null, null]"); + auto expected_str = + ArrayFromJSON(utf8(), R"([null, null, "abc", "", "def", null, null, null, null])"); + auto expected_dbl = + ArrayFromJSON(float64(), "[null, null, null, null, null, null, 1.0, -1.0, 0.5]"); + + std::shared_ptr expected_types; + ArrayFromVector({I8, I8, STR, STR, STR, I8, DBL, DBL, DBL}, + &expected_types); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 89ba1f5f198..a909e731f14 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -73,4 +73,49 @@ Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } +SparseUnionBuilder::SparseUnionBuilder( + MemoryPool* pool, std::vector> children, + const std::shared_ptr& type) + : ArrayBuilder(type, pool), + union_type_(checked_cast(type.get())), + types_builder_(pool), + type_id_to_child_num_(union_type_ ? union_type_->max_type_code() + 1 : 0, -1) { + if (union_type_) { + DCHECK_EQ(union_type_->mode(), UnionMode::SPARSE); + int child_i = 0; + for (auto type_id : union_type_->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } + } + children_ = std::move(children); +} + +Status SparseUnionBuilder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr types, offsets, null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + RETURN_NOT_OK(types_builder_.Finish(&types)); + + std::vector> child_data(children_.size()); + for (size_t i = 0; i < children_.size(); ++i) { + std::shared_ptr data; + RETURN_NOT_OK(children_[i]->FinishInternal(&data)); + child_data[i] = data; + } + + // If the type has not been specified in the constructor, infer it + if (!type_) { + std::vector> fields; + std::vector type_ids; + for (size_t i = 0; i < children_.size(); ++i) { + fields.push_back(field(field_names_[i], children_[i]->type())); + type_ids.push_back(static_cast(i)); + } + type_ = union_(fields, type_ids, UnionMode::SPARSE); + } + + *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_); + (*out)->child_data = std::move(child_data); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 9a6f8fd29ae..b8c061b50b2 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -91,6 +91,9 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { /// the union array. int8_t AppendChild(const std::shared_ptr& child, const std::string& field_name = "") { + // force type inferrence in Finish + type_ = NULLPTR; + children_.push_back(child); field_names_.push_back(field_name); auto child_num = static_cast(children_.size() - 1); @@ -115,4 +118,90 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { std::vector type_id_to_child_num_; }; +/// \class SparseUnionBuilder +/// +/// You need to call AppendChild for each of the children builders you want +/// to use. The function will return an int8_t, which is the type tag +/// associated with that child. You can then call Append with that tag +/// (followed by an append on the child builder) to add elements to +/// the union array. +/// +/// You can either specify the type when the UnionBuilder is constructed +/// or let the UnionBuilder infer the type at runtime (by omitting the +/// type argument from the constructor). +/// +/// This API is EXPERIMENTAL. +class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder { + public: + /// Use this constructor to incrementally build the union array along + /// with types, offsets, and null bitmap. + explicit SparseUnionBuilder(MemoryPool* pool, + std::vector> children, + const std::shared_ptr& type); + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(types_builder_.Append(0)); + return AppendToBitmap(false); + } + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0)); + return AppendToBitmap(length, false); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// + /// \param[in] type index of the child the value will be appended + /// The corresponding child builder must be appended to independently + /// after this method is called. + Status Append(int8_t type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(type)); + return AppendToBitmap(true); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Make a new child builder available to the UnionArray + /// + /// \param[in] child the child builder + /// \param[in] field_name the name of the field in the union array type + /// if type inference is used + /// \return child index, which is the "type" argument that needs + /// to be passed to the "Append" method to add a new element to + /// the union array. + int8_t AppendChild(const std::shared_ptr& child, + const std::string& field_name = "") { + // force type inferrence in Finish + type_ = NULLPTR; + + children_.push_back(child); + field_names_.push_back(field_name); + auto child_num = static_cast(children_.size() - 1); + // search for an available type_id + // FIXME(bkietz) far from optimal + auto max_type = static_cast(type_id_to_child_num_.size()); + for (int8_t type = 0; type < max_type; ++type) { + if (type_id_to_child_num_[type] == -1) { + type_id_to_child_num_[type] = child_num; + return type; + } + } + type_id_to_child_num_.push_back(child_num); + return max_type; + } + + private: + const UnionType* union_type_; + TypedBufferBuilder types_builder_; + std::vector field_names_; + std::vector type_id_to_child_num_; +}; + } // namespace arrow diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f6f80425f35..cee443c4885 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -155,14 +155,32 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, case Type::STRUCT: { const std::vector>& fields = type->children(); - std::vector> values_builder; + std::vector> field_builders; for (auto it : fields) { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); - values_builder.emplace_back(std::move(builder)); + field_builders.emplace_back(std::move(builder)); + } + out->reset(new StructBuilder(type, pool, std::move(field_builders))); + return Status::OK(); + } + + case Type::UNION: { + const auto& union_type = internal::checked_cast(*type); + const std::vector>& fields = type->children(); + std::vector> field_builders; + + for (auto it : fields) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); + field_builders.emplace_back(std::move(builder)); + } + if (union_type.mode() == UnionMode::DENSE) { + out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type)); + } else { + out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type)); } - out->reset(new StructBuilder(type, pool, std::move(values_builder))); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index f850f3d2b06..b6ef41d4c2d 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -610,6 +610,81 @@ class StructConverter final : public ConcreteConverter { std::vector> child_converters_; }; +// ------------------------------------------------------------------------ +// Converter for struct arrays + +class UnionConverter final : public ConcreteConverter { + public: + explicit UnionConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + std::vector> child_builders; + for (const auto& field : type_->children()) { + std::shared_ptr child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + builder_ = std::make_shared(type_, default_memory_pool(), + std::move(child_builders)); + return Status::OK(); + } + + Status AppendNull() override { + for (auto& converter : child_converters_) { + RETURN_NOT_OK(converter->AppendNull()); + } + return builder_->AppendNull(); + } + + // Append a JSON value that is either an array of N elements in order + // or an object mapping struct names to values (omitted struct members + // are mapped to null). + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsArray()) { + auto size = json_obj.Size(); + auto expected_size = static_cast(type_->num_children()); + if (size != expected_size) { + return Status::Invalid("Expected array of size ", expected_size, + ", got array of size ", size); + } + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); + } + return builder_->Append(); + } + if (json_obj.IsObject()) { + auto remaining = json_obj.MemberCount(); + auto num_children = type_->num_children(); + for (int32_t i = 0; i < num_children; ++i) { + const auto& field = type_->child(i); + auto it = json_obj.FindMember(field->name()); + if (it != json_obj.MemberEnd()) { + --remaining; + RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); + } else { + RETURN_NOT_OK(child_converters_[i]->AppendNull()); + } + } + if (remaining > 0) { + return Status::Invalid("Unexpected members in JSON object for type ", + type_->ToString()); + } + return builder_->Append(); + } + return JSONTypeError("array or object", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; + std::vector> child_converters_; +}; + // ------------------------------------------------------------------------ // General conversion functions From 33fade16eb1feb8a09aa52c460254e88acf3843b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 18 Jun 2019 17:07:48 -0400 Subject: [PATCH 06/16] Adding support for DenseUnions to ArrayFromJSON --- cpp/src/arrow/ipc/json-simple-test.cc | 93 +++++++++++++++++++++++++++ cpp/src/arrow/ipc/json-simple.cc | 67 +++++++++---------- 2 files changed, 127 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 772557b12bd..6fe9afa47f8 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -905,6 +905,99 @@ TEST(TestStruct, Errors) { ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array)); } +TEST(TestUnion, Basics) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[true, false]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_a, expected_b}, {"a", "b"}, {4, 8}, + &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestUnion, ListOfUnion) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + auto list_type = list(union_type); + auto array = ArrayFromJSON(list_type, + "[" + "[[4, 122], [8, true]]," + "[[4, null], null, [8, false]]" + "]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[true, false]"); + + std::shared_ptr expected_values, expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_a, expected_b}, {"a", "b"}, {4, 8}, + &expected_values)); + auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]"); + ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values, + default_memory_pool(), &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestUnion, UnionOfStructs) { + std::vector> fields = { + field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), + field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), + field("foxtrot", list(int8()))})), + field("q", struct_({field("quebec", utf8())}))}; + auto type = union_(fields, {0, 23, 47}, UnionMode::DENSE); + auto array = ArrayFromJSON(type, R"([ + [0, {"alpha": 0.0, "bravo": "charlie"}], + [23, {"whiskey": 99}], + [0, {"bravo": "mike"}], + null, + [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] + ])"); + + auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]"); + ArrayVector expected_fields = {ArrayFromJSON(fields[0]->type(), R"([ + {"alpha": 0.0, "bravo": "charlie"}, + {"bravo": "mike"} + ])"), + ArrayFromJSON(fields[1]->type(), R"([ + {"whiskey": 99}, + {"tango": 8.25, "foxtrot": [0, 2, 3]} + ])"), + ArrayFromJSON(fields[2]->type(), "[]")}; + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, expected_fields, + {"ab", "wtf", "q"}, {0, 23, 47}, &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestUnion, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); +} + } // namespace json } // namespace internal } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index b6ef41d4c2d..ce7c09671b8 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -618,6 +618,13 @@ class UnionConverter final : public ConcreteConverter { explicit UnionConverter(const std::shared_ptr& type) { type_ = type; } Status Init() override { + auto union_type = checked_cast(type_.get()); + type_id_to_child_num_.clear(); + type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); + int child_i = 0; + for (auto type_id : union_type->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } std::vector> child_builders; for (const auto& field : type_->children()) { std::shared_ptr child_converter; @@ -625,8 +632,8 @@ class UnionConverter final : public ConcreteConverter { child_converters_.push_back(child_converter); child_builders.push_back(child_converter->builder()); } - builder_ = std::make_shared(type_, default_memory_pool(), - std::move(child_builders)); + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); return Status::OK(); } @@ -644,45 +651,34 @@ class UnionConverter final : public ConcreteConverter { if (json_obj.IsNull()) { return AppendNull(); } - if (json_obj.IsArray()) { - auto size = json_obj.Size(); - auto expected_size = static_cast(type_->num_children()); - if (size != expected_size) { - return Status::Invalid("Expected array of size ", expected_size, - ", got array of size ", size); - } - for (uint32_t i = 0; i < size; ++i) { - RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); - } - return builder_->Append(); + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); } - if (json_obj.IsObject()) { - auto remaining = json_obj.MemberCount(); - auto num_children = type_->num_children(); - for (int32_t i = 0; i < num_children; ++i) { - const auto& field = type_->child(i); - auto it = json_obj.FindMember(field->name()); - if (it != json_obj.MemberEnd()) { - --remaining; - RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); - } else { - RETURN_NOT_OK(child_converters_[i]->AppendNull()); - } - } - if (remaining > 0) { - return Status::Invalid("Unexpected members in JSON object for type ", - type_->ToString()); - } - return builder_->Append(); + if (json_obj.Size() != 2) { + return Status::Invalid("Expected [type_id, value] pair, got array of size ", + json_obj.Size()); } - return JSONTypeError("array or object", json_obj.GetType()); + const auto& id_obj = json_obj[0]; + if (!id_obj.IsInt()) { + return JSONTypeError("int", id_obj.GetType()); + } + + auto id = static_cast(id_obj.GetInt()); + auto child_num = type_id_to_child_num_[id]; + if (child_num == -1) { + return Status::Invalid("type_id ", id, " not found in ", *type_); + } + + RETURN_NOT_OK(builder_->Append(id)); + return child_converters_[child_num]->AppendValue(json_obj[1]); } std::shared_ptr builder() override { return builder_; } private: - std::shared_ptr builder_; + std::shared_ptr builder_; std::vector> child_converters_; + std::vector type_id_to_child_num_; }; // ------------------------------------------------------------------------ @@ -723,6 +719,11 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) + case Type::UNION: + if (checked_cast(type.get())->mode() == UnionMode::DENSE) { + res = std::make_shared(type); + break; + } default: { return Status::NotImplemented("JSON conversion to ", type->ToString(), " not implemented"); From 5b1ec9361ebe6bb88b697e6ca8cf9a9af235d80b Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 19 Jun 2019 14:32:57 -0400 Subject: [PATCH 07/16] improve doccomments, dedupe test code --- cpp/src/arrow/array-union-test.cc | 250 ++++++++++++-------------- cpp/src/arrow/array/builder_union.cc | 68 ++++--- cpp/src/arrow/array/builder_union.h | 96 ++++------ cpp/src/arrow/ipc/json-simple-test.cc | 105 ++++++++++- cpp/src/arrow/ipc/json-simple.cc | 33 ++-- cpp/src/arrow/python/serialize.cc | 2 +- 6 files changed, 316 insertions(+), 238 deletions(-) diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index cc5374e6b12..647488e739f 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -188,6 +188,7 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { type_codes); } +// FIXME(bkietz): // segfault in test_serialization.py: trying to roundtrip {(): 2} // this is a dict -> list>>, @@ -196,29 +197,127 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { // This breaks when trying to deserialize the *values* array, which has null offsets for // some reason -class DenseUnionBuilderTest : public ::testing::Test { +template +class UnionBuilderTest : public ::testing::Test { public: - void AppendInt(int8_t i) { + uint8_t I8 = 8, STR = 13, DBL = 7; + + virtual void AppendInt(int8_t i) { + expected_types_vector.push_back(I8); ASSERT_OK(union_builder->Append(I8)); ASSERT_OK(i8_builder->Append(i)); } - void AppendString(const std::string& str) { + virtual void AppendString(const std::string& str) { + expected_types_vector.push_back(STR); ASSERT_OK(union_builder->Append(STR)); ASSERT_OK(str_builder->Append(str)); } - void AppendDouble(double dbl) { + virtual void AppendDouble(double dbl) { + expected_types_vector.push_back(DBL); ASSERT_OK(union_builder->Append(DBL)); ASSERT_OK(dbl_builder->Append(dbl)); } - uint8_t I8 = 8, STR = 13, DBL = 7; + void AppendBasics() { + AppendInt(33); + AppendString("abc"); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendString(""); + AppendInt(10); + AppendString("def"); + AppendInt(-10); + AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ArrayFromVector(expected_types_vector, &expected_types); + } + + void AppendInferred() { + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(33); + AppendInt(10); + + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendString(""); + AppendString("def"); + AppendInt(-10); + + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ArrayFromVector(expected_types_vector, &expected_types); + + ASSERT_EQ(I8, 0); + ASSERT_EQ(STR, 1); + ASSERT_EQ(DBL, 2); + } + + void AppendListOfInferred(std::shared_ptr* actual) { + ListBuilder list_builder(default_memory_pool(), union_builder); + + ASSERT_OK(list_builder.Append()); + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(10); + + ASSERT_OK(list_builder.Append()); + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendInt(-10); + + ASSERT_OK(list_builder.Append()); + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(0.5); + ASSERT_OK(list_builder.Finish(actual)); + ArrayFromVector(expected_types_vector, &expected_types); + + ASSERT_EQ(I8, 0); + ASSERT_EQ(STR, 1); + ASSERT_EQ(DBL, 2); + } + + std::vector expected_types_vector; + std::shared_ptr expected_types; std::shared_ptr i8_builder = std::make_shared(); std::shared_ptr str_builder = std::make_shared(); std::shared_ptr dbl_builder = std::make_shared(); - std::shared_ptr union_builder; + std::shared_ptr union_builder{new B(default_memory_pool())}; + std::shared_ptr actual; +}; + +class DenseUnionBuilderTest : public UnionBuilderTest {}; +class SparseUnionBuilderTest : public UnionBuilderTest { + public: + using Base = UnionBuilderTest; + + void AppendInt(int8_t i) override { + Base::AppendInt(i); + ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(dbl_builder->AppendNull()); + } + + void AppendString(const std::string& str) override { + Base::AppendString(str); + ASSERT_OK(i8_builder->AppendNull()); + ASSERT_OK(dbl_builder->AppendNull()); + } + + void AppendDouble(double dbl) override { + Base::AppendDouble(dbl); + ASSERT_OK(i8_builder->AppendNull()); + ASSERT_OK(str_builder->AppendNull()); + } }; TEST_F(DenseUnionBuilderTest, Basics) { @@ -226,27 +325,12 @@ TEST_F(DenseUnionBuilderTest, Basics) { default_memory_pool(), {i8_builder, str_builder, dbl_builder}, union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, {I8, STR, DBL}, UnionMode::DENSE))); - - AppendInt(33); - AppendString("abc"); - AppendDouble(1.0); - AppendDouble(-1.0); - AppendString(""); - AppendInt(10); - AppendString("def"); - AppendInt(-10); - AppendDouble(0.5); - - std::shared_ptr actual; - ASSERT_OK(union_builder->Finish(&actual)); + AppendBasics(); auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); - std::shared_ptr expected_types; - ArrayFromVector({I8, STR, DBL, DBL, STR, I8, STR, I8, DBL}, - &expected_types); auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); std::shared_ptr expected; @@ -259,36 +343,12 @@ TEST_F(DenseUnionBuilderTest, Basics) { } TEST_F(DenseUnionBuilderTest, InferredType) { - union_builder.reset(new DenseUnionBuilder(default_memory_pool(), {}, nullptr)); - - I8 = union_builder->AppendChild(i8_builder, "i8"); - ASSERT_EQ(I8, 0); - AppendInt(33); - AppendInt(10); - - STR = union_builder->AppendChild(str_builder, "str"); - ASSERT_EQ(STR, 1); - AppendString("abc"); - AppendString(""); - AppendString("def"); - AppendInt(-10); - - DBL = union_builder->AppendChild(dbl_builder, "dbl"); - ASSERT_EQ(DBL, 2); - AppendDouble(1.0); - AppendDouble(-1.0); - AppendDouble(0.5); - - std::shared_ptr actual; - ASSERT_OK(union_builder->Finish(&actual)); + AppendInferred(); auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); - std::shared_ptr expected_types; - ArrayFromVector({I8, I8, STR, STR, STR, I8, DBL, DBL, DBL}, - &expected_types); auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]"); std::shared_ptr expected; @@ -301,27 +361,8 @@ TEST_F(DenseUnionBuilderTest, InferredType) { } TEST_F(DenseUnionBuilderTest, ListOfInferredType) { - union_builder.reset(new DenseUnionBuilder(default_memory_pool(), {}, nullptr)); - ListBuilder list_builder(default_memory_pool(), union_builder); - - ASSERT_OK(list_builder.Append()); - I8 = union_builder->AppendChild(i8_builder, "i8"); - ASSERT_EQ(I8, 0); - AppendInt(10); - - ASSERT_OK(list_builder.Append()); - STR = union_builder->AppendChild(str_builder, "str"); - ASSERT_EQ(STR, 1); - AppendString("abc"); - AppendInt(-10); - - ASSERT_OK(list_builder.Append()); - DBL = union_builder->AppendChild(dbl_builder, "dbl"); - ASSERT_EQ(DBL, 2); - AppendDouble(0.5); - std::shared_ptr actual; - ASSERT_OK(list_builder.Finish(&actual)); + AppendListOfInferred(&actual); auto expected_type = list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, @@ -329,55 +370,13 @@ TEST_F(DenseUnionBuilderTest, ListOfInferredType) { ASSERT_EQ(expected_type->ToString(), actual->type()->ToString()); } -class SparseUnionBuilderTest : public ::testing::Test { - public: - void AppendInt(int8_t i) { - ASSERT_OK(union_builder->Append(I8)); - ASSERT_OK(i8_builder->Append(i)); - ASSERT_OK(str_builder->AppendNull()); - ASSERT_OK(dbl_builder->AppendNull()); - } - - void AppendString(const std::string& str) { - ASSERT_OK(union_builder->Append(STR)); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(str_builder->Append(str)); - ASSERT_OK(dbl_builder->AppendNull()); - } - - void AppendDouble(double dbl) { - ASSERT_OK(union_builder->Append(DBL)); - ASSERT_OK(i8_builder->AppendNull()); - ASSERT_OK(str_builder->AppendNull()); - ASSERT_OK(dbl_builder->Append(dbl)); - } - - uint8_t I8 = 8, STR = 13, DBL = 7; - - std::shared_ptr i8_builder = std::make_shared(); - std::shared_ptr str_builder = std::make_shared(); - std::shared_ptr dbl_builder = std::make_shared(); - std::shared_ptr union_builder; -}; - TEST_F(SparseUnionBuilderTest, Basics) { union_builder.reset(new SparseUnionBuilder( default_memory_pool(), {i8_builder, str_builder, dbl_builder}, union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, {I8, STR, DBL}, UnionMode::SPARSE))); - AppendInt(33); - AppendString("abc"); - AppendDouble(1.0); - AppendDouble(-1.0); - AppendString(""); - AppendInt(10); - AppendString("def"); - AppendInt(-10); - AppendDouble(0.5); - - std::shared_ptr actual; - ASSERT_OK(union_builder->Finish(&actual)); + AppendBasics(); auto expected_i8 = ArrayFromJSON(int8(), "[33, null, null, null, null, 10, null, -10, null]"); @@ -386,10 +385,6 @@ TEST_F(SparseUnionBuilderTest, Basics) { auto expected_dbl = ArrayFromJSON(float64(), "[null, null, 1.0, -1.0, null, null, null, null, 0.5]"); - std::shared_ptr expected_types; - ArrayFromVector({I8, STR, DBL, DBL, STR, I8, STR, I8, DBL}, - &expected_types); - std::shared_ptr expected; ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_i8, expected_str, expected_dbl}, @@ -400,28 +395,7 @@ TEST_F(SparseUnionBuilderTest, Basics) { } TEST_F(SparseUnionBuilderTest, InferredType) { - union_builder.reset(new SparseUnionBuilder(default_memory_pool(), {}, nullptr)); - - I8 = union_builder->AppendChild(i8_builder, "i8"); - ASSERT_EQ(I8, 0); - AppendInt(33); - AppendInt(10); - - STR = union_builder->AppendChild(str_builder, "str"); - ASSERT_EQ(STR, 1); - AppendString("abc"); - AppendString(""); - AppendString("def"); - AppendInt(-10); - - DBL = union_builder->AppendChild(dbl_builder, "dbl"); - ASSERT_EQ(DBL, 2); - AppendDouble(1.0); - AppendDouble(-1.0); - AppendDouble(0.5); - - std::shared_ptr actual; - ASSERT_OK(union_builder->Finish(&actual)); + AppendInferred(); auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, null, null, null, -10, null, null, null]"); @@ -430,10 +404,6 @@ TEST_F(SparseUnionBuilderTest, InferredType) { auto expected_dbl = ArrayFromJSON(float64(), "[null, null, null, null, null, null, 1.0, -1.0, 0.5]"); - std::shared_ptr expected_types; - ArrayFromVector({I8, I8, STR, STR, STR, I8, DBL, DBL, DBL}, - &expected_types); - std::shared_ptr expected; ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_i8, expected_str, expected_dbl}, diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index a909e731f14..1c4018d532b 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -26,20 +26,20 @@ namespace arrow { using internal::checked_cast; +DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool) + : ArrayBuilder(nullptr, pool), types_builder_(pool), offsets_builder_(pool) {} + DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool, std::vector> children, const std::shared_ptr& type) - : ArrayBuilder(type, pool), - union_type_(checked_cast(type.get())), - types_builder_(pool), - offsets_builder_(pool), - type_id_to_child_num_(union_type_ ? union_type_->max_type_code() + 1 : 0, -1) { - if (union_type_) { - DCHECK_EQ(union_type_->mode(), UnionMode::DENSE); - int child_i = 0; - for (auto type_id : union_type_->type_codes()) { - type_id_to_child_num_[type_id] = child_i++; - } + : ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) { + auto union_type = checked_cast(type.get()); + DCHECK_NE(union_type, nullptr); + type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); + DCHECK_EQ(union_type->mode(), UnionMode::DENSE); + int child_i = 0; + for (auto type_id : union_type->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; } children_ = std::move(children); } @@ -73,21 +73,25 @@ Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } +SparseUnionBuilder::SparseUnionBuilder(MemoryPool* pool) + : ArrayBuilder(nullptr, pool), types_builder_(pool) {} + SparseUnionBuilder::SparseUnionBuilder( MemoryPool* pool, std::vector> children, const std::shared_ptr& type) - : ArrayBuilder(type, pool), - union_type_(checked_cast(type.get())), - types_builder_(pool), - type_id_to_child_num_(union_type_ ? union_type_->max_type_code() + 1 : 0, -1) { - if (union_type_) { - DCHECK_EQ(union_type_->mode(), UnionMode::SPARSE); - int child_i = 0; - for (auto type_id : union_type_->type_codes()) { - type_id_to_child_num_[type_id] = child_i++; - } + : ArrayBuilder(type, pool), types_builder_(pool) { + auto union_type = checked_cast(type.get()); + DCHECK_NE(union_type, nullptr); + type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); + DCHECK_EQ(union_type->mode(), UnionMode::SPARSE); + int child_i = 0; + for (auto type_id : union_type->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; } children_ = std::move(children); + for (auto&& child : children_) { + DCHECK_EQ(child->length(), 0); + } } Status SparseUnionBuilder::FinishInternal(std::shared_ptr* out) { @@ -118,4 +122,26 @@ Status SparseUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } +int8_t SparseUnionBuilder::AppendChild(const std::shared_ptr& child, + const std::string& field_name) { + // force type inferrence in Finish + type_ = NULLPTR; + DCHECK_EQ(child->length(), length_); + + children_.push_back(child); + field_names_.push_back(field_name); + auto child_num = static_cast(children_.size() - 1); + // search for an available type_id + // FIXME(bkietz) far from optimal + auto max_type = static_cast(type_id_to_child_num_.size()); + for (int8_t type = 0; type < max_type; ++type) { + if (type_id_to_child_num_[type] == -1) { + type_id_to_child_num_[type] = child_num; + return type; + } + } + type_id_to_child_num_.push_back(child_num); + return max_type; +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index b8c061b50b2..5b926fb3cef 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -29,24 +29,18 @@ namespace arrow { /// \class DenseUnionBuilder /// -/// You need to call AppendChild for each of the children builders you want -/// to use. The function will return an int8_t, which is the type tag -/// associated with that child. You can then call Append with that tag -/// (followed by an append on the child builder) to add elements to -/// the union array. -/// -/// You can either specify the type when the UnionBuilder is constructed -/// or let the UnionBuilder infer the type at runtime (by omitting the -/// type argument from the constructor). -/// /// This API is EXPERIMENTAL. class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { public: - /// Use this constructor to incrementally build the union array along - /// with types, offsets, and null bitmap. - explicit DenseUnionBuilder(MemoryPool* pool, - std::vector> children, - const std::shared_ptr& type); + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit DenseUnionBuilder(MemoryPool* pool); + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + DenseUnionBuilder(MemoryPool* pool, std::vector> children, + const std::shared_ptr& type); Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -63,12 +57,14 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. /// - /// \param[in] type index of the child the value will be appended - /// The corresponding child builder must be appended to independently - /// after this method is called. - Status Append(int8_t type) { - ARROW_RETURN_NOT_OK(types_builder_.Append(type)); - auto offset = static_cast(children_[type_id_to_child_num_[type]]->length()); + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called. + Status Append(int8_t next_type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); + auto offset = + static_cast(children_[type_id_to_child_num_[next_type]]->length()); ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); return AppendToBitmap(true); } @@ -111,7 +107,6 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { } private: - const UnionType* union_type_; TypedBufferBuilder types_builder_; TypedBufferBuilder offsets_builder_; std::vector field_names_; @@ -120,24 +115,19 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { /// \class SparseUnionBuilder /// -/// You need to call AppendChild for each of the children builders you want -/// to use. The function will return an int8_t, which is the type tag -/// associated with that child. You can then call Append with that tag -/// (followed by an append on the child builder) to add elements to -/// the union array. -/// -/// You can either specify the type when the UnionBuilder is constructed -/// or let the UnionBuilder infer the type at runtime (by omitting the -/// type argument from the constructor). -/// /// This API is EXPERIMENTAL. class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder { public: - /// Use this constructor to incrementally build the union array along - /// with types, offsets, and null bitmap. - explicit SparseUnionBuilder(MemoryPool* pool, - std::vector> children, - const std::shared_ptr& type); + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit SparseUnionBuilder(MemoryPool* pool); + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + SparseUnionBuilder(MemoryPool* pool, + std::vector> children, + const std::shared_ptr& type); Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -152,11 +142,12 @@ class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder { /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. /// - /// \param[in] type index of the child the value will be appended - /// The corresponding child builder must be appended to independently - /// after this method is called. - Status Append(int8_t type) { - ARROW_RETURN_NOT_OK(types_builder_.Append(type)); + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called, and all other child builders must have null appended + Status Append(int8_t next_type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); return AppendToBitmap(true); } @@ -177,28 +168,9 @@ class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder { /// to be passed to the "Append" method to add a new element to /// the union array. int8_t AppendChild(const std::shared_ptr& child, - const std::string& field_name = "") { - // force type inferrence in Finish - type_ = NULLPTR; - - children_.push_back(child); - field_names_.push_back(field_name); - auto child_num = static_cast(children_.size() - 1); - // search for an available type_id - // FIXME(bkietz) far from optimal - auto max_type = static_cast(type_id_to_child_num_.size()); - for (int8_t type = 0; type < max_type; ++type) { - if (type_id_to_child_num_[type] == -1) { - type_id_to_child_num_[type] = child_num; - return type; - } - } - type_id_to_child_num_.push_back(child_num); - return max_type; - } + const std::string& field_name = ""); private: - const UnionType* union_type_; TypedBufferBuilder types_builder_; std::vector field_names_; std::vector type_id_to_child_num_; diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 6fe9afa47f8..ce8b21a84a3 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -905,9 +905,10 @@ TEST(TestStruct, Errors) { ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array)); } -TEST(TestUnion, Basics) { +TEST(TestDenseUnion, Basics) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); + auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]"); @@ -924,7 +925,25 @@ TEST(TestUnion, Basics) { ASSERT_ARRAYS_EQUAL(*expected, *array); } -TEST(TestUnion, ListOfUnion) { +TEST(TestSparseUnion, Basics) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + + auto type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"}, + {4, 8}, &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnion, ListOfUnion) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); @@ -951,7 +970,32 @@ TEST(TestUnion, ListOfUnion) { ASSERT_ARRAYS_EQUAL(*expected, *array); } -TEST(TestUnion, UnionOfStructs) { +TEST(TestSparseUnion, ListOfUnion) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + auto list_type = list(union_type); + auto array = ArrayFromJSON(list_type, + "[" + "[[4, 122], [8, true]]," + "[[4, null], null, [8, false]]" + "]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]"); + + std::shared_ptr expected_values, expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"}, + {4, 8}, &expected_values)); + auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]"); + ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values, + default_memory_pool(), &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnion, UnionOfStructs) { std::vector> fields = { field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), @@ -985,7 +1029,47 @@ TEST(TestUnion, UnionOfStructs) { ASSERT_ARRAYS_EQUAL(*expected, *array); } -TEST(TestUnion, Errors) { +TEST(TestSparseUnion, UnionOfStructs) { + std::vector> fields = { + field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), + field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), + field("foxtrot", list(int8()))})), + field("q", struct_({field("quebec", utf8())}))}; + auto type = union_(fields, {0, 23, 47}, UnionMode::SPARSE); + auto array = ArrayFromJSON(type, R"([ + [0, {"alpha": 0.0, "bravo": "charlie"}], + [23, {"whiskey": 99}], + [0, {"bravo": "mike"}], + null, + [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] + ])"); + + auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]"); + ArrayVector expected_fields = { + ArrayFromJSON(fields[0]->type(), R"([ + {"alpha": 0.0, "bravo": "charlie"}, + null, + {"bravo": "mike"}, + null, + null + ])"), + ArrayFromJSON(fields[1]->type(), R"([ + null, + {"whiskey": 99}, + null, + null, + {"tango": 8.25, "foxtrot": [0, 2, 3]} + ])"), + ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")}; + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, expected_fields, {"ab", "wtf", "q"}, + {0, 23, 47}, &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnion, Errors) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); @@ -998,6 +1082,19 @@ TEST(TestUnion, Errors) { ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); } +TEST(TestSparseUnion, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); +} + } // namespace json } // namespace internal } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index ce7c09671b8..ae01bcc4b31 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -619,6 +619,7 @@ class UnionConverter final : public ConcreteConverter { Status Init() override { auto union_type = checked_cast(type_.get()); + mode_ = union_type->mode(); type_id_to_child_num_.clear(); type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); int child_i = 0; @@ -632,8 +633,13 @@ class UnionConverter final : public ConcreteConverter { child_converters_.push_back(child_converter); child_builders.push_back(child_converter->builder()); } - builder_ = std::make_shared(default_memory_pool(), - std::move(child_builders), type_); + if (mode_ == UnionMode::DENSE) { + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); + } else { + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); + } return Status::OK(); } @@ -669,14 +675,25 @@ class UnionConverter final : public ConcreteConverter { return Status::Invalid("type_id ", id, " not found in ", *type_); } - RETURN_NOT_OK(builder_->Append(id)); - return child_converters_[child_num]->AppendValue(json_obj[1]); + auto child_converter = child_converters_[child_num]; + if (mode_ == UnionMode::DENSE) { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + } else { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + for (auto&& other_converter : child_converters_) { + if (other_converter != child_converter) { + RETURN_NOT_OK(other_converter->AppendNull()); + } + } + } + return child_converter->AppendValue(json_obj[1]); } std::shared_ptr builder() override { return builder_; } private: - std::shared_ptr builder_; + UnionMode::type mode_; + std::shared_ptr builder_; std::vector> child_converters_; std::vector type_id_to_child_num_; }; @@ -719,11 +736,7 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) - case Type::UNION: - if (checked_cast(type.get())->mode() == UnionMode::DENSE) { - res = std::make_shared(type); - break; - } + SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) default: { return Status::NotImplemented("JSON conversion to ", type->ToString(), " not implemented"); diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 6ac5611b1f1..bc64fb7b55a 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -71,7 +71,7 @@ class SequenceBuilder { types_(::arrow::int8(), pool), offsets_(::arrow::int32(), pool), type_map_(PythonType::NUM_PYTHON_TYPES, -1) { - builder_.reset(new DenseUnionBuilder(pool, {}, nullptr)); + builder_.reset(new DenseUnionBuilder(pool)); } // Appending a none to the sequence From 5742db998aade935d1160e49e4dea147bf84ca04 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 20 Jun 2019 12:29:04 -0400 Subject: [PATCH 08/16] debugging: highlight the broken case and a similar one --- python/pyarrow/tests/test_serialization.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 22983e77e99..f9ce16bf9ac 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -302,6 +302,12 @@ def custom_deserializer(serialized_obj): assert deserialized == (0, 'a') +def test_primitive_serialization_notbroken(large_buffer): + serialization_roundtrip({(1,2): 2}, large_buffer) + +def test_primitive_serialization_broken(large_buffer): + serialization_roundtrip({(): 2}, large_buffer) + def test_primitive_serialization(large_buffer): for obj in PRIMITIVE_OBJECTS: serialization_roundtrip(obj, large_buffer) From cf1c5bec5eeeecb65a4e628fa693bc151517edae Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 2 Jul 2019 19:48:29 -0400 Subject: [PATCH 09/16] revert changes to reader.cc --- cpp/src/arrow/ipc/reader.cc | 5 +++-- python/pyarrow/tests/test_serialization.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8d3d5d1f251..002379e3779 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -290,11 +290,12 @@ class ArrayLoader { RETURN_NOT_OK(LoadCommon()); if (out_->length > 0) { - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1])); + RETURN_NOT_OK(GetBuffer(context_->buffer_index, &out_->buffers[1])); if (type.mode() == UnionMode::DENSE) { - RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[2])); + RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &out_->buffers[2])); } } + context_->buffer_index += type.mode() == UnionMode::DENSE ? 2 : 1; return LoadChildren(type.children()); } diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index f9ce16bf9ac..6c626992dce 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -303,11 +303,13 @@ def custom_deserializer(serialized_obj): def test_primitive_serialization_notbroken(large_buffer): - serialization_roundtrip({(1,2): 2}, large_buffer) + serialization_roundtrip({(1, 2): 2}, large_buffer) + def test_primitive_serialization_broken(large_buffer): serialization_roundtrip({(): 2}, large_buffer) + def test_primitive_serialization(large_buffer): for obj in PRIMITIVE_OBJECTS: serialization_roundtrip(obj, large_buffer) From 673916e5051941bdd3d17dc026166cde35ef59f4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 2 Jul 2019 20:22:12 -0400 Subject: [PATCH 10/16] Disable ListOfDictionary test until ListBuilder is updated --- cpp/src/arrow/array-dict-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 34a5e211d0d..ed37df30264 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -948,7 +948,7 @@ TEST(TestDictionary, TransposeNulls) { AssertArraysEqual(*expected, *out); } -TEST(TestDictionary, ListOfDictionary) { +TEST(TestDictionary, DISABLED_ListOfDictionary) { std::unique_ptr root_builder; ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())), &root_builder)); From 37de5f2d559225553e71d4387bb258f8d13f93af Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 3 Jul 2019 11:37:38 -0400 Subject: [PATCH 11/16] explicit uint8_t for msvc --- cpp/src/arrow/compare.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 113466c56eb..097bc8f7698 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -236,7 +236,7 @@ class RangeEqualsVisitor { // Define a mapping from the type id to child number const std::vector& type_codes = left_type.type_codes(); std::vector type_id_to_child_num(left.union_type()->max_type_code() + 1, 0); - for (size_t i = 0; i < type_codes.size(); ++i) { + for (uint8_t i = 0; i < type_codes.size(); ++i) { type_id_to_child_num[type_codes[i]] = i; } From fd64c1ba7db8253a133d5e6de198cfac4fa8fa66 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 10 Jul 2019 10:12:57 -0400 Subject: [PATCH 12/16] rewrite union builder to share a base class, let children_ be indexed by type_id --- cpp/src/arrow/array-union-test.cc | 9 -- cpp/src/arrow/array/builder_nested.cc | 2 +- cpp/src/arrow/array/builder_union.cc | 138 +++++++++----------------- cpp/src/arrow/array/builder_union.h | 135 ++++++++++++------------- 4 files changed, 113 insertions(+), 171 deletions(-) diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index 647488e739f..62a4e15eb0c 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -188,15 +188,6 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { type_codes); } -// FIXME(bkietz): -// segfault in test_serialization.py: trying to roundtrip {(): 2} -// this is a dict -> list>>, -// union -// >> -// This breaks when trying to deserialize the *values* array, which has null offsets for -// some reason - template class UnionBuilderTest : public ::testing::Test { public: diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 3194df9922b..30b3fc05e59 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -60,7 +60,7 @@ Status ListBuilder::CheckNextOffset() const { const int64_t num_values = value_builder_->length(); ARROW_RETURN_IF( num_values > kListMaximumElements, - Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,", + Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,", " have ", num_values)); return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 1c4018d532b..ff31ca356f4 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -26,122 +26,82 @@ namespace arrow { using internal::checked_cast; -DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool) - : ArrayBuilder(nullptr, pool), types_builder_(pool), offsets_builder_(pool) {} - -DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool, - std::vector> children, - const std::shared_ptr& type) - : ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) { - auto union_type = checked_cast(type.get()); - DCHECK_NE(union_type, nullptr); - type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); - DCHECK_EQ(union_type->mode(), UnionMode::DENSE); - int child_i = 0; - for (auto type_id : union_type->type_codes()) { - type_id_to_child_num_[type_id] = child_i++; - } - children_ = std::move(children); -} - -Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr types, offsets, null_bitmap; +Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr types, null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(types_builder_.Finish(&types)); - RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - std::vector> child_data(children_.size()); - for (size_t i = 0; i < children_.size(); ++i) { - std::shared_ptr data; - RETURN_NOT_OK(children_[i]->FinishInternal(&data)); - child_data[i] = data; + // If the type has not been specified in the constructor, gather type_ids + std::vector type_ids; + if (type_ == nullptr) { + for (size_t i = 0; i < children_.size(); ++i) { + type_ids.push_back(static_cast(i)); + } + } else { + type_ids = checked_cast(*type_).type_codes(); + } + + std::vector> child_data(type_ids.size()); + for (size_t i = 0; i < type_ids.size(); ++i) { + RETURN_NOT_OK(children_[type_ids[i]]->FinishInternal(&child_data[i])); } // If the type has not been specified in the constructor, infer it - if (!type_) { + if (type_ == nullptr) { std::vector> fields; - std::vector type_ids; - for (size_t i = 0; i < children_.size(); ++i) { - fields.push_back(field(field_names_[i], children_[i]->type())); - type_ids.push_back(static_cast(i)); + auto field_names_it = field_names_.begin(); + for (auto&& data : child_data) { + fields.push_back(field(*field_names_it++, data->type)); } - type_ = union_(fields, type_ids, UnionMode::DENSE); + type_ = union_(fields, type_ids, mode_); } - *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_); + *out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_); (*out)->child_data = std::move(child_data); return Status::OK(); } -SparseUnionBuilder::SparseUnionBuilder(MemoryPool* pool) - : ArrayBuilder(nullptr, pool), types_builder_(pool) {} - -SparseUnionBuilder::SparseUnionBuilder( - MemoryPool* pool, std::vector> children, +BasicUnionBuilder::BasicUnionBuilder( + MemoryPool* pool, UnionMode::type mode, + const std::vector>& children, const std::shared_ptr& type) - : ArrayBuilder(type, pool), types_builder_(pool) { + : ArrayBuilder(type, pool), mode_(mode), types_builder_(pool) { auto union_type = checked_cast(type.get()); DCHECK_NE(union_type, nullptr); - type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); - DCHECK_EQ(union_type->mode(), UnionMode::SPARSE); - int child_i = 0; - for (auto type_id : union_type->type_codes()) { - type_id_to_child_num_[type_id] = child_i++; - } - children_ = std::move(children); - for (auto&& child : children_) { - DCHECK_EQ(child->length(), 0); - } -} + DCHECK_EQ(union_type->mode(), mode); -Status SparseUnionBuilder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr types, offsets, null_bitmap; - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - RETURN_NOT_OK(types_builder_.Finish(&types)); + // NB: children_ is indexed by the child array's type_id, *not* by the index + // of the child_data in the Finished array data + children_.resize(union_type->max_type_code() + 1, nullptr); - std::vector> child_data(children_.size()); - for (size_t i = 0; i < children_.size(); ++i) { - std::shared_ptr data; - RETURN_NOT_OK(children_[i]->FinishInternal(&data)); - child_data[i] = data; - } - - // If the type has not been specified in the constructor, infer it - if (!type_) { - std::vector> fields; - std::vector type_ids; - for (size_t i = 0; i < children_.size(); ++i) { - fields.push_back(field(field_names_[i], children_[i]->type())); - type_ids.push_back(static_cast(i)); - } - type_ = union_(fields, type_ids, UnionMode::SPARSE); + auto field_it = type->children().begin(); + auto children_it = children.begin(); + for (auto type_id : union_type->type_codes()) { + children_[type_id] = *children_it++; + field_names_.push_back((*field_it++)->name()); } - - *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_); - (*out)->child_data = std::move(child_data); - return Status::OK(); + DCHECK_EQ(children_it, children.end()); + DCHECK_EQ(field_it, type->children().end()); } -int8_t SparseUnionBuilder::AppendChild(const std::shared_ptr& child, - const std::string& field_name) { +int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_child, + const std::string& field_name) { // force type inferrence in Finish - type_ = NULLPTR; - DCHECK_EQ(child->length(), length_); + type_ = nullptr; - children_.push_back(child); field_names_.push_back(field_name); - auto child_num = static_cast(children_.size() - 1); - // search for an available type_id - // FIXME(bkietz) far from optimal - auto max_type = static_cast(type_id_to_child_num_.size()); - for (int8_t type = 0; type < max_type; ++type) { - if (type_id_to_child_num_[type] == -1) { - type_id_to_child_num_[type] = child_num; - return type; + + for (int8_t type_id = dense_type_id_; static_cast(type_id) < children_.size(); + ++type_id) { + if (children_[type_id] == NULLPTR) { + children_[type_id] = new_child; + return type_id; } + dense_type_id_ = type_id; } - type_id_to_child_num_.push_back(child_num); - return max_type; + + children_.push_back(new_child); + return dense_type_id_++; } } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 5b926fb3cef..d06f71a4f7f 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -27,20 +27,64 @@ namespace arrow { +class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { + public: + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Make a new child builder available to the UnionArray + /// + /// \param[in] new_child the child builder + /// \param[in] field_name the name of the field in the union array type + /// if type inference is used + /// \return child index, which is the "type" argument that needs + /// to be passed to the "Append" method to add a new element to + /// the union array. + int8_t AppendChild(const std::shared_ptr& new_child, + const std::string& field_name = ""); + + protected: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) + : ArrayBuilder(nullptr, pool), mode_(mode), types_builder_(pool) {} + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode, + const std::vector>& children, + const std::shared_ptr& type); + + UnionMode::type mode_; + int8_t dense_type_id_ = 0; + TypedBufferBuilder types_builder_; + std::vector field_names_; +}; + /// \class DenseUnionBuilder /// /// This API is EXPERIMENTAL. -class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { +class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { public: /// Use this constructor to initialize the UnionBuilder with no child builders, /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. - explicit DenseUnionBuilder(MemoryPool* pool); + explicit DenseUnionBuilder(MemoryPool* pool) + : BasicUnionBuilder(pool, UnionMode::DENSE) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor - DenseUnionBuilder(MemoryPool* pool, std::vector> children, - const std::shared_ptr& type); + DenseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type) + : BasicUnionBuilder(pool, UnionMode::DENSE, children, type), + offsets_builder_(pool) {} Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -63,71 +107,42 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { /// is called. Status Append(int8_t next_type) { ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); - auto offset = - static_cast(children_[type_id_to_child_num_[next_type]]->length()); + if (children_[next_type]->length() == kListMaximumElements) { + return Status::CapacityError( + "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single " + "child"); + } + auto offset = static_cast(children_[next_type]->length()); ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); return AppendToBitmap(true); } - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \brief Make a new child builder available to the UnionArray - /// - /// \param[in] child the child builder - /// \param[in] field_name the name of the field in the union array type - /// if type inference is used - /// \return child index, which is the "type" argument that needs - /// to be passed to the "Append" method to add a new element to - /// the union array. - int8_t AppendChild(const std::shared_ptr& child, - const std::string& field_name = "") { - // force type inferrence in Finish - type_ = NULLPTR; - - children_.push_back(child); - field_names_.push_back(field_name); - auto child_num = static_cast(children_.size() - 1); - // search for an available type_id - // FIXME(bkietz) far from optimal - auto max_type = static_cast(type_id_to_child_num_.size()); - for (int8_t type = 0; type < max_type; ++type) { - if (type_id_to_child_num_[type] == -1) { - type_id_to_child_num_[type] = child_num; - return type; - } - } - type_id_to_child_num_.push_back(child_num); - return max_type; + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out)); + return offsets_builder_.Finish(&(*out)->buffers[2]); } private: - TypedBufferBuilder types_builder_; TypedBufferBuilder offsets_builder_; - std::vector field_names_; - std::vector type_id_to_child_num_; }; /// \class SparseUnionBuilder /// /// This API is EXPERIMENTAL. -class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder { +class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { public: /// Use this constructor to initialize the UnionBuilder with no child builders, /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. - explicit SparseUnionBuilder(MemoryPool* pool); + explicit SparseUnionBuilder(MemoryPool* pool) + : BasicUnionBuilder(pool, UnionMode::SPARSE) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor SparseUnionBuilder(MemoryPool* pool, - std::vector> children, - const std::shared_ptr& type); + const std::vector>& children, + const std::shared_ptr& type) + : BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {} Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -150,30 +165,6 @@ class ARROW_EXPORT SparseUnionBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); return AppendToBitmap(true); } - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \brief Make a new child builder available to the UnionArray - /// - /// \param[in] child the child builder - /// \param[in] field_name the name of the field in the union array type - /// if type inference is used - /// \return child index, which is the "type" argument that needs - /// to be passed to the "Append" method to add a new element to - /// the union array. - int8_t AppendChild(const std::shared_ptr& child, - const std::string& field_name = ""); - - private: - TypedBufferBuilder types_builder_; - std::vector field_names_; - std::vector type_id_to_child_num_; }; } // namespace arrow From 4131fe3fb4858c0a501eeca37505a1cb769ef9fa Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 10 Jul 2019 10:53:37 -0400 Subject: [PATCH 13/16] separate child builder array indexable by type_id --- cpp/src/arrow/array/builder_union.cc | 39 ++++++++++++++-------------- cpp/src/arrow/array/builder_union.h | 7 ++--- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index ff31ca356f4..4874008c7fc 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -31,19 +31,21 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(types_builder_.Finish(&types)); - // If the type has not been specified in the constructor, gather type_ids - std::vector type_ids; + // If the type has not been specified in the constructor, gather type_codes + std::vector type_codes; if (type_ == nullptr) { for (size_t i = 0; i < children_.size(); ++i) { - type_ids.push_back(static_cast(i)); + if (type_id_to_children_[i] != nullptr) { + type_codes.push_back(static_cast(i)); + } } } else { - type_ids = checked_cast(*type_).type_codes(); + type_codes = checked_cast(*type_).type_codes(); } - std::vector> child_data(type_ids.size()); - for (size_t i = 0; i < type_ids.size(); ++i) { - RETURN_NOT_OK(children_[type_ids[i]]->FinishInternal(&child_data[i])); + std::vector> child_data(type_codes.size()); + for (size_t i = 0; i < children_.size(); ++i) { + RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i])); } // If the type has not been specified in the constructor, infer it @@ -53,7 +55,7 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { for (auto&& data : child_data) { fields.push_back(field(*field_names_it++, data->type)); } - type_ = union_(fields, type_ids, mode_); + type_ = union_(fields, type_codes, mode_); } *out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_); @@ -70,14 +72,13 @@ BasicUnionBuilder::BasicUnionBuilder( DCHECK_NE(union_type, nullptr); DCHECK_EQ(union_type->mode(), mode); - // NB: children_ is indexed by the child array's type_id, *not* by the index - // of the child_data in the Finished array data - children_.resize(union_type->max_type_code() + 1, nullptr); + children_ = children; + type_id_to_children_.resize(union_type->max_type_code() + 1, nullptr); auto field_it = type->children().begin(); auto children_it = children.begin(); for (auto type_id : union_type->type_codes()) { - children_[type_id] = *children_it++; + type_id_to_children_[type_id] = *children_it++; field_names_.push_back((*field_it++)->name()); } DCHECK_EQ(children_it, children.end()); @@ -90,17 +91,17 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c type_ = nullptr; field_names_.push_back(field_name); + children_.push_back(new_child); - for (int8_t type_id = dense_type_id_; static_cast(type_id) < children_.size(); - ++type_id) { - if (children_[type_id] == NULLPTR) { - children_[type_id] = new_child; - return type_id; + for (; static_cast(dense_type_id_) < type_id_to_children_.size(); + ++dense_type_id_) { + if (type_id_to_children_[dense_type_id_] == nullptr) { + type_id_to_children_[dense_type_id_] = new_child; + return dense_type_id_++; } - dense_type_id_ = type_id; } - children_.push_back(new_child); + type_id_to_children_.push_back(new_child); return dense_type_id_++; } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index d06f71a4f7f..bded3d85819 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -53,7 +53,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) - : ArrayBuilder(nullptr, pool), mode_(mode), types_builder_(pool) {} + : ArrayBuilder(NULLPTR, pool), mode_(mode), types_builder_(pool) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor @@ -62,6 +62,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { const std::shared_ptr& type); UnionMode::type mode_; + std::vector> type_id_to_children_; int8_t dense_type_id_ = 0; TypedBufferBuilder types_builder_; std::vector field_names_; @@ -107,12 +108,12 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { /// is called. Status Append(int8_t next_type) { ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); - if (children_[next_type]->length() == kListMaximumElements) { + if (type_id_to_children_[next_type]->length() == kListMaximumElements) { return Status::CapacityError( "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single " "child"); } - auto offset = static_cast(children_[next_type]->length()); + auto offset = static_cast(type_id_to_children_[next_type]->length()); ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); return AppendToBitmap(true); } From 17e6e27aa4df239c2ef611ad234ac6be6f20f02d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 10 Jul 2019 13:20:31 -0400 Subject: [PATCH 14/16] construct offset_builder_ with a MemoryPool --- cpp/src/arrow/array/builder_union.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index bded3d85819..6ce42b8a8b7 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -77,7 +77,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. explicit DenseUnionBuilder(MemoryPool* pool) - : BasicUnionBuilder(pool, UnionMode::DENSE) {} + : BasicUnionBuilder(pool, UnionMode::DENSE), offsets_builder_(pool) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor From 0efe91d774299b37caf2a09171329c1b6c7039fc Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 11 Jul 2019 17:58:00 -0400 Subject: [PATCH 15/16] address review comments --- cpp/src/arrow/array/builder_union.cc | 10 +++++++++- cpp/src/arrow/array/builder_union.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 4874008c7fc..53d8191b45c 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -42,8 +42,9 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { } else { type_codes = checked_cast(*type_).type_codes(); } + DCHECK_EQ(type_codes.size(), children_.size()); - std::vector> child_data(type_codes.size()); + std::vector> child_data(children_.size()); for (size_t i = 0; i < children_.size(); ++i) { RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i])); } @@ -74,6 +75,7 @@ BasicUnionBuilder::BasicUnionBuilder( children_ = children; type_id_to_children_.resize(union_type->max_type_code() + 1, nullptr); + DCHECK_LT(type_id_to_children_.size(), std::numeric_limits::max()); auto field_it = type->children().begin(); auto children_it = children.begin(); @@ -93,6 +95,9 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c field_names_.push_back(field_name); children_.push_back(new_child); + // Find type_id such that type_id_to_children_[type_id] == nullptr + // and use that for the new child. Start searching at dense_type_id_ + // since type_id_to_children_ is densely packed up at least up to dense_type_id_ for (; static_cast(dense_type_id_) < type_id_to_children_.size(); ++dense_type_id_) { if (type_id_to_children_[dense_type_id_] == nullptr) { @@ -101,6 +106,9 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c } } + DCHECK_LT(type_id_to_children_.size(), std::numeric_limits::max()); + + // type_id_to_children_ is already densely packed, so just append the new child type_id_to_children_.push_back(new_child); return dense_type_id_++; } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 6ce42b8a8b7..e04d5d26576 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -63,6 +63,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { UnionMode::type mode_; std::vector> type_id_to_children_; + // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr int8_t dense_type_id_ = 0; TypedBufferBuilder types_builder_; std::vector field_names_; From 38e2828a0b4205106acee539a1dd4e67927c78aa Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 12 Jul 2019 10:17:53 -0400 Subject: [PATCH 16/16] iwyu #include --- cpp/src/arrow/array/builder_union.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 53d8191b45c..8de786f6afa 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -17,6 +17,7 @@ #include "arrow/array/builder_union.h" +#include #include #include "arrow/util/checked_cast.h"