diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 16d8aac4d59..ed37df30264 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -948,4 +948,49 @@ TEST(TestDictionary, TransposeNulls) { AssertArraysEqual(*expected, *out); } +TEST(TestDictionary, DISABLED_ListOfDictionary) { + std::unique_ptr root_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())), + &root_builder)); + auto list_builder = checked_cast(root_builder.get()); + auto dict_builder = + checked_cast*>(list_builder->value_builder()); + + ASSERT_OK(list_builder->Append()); + std::vector expected; + for (char a : "abc") { + for (char d : "def") { + for (char g : "ghi") { + for (char j : "jkl") { + for (char m : "mno") { + for (char p : "pqr") { + if ((static_cast(a) + d + g + j + m + p) % 16 == 0) { + ASSERT_OK(list_builder->Append()); + } + // 3**6 distinct strings; too large for int8 + char str[6] = {a, d, g, j, m, p}; + ASSERT_OK(dict_builder->Append(str)); + expected.push_back(str); + } + } + } + } + } + } + std::shared_ptr expected_dict; + ArrayFromVector(expected, &expected_dict); + + std::shared_ptr array; + ASSERT_OK(root_builder->Finish(&array)); + ASSERT_OK(ValidateArray(*array)); + + auto expected_type = list(dictionary(int16(), utf8())); + ASSERT_EQ(array->type()->ToString(), expected_type->ToString()); + + auto list_array = checked_cast(array.get()); + auto actual_dict = + checked_cast(*list_array->values()).dictionary(); + ASSERT_ARRAYS_EQUAL(*expected_dict, *actual_dict); +} + } // namespace arrow diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc index 86cbeae6d78..62a4e15eb0c 100644 --- a/cpp/src/arrow/array-union-test.cc +++ b/cpp/src/arrow/array-union-test.cc @@ -188,4 +188,219 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { type_codes); } +template +class UnionBuilderTest : public ::testing::Test { + public: + uint8_t I8 = 8, STR = 13, DBL = 7; + + virtual void AppendInt(int8_t i) { + expected_types_vector.push_back(I8); + ASSERT_OK(union_builder->Append(I8)); + ASSERT_OK(i8_builder->Append(i)); + } + + virtual void AppendString(const std::string& str) { + expected_types_vector.push_back(STR); + ASSERT_OK(union_builder->Append(STR)); + ASSERT_OK(str_builder->Append(str)); + } + + virtual void AppendDouble(double dbl) { + expected_types_vector.push_back(DBL); + ASSERT_OK(union_builder->Append(DBL)); + ASSERT_OK(dbl_builder->Append(dbl)); + } + + void AppendBasics() { + AppendInt(33); + AppendString("abc"); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendString(""); + AppendInt(10); + AppendString("def"); + AppendInt(-10); + AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ArrayFromVector(expected_types_vector, &expected_types); + } + + void AppendInferred() { + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(33); + AppendInt(10); + + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendString(""); + AppendString("def"); + AppendInt(-10); + + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(1.0); + AppendDouble(-1.0); + AppendDouble(0.5); + ASSERT_OK(union_builder->Finish(&actual)); + ArrayFromVector(expected_types_vector, &expected_types); + + ASSERT_EQ(I8, 0); + ASSERT_EQ(STR, 1); + ASSERT_EQ(DBL, 2); + } + + void AppendListOfInferred(std::shared_ptr* actual) { + ListBuilder list_builder(default_memory_pool(), union_builder); + + ASSERT_OK(list_builder.Append()); + I8 = union_builder->AppendChild(i8_builder, "i8"); + ASSERT_EQ(I8, 0); + AppendInt(10); + + ASSERT_OK(list_builder.Append()); + STR = union_builder->AppendChild(str_builder, "str"); + ASSERT_EQ(STR, 1); + AppendString("abc"); + AppendInt(-10); + + ASSERT_OK(list_builder.Append()); + DBL = union_builder->AppendChild(dbl_builder, "dbl"); + ASSERT_EQ(DBL, 2); + AppendDouble(0.5); + + ASSERT_OK(list_builder.Finish(actual)); + ArrayFromVector(expected_types_vector, &expected_types); + + ASSERT_EQ(I8, 0); + ASSERT_EQ(STR, 1); + ASSERT_EQ(DBL, 2); + } + + std::vector expected_types_vector; + std::shared_ptr expected_types; + std::shared_ptr i8_builder = std::make_shared(); + std::shared_ptr str_builder = std::make_shared(); + std::shared_ptr dbl_builder = std::make_shared(); + std::shared_ptr union_builder{new B(default_memory_pool())}; + std::shared_ptr actual; +}; + +class DenseUnionBuilderTest : public UnionBuilderTest {}; +class SparseUnionBuilderTest : public UnionBuilderTest { + public: + using Base = UnionBuilderTest; + + void AppendInt(int8_t i) override { + Base::AppendInt(i); + ASSERT_OK(str_builder->AppendNull()); + ASSERT_OK(dbl_builder->AppendNull()); + } + + void AppendString(const std::string& str) override { + Base::AppendString(str); + ASSERT_OK(i8_builder->AppendNull()); + ASSERT_OK(dbl_builder->AppendNull()); + } + + void AppendDouble(double dbl) override { + Base::AppendDouble(dbl); + ASSERT_OK(i8_builder->AppendNull()); + ASSERT_OK(str_builder->AppendNull()); + } +}; + +TEST_F(DenseUnionBuilderTest, Basics) { + union_builder.reset(new DenseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}, UnionMode::DENSE))); + AppendBasics(); + + auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); + + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} + +TEST_F(DenseUnionBuilderTest, InferredType) { + AppendInferred(); + + auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); + auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); + auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]"); + + auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} + +TEST_F(DenseUnionBuilderTest, ListOfInferredType) { + std::shared_ptr actual; + AppendListOfInferred(&actual); + + auto expected_type = + list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}, UnionMode::DENSE)); + ASSERT_EQ(expected_type->ToString(), actual->type()->ToString()); +} + +TEST_F(SparseUnionBuilderTest, Basics) { + union_builder.reset(new SparseUnionBuilder( + default_memory_pool(), {i8_builder, str_builder, dbl_builder}, + union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}, UnionMode::SPARSE))); + + AppendBasics(); + + auto expected_i8 = + ArrayFromJSON(int8(), "[33, null, null, null, null, 10, null, -10, null]"); + auto expected_str = + ArrayFromJSON(utf8(), R"([null, "abc", null, null, "", null, "def", null, null])"); + auto expected_dbl = + ArrayFromJSON(float64(), "[null, null, 1.0, -1.0, null, null, null, null, 0.5]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} + +TEST_F(SparseUnionBuilderTest, InferredType) { + AppendInferred(); + + auto expected_i8 = + ArrayFromJSON(int8(), "[33, 10, null, null, null, -10, null, null, null]"); + auto expected_str = + ArrayFromJSON(utf8(), R"([null, null, "abc", "", "def", null, null, null, null])"); + auto expected_dbl = + ArrayFromJSON(float64(), "[null, null, null, null, null, null, 1.0, -1.0, 0.5]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, + {expected_i8, expected_str, expected_dbl}, + {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected)); + + ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); + ASSERT_ARRAYS_EQUAL(*expected, *actual); +} } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index bc38559b91b..5f76f083968 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -606,6 +606,7 @@ void UnionArray::SetData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::UNION); ARROW_CHECK_EQ(data->buffers.size(), 3); + union_type_ = checked_cast(data_->type.get()); auto type_ids = data_->buffers[1]; auto value_offsets = data_->buffers[2]; diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 256bbdc6c67..599a6ea62af 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -1032,9 +1032,9 @@ class ARROW_EXPORT UnionArray : public Array { const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode::type mode() const { - return internal::checked_cast(*type()).mode(); - } + const UnionType* union_type() const { return union_type_; } + + UnionMode::type mode() const { return union_type_->mode(); } // Return the given field as an individual array. // For sparse unions, the returned array has its offset, length and null @@ -1047,6 +1047,7 @@ class ARROW_EXPORT UnionArray : public Array { const type_id_t* raw_type_ids_; const int32_t* raw_value_offsets_; + const UnionType* union_type_; // For caching boxed child data mutable std::vector> boxed_fields_; diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 0bf91719dfe..30b3fc05e59 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -60,7 +60,7 @@ Status ListBuilder::CheckNextOffset() const { const int64_t num_values = value_builder_->length(); ARROW_RETURN_IF( num_values > kListMaximumElements, - Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,", + Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,", " have ", num_values)); return Status::OK(); } @@ -300,7 +300,7 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr* out) { // Struct StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders) + std::vector> field_builders) : ArrayBuilder(type, pool) { children_ = std::move(field_builders); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index de031459181..8742f2b6e24 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -218,7 +218,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders); + std::vector> field_builders); Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index f51b7d7f020..8de786f6afa 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -17,44 +17,101 @@ #include "arrow/array/builder_union.h" +#include #include +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" namespace arrow { -DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool, - const std::shared_ptr& type) - : ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) {} +using internal::checked_cast; -Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr types; +Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr types, null_bitmap; + RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(types_builder_.Finish(&types)); - std::shared_ptr offsets; - RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - std::shared_ptr null_bitmap; - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + // If the type has not been specified in the constructor, gather type_codes + std::vector type_codes; + if (type_ == nullptr) { + for (size_t i = 0; i < children_.size(); ++i) { + if (type_id_to_children_[i] != nullptr) { + type_codes.push_back(static_cast(i)); + } + } + } else { + type_codes = checked_cast(*type_).type_codes(); + } + DCHECK_EQ(type_codes.size(), children_.size()); - std::vector> fields; std::vector> child_data(children_.size()); - std::vector type_ids; for (size_t i = 0; i < children_.size(); ++i) { - std::shared_ptr data; - RETURN_NOT_OK(children_[i]->FinishInternal(&data)); - child_data[i] = data; - fields.push_back(field(field_names_[i], children_[i]->type())); - type_ids.push_back(static_cast(i)); + RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i])); } // If the type has not been specified in the constructor, infer it - if (!type_) { - type_ = union_(fields, type_ids, UnionMode::DENSE); + if (type_ == nullptr) { + std::vector> fields; + auto field_names_it = field_names_.begin(); + for (auto&& data : child_data) { + fields.push_back(field(*field_names_it++, data->type)); + } + type_ = union_(fields, type_codes, mode_); } - *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_); + *out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_); (*out)->child_data = std::move(child_data); return Status::OK(); } +BasicUnionBuilder::BasicUnionBuilder( + MemoryPool* pool, UnionMode::type mode, + const std::vector>& children, + const std::shared_ptr& type) + : ArrayBuilder(type, pool), mode_(mode), types_builder_(pool) { + auto union_type = checked_cast(type.get()); + DCHECK_NE(union_type, nullptr); + DCHECK_EQ(union_type->mode(), mode); + + children_ = children; + type_id_to_children_.resize(union_type->max_type_code() + 1, nullptr); + DCHECK_LT(type_id_to_children_.size(), std::numeric_limits::max()); + + auto field_it = type->children().begin(); + auto children_it = children.begin(); + for (auto type_id : union_type->type_codes()) { + type_id_to_children_[type_id] = *children_it++; + field_names_.push_back((*field_it++)->name()); + } + DCHECK_EQ(children_it, children.end()); + DCHECK_EQ(field_it, type->children().end()); +} + +int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_child, + const std::string& field_name) { + // force type inferrence in Finish + type_ = nullptr; + + field_names_.push_back(field_name); + children_.push_back(new_child); + + // Find type_id such that type_id_to_children_[type_id] == nullptr + // and use that for the new child. Start searching at dense_type_id_ + // since type_id_to_children_ is densely packed up at least up to dense_type_id_ + for (; static_cast(dense_type_id_) < type_id_to_children_.size(); + ++dense_type_id_) { + if (type_id_to_children_[dense_type_id_] == nullptr) { + type_id_to_children_[dense_type_id_] = new_child; + return dense_type_id_++; + } + } + + DCHECK_LT(type_id_to_children_.size(), std::numeric_limits::max()); + + // type_id_to_children_ is already densely packed, so just append the new child + type_id_to_children_.push_back(new_child); + return dense_type_id_++; +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index aac2e54f9a2..e04d5d26576 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -27,25 +27,66 @@ namespace arrow { +class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { + public: + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Make a new child builder available to the UnionArray + /// + /// \param[in] new_child the child builder + /// \param[in] field_name the name of the field in the union array type + /// if type inference is used + /// \return child index, which is the "type" argument that needs + /// to be passed to the "Append" method to add a new element to + /// the union array. + int8_t AppendChild(const std::shared_ptr& new_child, + const std::string& field_name = ""); + + protected: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) + : ArrayBuilder(NULLPTR, pool), mode_(mode), types_builder_(pool) {} + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode, + const std::vector>& children, + const std::shared_ptr& type); + + UnionMode::type mode_; + std::vector> type_id_to_children_; + // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr + int8_t dense_type_id_ = 0; + TypedBufferBuilder types_builder_; + std::vector field_names_; +}; + /// \class DenseUnionBuilder /// -/// You need to call AppendChild for each of the children builders you want -/// to use. The function will return an int8_t, which is the type tag -/// associated with that child. You can then call Append with that tag -/// (followed by an append on the child builder) to add elements to -/// the union array. -/// -/// You can either specify the type when the UnionBuilder is constructed -/// or let the UnionBuilder infer the type at runtime (by omitting the -/// type argument from the constructor). -/// /// This API is EXPERIMENTAL. -class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { +class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { public: - /// Use this constructor to incrementally build the union array along - /// with types, offsets, and null bitmap. - explicit DenseUnionBuilder(MemoryPool* pool, - const std::shared_ptr& type = NULLPTR); + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit DenseUnionBuilder(MemoryPool* pool) + : BasicUnionBuilder(pool, UnionMode::DENSE), offsets_builder_(pool) {} + + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + DenseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type) + : BasicUnionBuilder(pool, UnionMode::DENSE, children, type), + offsets_builder_(pool) {} Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -54,53 +95,78 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { } Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(types_builder_.Reserve(length)); - ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length)); - ARROW_RETURN_NOT_OK(Reserve(length)); - for (int64_t i = 0; i < length; ++i) { - types_builder_.UnsafeAppend(0); - offsets_builder_.UnsafeAppend(0); - } + ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0)); + ARROW_RETURN_NOT_OK(offsets_builder_.Append(length, 0)); return AppendToBitmap(length, false); } /// \brief Append an element to the UnionArray. This must be followed /// by an append to the appropriate child builder. - /// \param[in] type index of the child the value will be appended - /// \param[in] offset offset of the value in that child - Status Append(int8_t type, int32_t offset) { - ARROW_RETURN_NOT_OK(types_builder_.Append(type)); + /// + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called. + Status Append(int8_t next_type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); + if (type_id_to_children_[next_type]->length() == kListMaximumElements) { + return Status::CapacityError( + "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single " + "child"); + } + auto offset = static_cast(type_id_to_children_[next_type]->length()); ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); return AppendToBitmap(true); } - Status FinishInternal(std::shared_ptr* out) override; + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out)); + return offsets_builder_.Finish(&(*out)->buffers[2]); + } - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond + private: + TypedBufferBuilder offsets_builder_; +}; - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +/// \class SparseUnionBuilder +/// +/// This API is EXPERIMENTAL. +class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { + public: + /// Use this constructor to initialize the UnionBuilder with no child builders, + /// allowing type to be inferred. You will need to call AppendChild for each of the + /// children builders you want to use. + explicit SparseUnionBuilder(MemoryPool* pool) + : BasicUnionBuilder(pool, UnionMode::SPARSE) {} - /// \brief Make a new child builder available to the UnionArray - /// - /// \param[in] child the child builder - /// \param[in] field_name the name of the field in the union array type - /// if type inference is used - /// \return child index, which is the "type" argument that needs - /// to be passed to the "Append" method to add a new element to - /// the union array. - int8_t AppendChild(const std::shared_ptr& child, - const std::string& field_name = "") { - children_.push_back(child); - field_names_.push_back(field_name); - return static_cast(children_.size() - 1); + /// Use this constructor to specify the type explicitly. + /// You can still add child builders to the union after using this constructor + SparseUnionBuilder(MemoryPool* pool, + const std::vector>& children, + const std::shared_ptr& type) + : BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {} + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(types_builder_.Append(0)); + return AppendToBitmap(false); } - private: - TypedBufferBuilder types_builder_; - TypedBufferBuilder offsets_builder_; - std::vector field_names_; + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0)); + return AppendToBitmap(length, false); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// + /// \param[in] next_type type_id of the child to which the next value will be appended. + /// + /// The corresponding child builder must be appended to independently after this method + /// is called, and all other child builders must have null appended + Status Append(int8_t next_type) { + ARROW_RETURN_NOT_OK(types_builder_.Append(next_type)); + return AppendToBitmap(true); + } }; } // namespace arrow diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f6f80425f35..cee443c4885 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -155,14 +155,32 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, case Type::STRUCT: { const std::vector>& fields = type->children(); - std::vector> values_builder; + std::vector> field_builders; for (auto it : fields) { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); - values_builder.emplace_back(std::move(builder)); + field_builders.emplace_back(std::move(builder)); + } + out->reset(new StructBuilder(type, pool, std::move(field_builders))); + return Status::OK(); + } + + case Type::UNION: { + const auto& union_type = internal::checked_cast(*type); + const std::vector>& fields = type->children(); + std::vector> field_builders; + + for (auto it : fields) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); + field_builders.emplace_back(std::move(builder)); + } + if (union_type.mode() == UnionMode::DENSE) { + out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type)); + } else { + out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type)); } - out->reset(new StructBuilder(type, pool, std::move(values_builder))); return Status::OK(); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 56c3e2b3716..223ed5819f1 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -27,6 +27,7 @@ #include "arrow/array/builder_nested.h" // IWYU pragma: export #include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/array/builder_time.h" // IWYU pragma: export +#include "arrow/array/builder_union.h" // IWYU pragma: export #include "arrow/status.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index e1525a4f4d6..097bc8f7698 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -234,26 +234,15 @@ class RangeEqualsVisitor { const auto& left_type = checked_cast(*left.type()); // Define a mapping from the type id to child number - uint8_t max_code = 0; - const std::vector& type_codes = left_type.type_codes(); - for (size_t i = 0; i < type_codes.size(); ++i) { - const uint8_t code = type_codes[i]; - if (code > max_code) { - max_code = code; - } - } - - // Store mapping in a vector for constant time lookups - std::vector type_id_to_child_num(max_code + 1); - for (uint8_t i = 0; i < static_cast(type_codes.size()); ++i) { + std::vector type_id_to_child_num(left.union_type()->max_type_code() + 1, 0); + for (uint8_t i = 0; i < type_codes.size(); ++i) { type_id_to_child_num[type_codes[i]] = i; } const uint8_t* left_ids = left.raw_type_ids(); const uint8_t* right_ids = right.raw_type_ids(); - uint8_t id, child_num; for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; ++i, ++o_i) { if (left.IsNull(i) != right.IsNull(o_i)) { @@ -264,8 +253,7 @@ class RangeEqualsVisitor { return false; } - id = left_ids[i]; - child_num = type_id_to_child_num[id]; + auto child_num = type_id_to_child_num[left_ids[i]]; // TODO(wesm): really we should be comparing stretches of non-null data // rather than looking at one value at a time. @@ -773,30 +761,16 @@ class TypeEqualsVisitor { Status Visit(const UnionType& left) { const auto& right = checked_cast(right_); - if (left.mode() != right.mode() || - left.type_codes().size() != right.type_codes().size()) { + if (left.mode() != right.mode() || left.type_codes() != right.type_codes()) { result_ = false; return Status::OK(); } - const std::vector& left_codes = left.type_codes(); - const std::vector& right_codes = right.type_codes(); - - for (size_t i = 0; i < left_codes.size(); ++i) { - if (left_codes[i] != right_codes[i]) { - result_ = false; - return Status::OK(); - } - } - - for (int i = 0; i < left.num_children(); ++i) { - if (!left.child(i)->Equals(right_.child(i), check_metadata_)) { - result_ = false; - return Status::OK(); - } - } - - result_ = true; + result_ = std::equal( + left.children().begin(), left.children().end(), right.children().begin(), + [this](const std::shared_ptr& l, const std::shared_ptr& r) { + return l->Equals(r, check_metadata_); + }); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 772557b12bd..ce8b21a84a3 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -905,6 +905,196 @@ TEST(TestStruct, Errors) { ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array)); } +TEST(TestDenseUnion, Basics) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + + auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[true, false]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_a, expected_b}, {"a", "b"}, {4, 8}, + &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestSparseUnion, Basics) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + + auto type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]"); + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"}, + {4, 8}, &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnion, ListOfUnion) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + auto list_type = list(union_type); + auto array = ArrayFromJSON(list_type, + "[" + "[[4, 122], [8, true]]," + "[[4, null], null, [8, false]]" + "]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[true, false]"); + + std::shared_ptr expected_values, expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, + {expected_a, expected_b}, {"a", "b"}, {4, 8}, + &expected_values)); + auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]"); + ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values, + default_memory_pool(), &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestSparseUnion, ListOfUnion) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + auto list_type = list(union_type); + auto array = ArrayFromJSON(list_type, + "[" + "[[4, 122], [8, true]]," + "[[4, null], null, [8, false]]" + "]"); + + auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); + auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]"); + auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]"); + + std::shared_ptr expected_values, expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"}, + {4, 8}, &expected_values)); + auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]"); + ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values, + default_memory_pool(), &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnion, UnionOfStructs) { + std::vector> fields = { + field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), + field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), + field("foxtrot", list(int8()))})), + field("q", struct_({field("quebec", utf8())}))}; + auto type = union_(fields, {0, 23, 47}, UnionMode::DENSE); + auto array = ArrayFromJSON(type, R"([ + [0, {"alpha": 0.0, "bravo": "charlie"}], + [23, {"whiskey": 99}], + [0, {"bravo": "mike"}], + null, + [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] + ])"); + + auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]"); + ArrayVector expected_fields = {ArrayFromJSON(fields[0]->type(), R"([ + {"alpha": 0.0, "bravo": "charlie"}, + {"bravo": "mike"} + ])"), + ArrayFromJSON(fields[1]->type(), R"([ + {"whiskey": 99}, + {"tango": 8.25, "foxtrot": [0, 2, 3]} + ])"), + ArrayFromJSON(fields[2]->type(), "[]")}; + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, expected_fields, + {"ab", "wtf", "q"}, {0, 23, 47}, &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestSparseUnion, UnionOfStructs) { + std::vector> fields = { + field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), + field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), + field("foxtrot", list(int8()))})), + field("q", struct_({field("quebec", utf8())}))}; + auto type = union_(fields, {0, 23, 47}, UnionMode::SPARSE); + auto array = ArrayFromJSON(type, R"([ + [0, {"alpha": 0.0, "bravo": "charlie"}], + [23, {"whiskey": 99}], + [0, {"bravo": "mike"}], + null, + [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] + ])"); + + auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]"); + ArrayVector expected_fields = { + ArrayFromJSON(fields[0]->type(), R"([ + {"alpha": 0.0, "bravo": "charlie"}, + null, + {"bravo": "mike"}, + null, + null + ])"), + ArrayFromJSON(fields[1]->type(), R"([ + null, + {"whiskey": 99}, + null, + null, + {"tango": 8.25, "foxtrot": [0, 2, 3]} + ])"), + ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")}; + + std::shared_ptr expected; + ASSERT_OK(UnionArray::MakeSparse(*expected_types, expected_fields, {"ab", "wtf", "q"}, + {0, 23, 47}, &expected)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnion, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); +} + +TEST(TestSparseUnion, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); +} + } // namespace json } // namespace internal } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index f850f3d2b06..ae01bcc4b31 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -610,6 +610,94 @@ class StructConverter final : public ConcreteConverter { std::vector> child_converters_; }; +// ------------------------------------------------------------------------ +// Converter for struct arrays + +class UnionConverter final : public ConcreteConverter { + public: + explicit UnionConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + auto union_type = checked_cast(type_.get()); + mode_ = union_type->mode(); + type_id_to_child_num_.clear(); + type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); + int child_i = 0; + for (auto type_id : union_type->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } + std::vector> child_builders; + for (const auto& field : type_->children()) { + std::shared_ptr child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + if (mode_ == UnionMode::DENSE) { + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); + } else { + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); + } + return Status::OK(); + } + + Status AppendNull() override { + for (auto& converter : child_converters_) { + RETURN_NOT_OK(converter->AppendNull()); + } + return builder_->AppendNull(); + } + + // Append a JSON value that is either an array of N elements in order + // or an object mapping struct names to values (omitted struct members + // are mapped to null). + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid("Expected [type_id, value] pair, got array of size ", + json_obj.Size()); + } + const auto& id_obj = json_obj[0]; + if (!id_obj.IsInt()) { + return JSONTypeError("int", id_obj.GetType()); + } + + auto id = static_cast(id_obj.GetInt()); + auto child_num = type_id_to_child_num_[id]; + if (child_num == -1) { + return Status::Invalid("type_id ", id, " not found in ", *type_); + } + + auto child_converter = child_converters_[child_num]; + if (mode_ == UnionMode::DENSE) { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + } else { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + for (auto&& other_converter : child_converters_) { + if (other_converter != child_converter) { + RETURN_NOT_OK(other_converter->AppendNull()); + } + } + } + return child_converter->AppendValue(json_obj[1]); + } + + std::shared_ptr builder() override { return builder_; } + + private: + UnionMode::type mode_; + std::shared_ptr builder_; + std::vector> child_converters_; + std::vector type_id_to_child_num_; +}; + // ------------------------------------------------------------------------ // General conversion functions @@ -648,6 +736,7 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) + SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) default: { return Status::NotImplemented("JSON conversion to ", type->ToString(), " not implemented"); diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 57843943775..bc64fb7b55a 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -77,14 +77,6 @@ class SequenceBuilder { // Appending a none to the sequence Status AppendNone() { return builder_->AppendNull(); } - template - Status Update(BuilderType* child_builder, int8_t tag) { - int32_t offset32 = -1; - RETURN_NOT_OK(internal::CastSize(child_builder->length(), &offset32)); - DCHECK_GE(offset32, 0); - return builder_->Append(tag, offset32); - } - template Status CreateAndUpdate(std::shared_ptr* child_builder, int8_t tag, MakeBuilderFn make_builder) { @@ -95,7 +87,7 @@ class SequenceBuilder { convert << static_cast(tag); type_map_[tag] = builder_->AppendChild(*child_builder, convert.str()); } - return Update(child_builder->get(), type_map_[tag]); + return builder_->Append(type_map_[tag]); } template diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 2dbd31a7dca..55154ba9690 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -273,6 +273,9 @@ std::string DurationType::ToString() const { UnionType::UnionType(const std::vector>& fields, const std::vector& type_codes, UnionMode::type mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { + DCHECK_LE(fields.size(), type_codes.size()) << "union field with unknown type id"; + DCHECK_GE(fields.size(), type_codes.size()) + << "type id provided without corresponding union field"; children_ = fields; } @@ -284,6 +287,12 @@ DataTypeLayout UnionType::layout() const { } } +uint8_t UnionType::max_type_code() const { + return type_codes_.size() == 0 + ? 0 + : *std::max_element(type_codes_.begin(), type_codes_.end()); +} + std::string UnionType::ToString() const { std::stringstream s; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 16f486f45f1..10c1f90b58f 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -686,6 +686,8 @@ class ARROW_EXPORT UnionType : public NestedType { const std::vector& type_codes() const { return type_codes_; } + uint8_t max_type_code() const; + UnionMode::type mode() const { return mode_; } private: diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index 22983e77e99..6c626992dce 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -302,6 +302,14 @@ def custom_deserializer(serialized_obj): assert deserialized == (0, 'a') +def test_primitive_serialization_notbroken(large_buffer): + serialization_roundtrip({(1, 2): 2}, large_buffer) + + +def test_primitive_serialization_broken(large_buffer): + serialization_roundtrip({(): 2}, large_buffer) + + def test_primitive_serialization(large_buffer): for obj in PRIMITIVE_OBJECTS: serialization_roundtrip(obj, large_buffer)