From fb8fea5e6bddbb5c37b9d0e0f85167faa40f9cdf Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 3 Jun 2020 16:05:56 -0400 Subject: [PATCH 1/7] ARROW-8866: [C++] Split UNION into SPARSE_UNION and DENSE_UNION --- c_glib/arrow-glib/composite-array.cpp | 18 +- c_glib/arrow-glib/composite-data-type.cpp | 10 +- cpp/src/arrow/adapters/orc/adapter_util.cc | 2 +- cpp/src/arrow/array/array_nested.cc | 99 ++++--- cpp/src/arrow/array/array_nested.h | 168 +++++++----- cpp/src/arrow/array/array_primitive.h | 4 +- cpp/src/arrow/array/array_test.cc | 4 +- cpp/src/arrow/array/array_union_test.cc | 92 +++---- cpp/src/arrow/array/array_view_test.cc | 6 +- cpp/src/arrow/array/builder_union.cc | 27 +- cpp/src/arrow/array/builder_union.h | 23 +- cpp/src/arrow/array/concatenate_test.cc | 4 +- cpp/src/arrow/array/diff.cc | 4 +- cpp/src/arrow/array/diff_test.cc | 6 +- cpp/src/arrow/array/util.cc | 8 +- cpp/src/arrow/array/validate.cc | 3 +- cpp/src/arrow/builder.cc | 46 ++-- cpp/src/arrow/c/bridge.cc | 16 +- cpp/src/arrow/c/bridge_test.cc | 32 +-- cpp/src/arrow/compare.cc | 6 +- .../arrow/compute/kernels/codegen_internal.cc | 44 +-- .../compute/kernels/vector_filter_test.cc | 4 +- .../kernels/vector_selection_internal.h | 259 ++++++++++-------- .../arrow/compute/kernels/vector_take_test.cc | 5 +- cpp/src/arrow/ipc/json_internal.cc | 30 +- cpp/src/arrow/ipc/json_simple.cc | 3 +- cpp/src/arrow/ipc/json_simple_test.cc | 36 +-- cpp/src/arrow/ipc/json_test.cc | 5 +- cpp/src/arrow/ipc/metadata_internal.cc | 9 +- cpp/src/arrow/ipc/read_write_test.cc | 12 +- cpp/src/arrow/ipc/reader.cc | 5 +- cpp/src/arrow/ipc/test_common.cc | 21 +- cpp/src/arrow/ipc/writer.cc | 168 ++++++------ cpp/src/arrow/pretty_print.cc | 5 +- cpp/src/arrow/python/arrow_to_pandas.cc | 2 +- cpp/src/arrow/python/deserialize.cc | 2 +- cpp/src/arrow/python/serialize.cc | 4 +- cpp/src/arrow/scalar.cc | 12 +- cpp/src/arrow/scalar.h | 11 +- cpp/src/arrow/testing/util.h | 5 + cpp/src/arrow/type.cc | 146 ++++++---- cpp/src/arrow/type.h | 62 +++-- cpp/src/arrow/type_fwd.h | 55 ++-- cpp/src/arrow/type_test.cc | 25 +- cpp/src/arrow/type_traits.h | 18 +- cpp/src/arrow/visitor.cc | 6 +- cpp/src/arrow/visitor.h | 6 +- cpp/src/arrow/visitor_inline.h | 3 +- cpp/src/gandiva/llvm_types_test.cc | 3 +- python/pyarrow/array.pxi | 9 +- python/pyarrow/includes/libarrow.pxd | 31 ++- python/pyarrow/lib.pyx | 3 +- python/pyarrow/public-api.pxi | 4 +- python/pyarrow/scalar.pxi | 7 +- python/pyarrow/types.pxi | 6 +- python/pyarrow/types.py | 5 +- 56 files changed, 918 insertions(+), 691 deletions(-) diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp index 322c0786020..58bebb8179c 100644 --- a/c_glib/arrow-glib/composite-array.cpp +++ b/c_glib/arrow-glib/composite-array.cpp @@ -1019,13 +1019,13 @@ garrow_sparse_union_array_new_internal(GArrowSparseUnionDataType *data_type, arrow_field_names.push_back(arrow_field->name()); } arrow_sparse_union_array_result = - arrow::UnionArray::MakeSparse(*arrow_type_ids, + arrow::SparseUnionArray::Make(*arrow_type_ids, arrow_fields, arrow_field_names, arrow_union_data_type->type_codes()); } else { arrow_sparse_union_array_result = - arrow::UnionArray::MakeSparse(*arrow_type_ids, arrow_fields); + arrow::SparseUnionArray::Make(*arrow_type_ids, arrow_fields); } if (garrow::check(error, arrow_sparse_union_array_result, @@ -1217,14 +1217,14 @@ garrow_dense_union_array_new_internal(GArrowDenseUnionDataType *data_type, arrow_field_names.push_back(arrow_field->name()); } arrow_dense_union_array_result = - arrow::UnionArray::MakeDense(*arrow_type_ids, + arrow::DenseUnionArray::Make(*arrow_type_ids, *arrow_value_offsets, arrow_fields, arrow_field_names, arrow_union_data_type->type_codes()); } else { arrow_dense_union_array_result = - arrow::UnionArray::MakeDense(*arrow_type_ids, + arrow::DenseUnionArray::Make(*arrow_type_ids, *arrow_value_offsets, arrow_fields); } @@ -1345,6 +1345,7 @@ garrow_dictionary_array_dispose(GObject *object) g_object_unref(priv->dictionary); priv->dictionary = NULL; } +<<<<<<< HEAD G_OBJECT_CLASS(garrow_dictionary_array_parent_class)->dispose(object); } @@ -1367,6 +1368,15 @@ garrow_dictionary_array_set_property(GObject *object, default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; +======= + auto arrow_union_array = arrow::DenseUnionArray::Make( + *arrow_type_ids, *arrow_value_offsets, arrow_fields, arrow_field_names, + arrow_union_data_type->type_codes()); + if (garrow::check(error, arrow_union_array, "[dense-union-array][new][data-type]")) { + return GARROW_DENSE_UNION_ARRAY(garrow_array_new_raw(&(*arrow_union_array))); + } else { + return NULL; +>>>>>>> ARROW-8866: [C++] Split UNION into SPARSE_UNION and DENSE_UNION } } diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 1fdbef9cc8d..3944ff2d216 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -573,9 +573,8 @@ garrow_sparse_union_data_type_new(GList *fields, } auto arrow_data_type = - std::make_shared(arrow_fields, - arrow_type_codes, - arrow::UnionMode::SPARSE); + std::make_shared(arrow_fields, + arrow_type_codes); auto data_type = g_object_new(GARROW_TYPE_SPARSE_UNION_DATA_TYPE, "data-type", &arrow_data_type, NULL); @@ -623,9 +622,8 @@ garrow_dense_union_data_type_new(GList *fields, } auto arrow_data_type = - std::make_shared(arrow_fields, - arrow_type_codes, - arrow::UnionMode::DENSE); + std::make_shared(arrow_fields, + arrow_type_codes); auto data_type = g_object_new(GARROW_TYPE_DENSE_UNION_DATA_TYPE, "data-type", &arrow_data_type, NULL); diff --git a/cpp/src/arrow/adapters/orc/adapter_util.cc b/cpp/src/arrow/adapters/orc/adapter_util.cc index 710a2a8f729..5a36e2c0100 100644 --- a/cpp/src/arrow/adapters/orc/adapter_util.cc +++ b/cpp/src/arrow/adapters/orc/adapter_util.cc @@ -415,7 +415,7 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { fields.push_back(field("_union_" + std::to_string(child), elemtype)); type_codes.push_back(static_cast(child)); } - *out = union_(fields, type_codes); + *out = sparse_union(fields, type_codes); break; } default: { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 7e4a71ab344..4c337fcd655 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -556,44 +556,77 @@ Result StructArray::Flatten(MemoryPool* pool) const { // ---------------------------------------------------------------------- // UnionArray -void UnionArray::SetData(const std::shared_ptr& data) { - this->Array::SetData(data); +void UnionArray::SetData(std::shared_ptr data) { + this->Array::SetData(std::move(data)); - ARROW_CHECK_EQ(data->type->id(), Type::UNION); - ARROW_CHECK_EQ(data->buffers.size(), 3); union_type_ = checked_cast(data_->type.get()); + ARROW_CHECK_GE(data_->buffers.size(), 2); auto type_codes = data_->buffers[1]; - auto value_offsets = data_->buffers[2]; raw_type_codes_ = type_codes == nullptr ? nullptr : reinterpret_cast(type_codes->data()); + boxed_fields_.resize(data_->child_data.size()); +} + +void SparseUnionArray::SetData(std::shared_ptr data) { + this->UnionArray::SetData(std::move(data)); + + ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION); + ARROW_CHECK_EQ(data_->buffers.size(), 2); +} + +void DenseUnionArray::SetData(const std::shared_ptr& data) { + this->UnionArray::SetData(std::move(data)); + + ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION); + ARROW_CHECK_EQ(data_->buffers.size(), 3); + auto value_offsets = data_->buffers[2]; raw_value_offsets_ = value_offsets == nullptr ? nullptr : reinterpret_cast(value_offsets->data()); - boxed_fields_.resize(data->child_data.size()); } -UnionArray::UnionArray(const std::shared_ptr& data) { SetData(data); } +SparseUnionArray::SparseUnionArray(std::shared_ptr data) { + SetData(std::move(data)); +} + +SparseUnionArray::SparseUnionArray(std::shared_ptr type, int64_t length, + ArrayVector children, + std::shared_ptr type_codes, + std::shared_ptr null_bitmap, + int64_t null_count, int64_t offset) { + auto internal_data = ArrayData::Make( + std::move(type), length, + BufferVector{std::move(null_bitmap), std::move(type_codes)}, null_count, offset); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + } + SetData(std::move(internal_data)); +} + +DenseUnionArray::DenseUnionArray(const std::shared_ptr& data) { + SetData(data); +} -UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - const std::shared_ptr& type_codes, - const std::shared_ptr& value_offsets, - const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) { +DenseUnionArray::DenseUnionArray(std::shared_ptr type, int64_t length, + ArrayVector children, std::shared_ptr type_ids, + std::shared_ptr value_offsets, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { auto internal_data = ArrayData::Make( - type, length, {null_bitmap, type_codes, value_offsets}, null_count, offset); + std::move(type), length, + BufferVector{std::move(null_bitmap), std::move(type_ids), std::move(value_offsets)}, + null_count, offset); for (const auto& child : children) { internal_data->child_data.push_back(child->data()); } SetData(internal_data); } -Result> UnionArray::MakeDense( - const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - const std::vector& field_names, const std::vector& type_codes) { +Result> DenseUnionArray::Make( + const Array& type_ids, const Array& value_offsets, ArrayVector children, + std::vector field_names, std::vector type_codes) { if (value_offsets.length() == 0) { return Status::Invalid("UnionArray offsets must have non-zero length"); } @@ -607,7 +640,7 @@ Result> UnionArray::MakeDense( } if (value_offsets.null_count() != 0) { - return Status::Invalid("MakeDense does not allow NAs in value_offsets"); + return Status::Invalid("Make does not allow NAs in value_offsets"); } if (field_names.size() > 0 && field_names.size() != children.size()) { @@ -622,19 +655,19 @@ Result> UnionArray::MakeDense( checked_cast(type_ids).values(), checked_cast(value_offsets).values()}; - std::shared_ptr union_type = - union_(children, field_names, type_codes, UnionMode::DENSE); - auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers), - type_ids.null_count(), type_ids.offset()); + auto union_type = dense_union(children, std::move(field_names), std::move(type_codes)); + auto internal_data = + ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); for (const auto& child : children) { internal_data->child_data.push_back(child->data()); } - return std::make_shared(internal_data); + return std::make_shared(std::move(internal_data)); } -Result> UnionArray::MakeSparse( - const Array& type_ids, const std::vector>& children, - const std::vector& field_names, const std::vector& type_codes) { +Result> SparseUnionArray::Make( + const Array& type_ids, ArrayVector children, std::vector field_names, + std::vector type_codes) { if (type_ids.type_id() != Type::INT8) { return Status::TypeError("UnionArray type_ids must be signed int8"); } @@ -648,11 +681,11 @@ Result> UnionArray::MakeSparse( } BufferVector buffers = {type_ids.null_bitmap(), - checked_cast(type_ids).values(), nullptr}; - std::shared_ptr union_type = - union_(children, field_names, type_codes, UnionMode::SPARSE); - auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers), - type_ids.null_count(), type_ids.offset()); + checked_cast(type_ids).values()}; + auto union_type = sparse_union(children, std::move(field_names), std::move(type_codes)); + auto internal_data = + ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); for (const auto& child : children) { internal_data->child_data.push_back(child->data()); if (child->length() != type_ids.length()) { @@ -660,7 +693,7 @@ Result> UnionArray::MakeSparse( "Sparse UnionArray must have len(child) == len(type_ids) for all children"); } } - return std::make_shared(internal_data); + return std::make_shared(std::move(internal_data)); } std::shared_ptr UnionArray::child(int i) const { return field(i); } diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 391dfaf40c0..81bcbbab1c1 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -31,6 +31,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -337,57 +338,105 @@ class ARROW_EXPORT StructArray : public Array { /// Concrete Array class for union data class ARROW_EXPORT UnionArray : public Array { public: - using TypeClass = UnionType; - using type_code_t = int8_t; - explicit UnionArray(const std::shared_ptr& data); + /// Note that this buffer does not account for any slice offset + std::shared_ptr type_codes() const { return data_->buffers[1]; } + + const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; } + + /// The physical child id containing value at index. + int child_id(int64_t i) const { + return union_type_->child_ids()[raw_type_codes_[i + data_->offset]]; + } + + const UnionType* union_type() const { return union_type_; } + + UnionMode::type mode() const { return union_type_->mode(); } + + // Return the given field as an individual array. + // For sparse unions, the returned array has its offset, length and null + // count adjusted. + ARROW_DEPRECATED("Deprecated in 1.0.0. Use field(pos)") + std::shared_ptr child(int pos) const; + + /// \brief Return the given field as an individual array. + /// + /// For sparse unions, the returned array has its offset, length and null + /// count adjusted. + std::shared_ptr field(int pos) const; + + protected: + void SetData(std::shared_ptr data); + + const type_code_t* raw_type_codes_; + const UnionType* union_type_; + + // For caching boxed child data + mutable std::vector> boxed_fields_; +}; + +/// Concrete Array class for union data +class ARROW_EXPORT SparseUnionArray : public UnionArray { + public: + using TypeClass = SparseUnionType; + + explicit SparseUnionArray(std::shared_ptr data); - UnionArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - const std::shared_ptr& type_ids, - const std::shared_ptr& value_offsets = NULLPTR, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); + SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, + std::shared_ptr type_ids, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); - /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// \brief Construct SparseUnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and - /// input types. The value_offsets are assumed to be well-formed. + /// input types. /// /// \param[in] type_ids An array of logical type ids for the union type - /// \param[in] value_offsets An array of signed int32 values indicating the - /// relative offset into the respective child array for the type in a given slot. - /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. - static Result> MakeDense( - const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - const std::vector& field_names = {}, - const std::vector& type_codes = {}); + static Result> Make(const Array& type_ids, ArrayVector children, + std::vector field_names = {}, + std::vector type_codes = {}); - /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// \brief Construct Sparse UnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and - /// input types. The value_offsets are assumed to be well-formed. + /// input types. /// /// \param[in] type_ids An array of logical type ids for the union type - /// \param[in] value_offsets An array of signed int32 values indicating the - /// relative offset into the respective child array for the type in a given slot. - /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] type_codes Vector of type codes. - static Result> MakeDense( - const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - const std::vector& type_codes) { - return MakeDense(type_ids, value_offsets, children, std::vector{}, - type_codes); + static Result> Make(const Array& type_ids, ArrayVector children, + std::vector type_codes) { + return Make(std::move(type_ids), std::move(children), std::vector{}, + std::move(type_codes)); } - /// \brief Construct Sparse UnionArray from type_ids and children + const SparseUnionType* union_type() const { + return internal::checked_cast(union_type_); + } + + protected: + void SetData(std::shared_ptr data); +}; + +/// Concrete Array class for union data +class ARROW_EXPORT DenseUnionArray : public UnionArray { + public: + using TypeClass = DenseUnionType; + + explicit DenseUnionArray(const std::shared_ptr& data); + + DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, + std::shared_ptr type_ids, + std::shared_ptr value_offsets = NULLPTR, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct DenseUnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and /// input types. @@ -396,12 +445,13 @@ class ARROW_EXPORT UnionArray : public Array { /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. - static Result> MakeSparse( - const Array& type_ids, const std::vector>& children, - const std::vector& field_names = {}, - const std::vector& type_codes = {}); + static Result> Make(const Array& type_ids, + const Array& value_offsets, + ArrayVector children, + std::vector field_names = {}, + std::vector type_codes = {}); - /// \brief Construct Sparse UnionArray from type_ids and children + /// \brief Construct Dense UnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and /// input types. @@ -409,57 +459,29 @@ class ARROW_EXPORT UnionArray : public Array { /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] type_codes Vector of type codes. - static Result> MakeSparse( - const Array& type_ids, const std::vector>& children, - const std::vector& type_codes) { - return MakeSparse(type_ids, children, std::vector{}, type_codes); + static Result> Make(const Array& type_ids, + const Array& value_offsets, + ArrayVector children, + std::vector type_codes) { + return Make(type_ids, value_offsets, std::move(children), std::vector{}, + std::move(type_codes)); } - /// Note that this buffer does not account for any slice offset - std::shared_ptr type_codes() const { return data_->buffers[1]; } - - const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; } - - /// The physical child id containing value at index. - int child_id(int64_t i) const { - return union_type_->child_ids()[raw_type_codes_[i + data_->offset]]; + const DenseUnionType* union_type() const { + return internal::checked_cast(union_type_); } - /// For dense arrays only. /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } - /// For dense arrays only. int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - /// For dense arrays only. const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - const UnionType* union_type() const { return union_type_; } - - UnionMode::type mode() const { return union_type_->mode(); } - - // Return the given field as an individual array. - // For sparse unions, the returned array has its offset, length and null - // count adjusted. - ARROW_DEPRECATED("Deprecated in 1.0.0. Use field(pos)") - std::shared_ptr child(int pos) const; - - /// \brief Return the given field as an individual array. - /// - /// For sparse unions, the returned array has its offset, length and null - /// count adjusted. - std::shared_ptr field(int pos) const; - protected: - void SetData(const std::shared_ptr& data); - - const type_code_t* raw_type_codes_; const int32_t* raw_value_offsets_; - const UnionType* union_type_; - // For caching boxed child data - mutable std::vector> boxed_fields_; + void SetData(const std::shared_ptr& data); }; } // namespace arrow diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index e58f5f4c8b6..2716573ee93 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -108,11 +108,11 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { // For compatibility with Take kernel. TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } - int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } - const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } protected: + static constexpr int32_t byte_width() { return sizeof(TypeClass::DayMilliseconds); } + inline void SetData(const std::shared_ptr& data) { this->PrimitiveArray::SetData(data); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 8936c421e27..4fb0a4adf4d 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -296,8 +296,8 @@ TEST_F(TestArray, TestMakeArrayOfNull) { fixed_size_list(int64(), 4), dictionary(int32(), utf8()), struct_({field("a", utf8()), field("b", int32())}), - union_({field("a", utf8()), field("b", int32())}, {0, 1}, UnionMode::SPARSE), - union_({field("a", utf8()), field("b", int32())}, {0, 1}, UnionMode::DENSE) + sparse_union({field("a", utf8()), field("b", int32())}, {0, 1}), + dense_union({field("a", utf8()), field("b", int32())}, {0, 1}), // clang-format on }; diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index ee1abe1e39b..b346efeb53c 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -68,33 +68,33 @@ TEST(TestUnionArray, TestSliceEquals) { TEST(TestSparseUnionArray, Validate) { auto a = ArrayFromJSON(int32(), "[4, 5]"); - auto type = union_({field("a", int32())}, UnionMode::SPARSE); + auto type = sparse_union({field("a", int32())}); auto children = std::vector>{a}; auto type_ids_array = ArrayFromJSON(int8(), "[0, 0, 0]"); auto type_ids = type_ids_array->data()->buffers[1]; - auto arr = std::make_shared(type, 2, children, type_ids); + auto arr = std::make_shared(type, 2, children, type_ids); ASSERT_OK(arr->ValidateFull()); - arr = std::make_shared(type, 1, children, type_ids, nullptr, nullptr, 0, - /*offset=*/1); + arr = std::make_shared(type, 1, children, type_ids, nullptr, 0, + /*offset=*/1); ASSERT_OK(arr->ValidateFull()); - arr = std::make_shared(type, 0, children, type_ids, nullptr, nullptr, 0, - /*offset=*/2); + arr = std::make_shared(type, 0, children, type_ids, nullptr, 0, + /*offset=*/2); ASSERT_OK(arr->ValidateFull()); // Length + offset < child length, but it's ok - arr = std::make_shared(type, 1, children, type_ids, nullptr, nullptr, 0, - /*offset=*/0); + arr = std::make_shared(type, 1, children, type_ids, nullptr, 0, + /*offset=*/0); ASSERT_OK(arr->ValidateFull()); // Length + offset > child length - arr = std::make_shared(type, 1, children, type_ids, nullptr, nullptr, 0, - /*offset=*/2); + arr = std::make_shared(type, 1, children, type_ids, nullptr, 0, + /*offset=*/2); ASSERT_RAISES(Invalid, arr->ValidateFull()); // Offset > child length - arr = std::make_shared(type, 0, children, type_ids, nullptr, nullptr, 0, - /*offset=*/3); + arr = std::make_shared(type, 0, children, type_ids, nullptr, 0, + /*offset=*/3); ASSERT_RAISES(Invalid, arr->ValidateFull()); } @@ -165,43 +165,43 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) { // without field names and type codes ASSERT_OK_AND_ASSIGN(result, - UnionArray::MakeDense(*type_ids_, *value_offsets, children)); + DenseUnionArray::Make(*type_ids_, *value_offsets, children)); ASSERT_OK(result->ValidateFull()); union_array = checked_cast(result.get()); CheckUnionArray(*union_array, UnionMode::DENSE, {"0", "1", "2", "3"}, {0, 1, 2, 3}); // with field name ASSERT_RAISES(Invalid, - UnionArray::MakeDense(*type_ids_, *value_offsets, children, {"one"})); + DenseUnionArray::Make(*type_ids_, *value_offsets, children, {"one"})); ASSERT_OK_AND_ASSIGN( - result, UnionArray::MakeDense(*type_ids_, *value_offsets, children, field_names)); + result, DenseUnionArray::Make(*type_ids_, *value_offsets, children, field_names)); ASSERT_OK(result->ValidateFull()); union_array = checked_cast(result.get()); CheckUnionArray(*union_array, UnionMode::DENSE, field_names, {0, 1, 2, 3}); // with type codes - ASSERT_RAISES(Invalid, UnionArray::MakeDense(*logical_type_ids_, *value_offsets, + ASSERT_RAISES(Invalid, DenseUnionArray::Make(*logical_type_ids_, *value_offsets, children, std::vector{0})); - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*logical_type_ids_, *value_offsets, + ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*logical_type_ids_, *value_offsets, children, type_codes_)); ASSERT_OK(result->ValidateFull()); union_array = checked_cast(result.get()); CheckUnionArray(*union_array, UnionMode::DENSE, {"0", "1", "2", "3"}, type_codes_); // with field names and type codes - ASSERT_RAISES(Invalid, UnionArray::MakeDense(*logical_type_ids_, *value_offsets, + ASSERT_RAISES(Invalid, DenseUnionArray::Make(*logical_type_ids_, *value_offsets, children, {"one"}, type_codes_)); - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*logical_type_ids_, *value_offsets, + ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*logical_type_ids_, *value_offsets, children, field_names, type_codes_)); ASSERT_OK(result->ValidateFull()); union_array = checked_cast(result.get()); CheckUnionArray(*union_array, UnionMode::DENSE, field_names, type_codes_); // Invalid type codes - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*invalid_type_ids1_, *value_offsets, + ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*invalid_type_ids1_, *value_offsets, children, type_codes_)); ASSERT_RAISES(Invalid, result->ValidateFull()); - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*invalid_type_ids2_, *value_offsets, + ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*invalid_type_ids2_, *value_offsets, children, type_codes_)); ASSERT_RAISES(Invalid, result->ValidateFull()); @@ -209,11 +209,11 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) { std::shared_ptr invalid_offsets; ArrayFromVector({1, 0, 0, 0, 1, 1, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, - UnionArray::MakeDense(*type_ids_, *invalid_offsets, children)); + DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); ArrayFromVector({1, 0, 0, 0, 1, -1, 1, 2, 1, 2}, &invalid_offsets); ASSERT_OK_AND_ASSIGN(result, - UnionArray::MakeDense(*type_ids_, *invalid_offsets, children)); + DenseUnionArray::Make(*type_ids_, *invalid_offsets, children)); ASSERT_RAISES(Invalid, result->ValidateFull()); } @@ -231,31 +231,31 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { std::shared_ptr result; // without field names and type codes - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeSparse(*type_ids_, children)); + ASSERT_OK_AND_ASSIGN(result, SparseUnionArray::Make(*type_ids_, children)); ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, {"0", "1", "2", "3"}, {0, 1, 2, 3}); // with field names - ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, {"one"})); - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeSparse(*type_ids_, children, field_names)); + ASSERT_RAISES(Invalid, SparseUnionArray::Make(*type_ids_, children, {"one"})); + ASSERT_OK_AND_ASSIGN(result, SparseUnionArray::Make(*type_ids_, children, field_names)); ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, field_names, {0, 1, 2, 3}); // with type codes - ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*logical_type_ids_, children, + ASSERT_RAISES(Invalid, SparseUnionArray::Make(*logical_type_ids_, children, std::vector{0})); ASSERT_OK_AND_ASSIGN(result, - UnionArray::MakeSparse(*logical_type_ids_, children, type_codes_)); + SparseUnionArray::Make(*logical_type_ids_, children, type_codes_)); ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, {"0", "1", "2", "3"}, type_codes_); // with field names and type codes - ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*logical_type_ids_, children, {"one"}, + ASSERT_RAISES(Invalid, SparseUnionArray::Make(*logical_type_ids_, children, {"one"}, type_codes_)); - ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeSparse(*logical_type_ids_, children, + ASSERT_OK_AND_ASSIGN(result, SparseUnionArray::Make(*logical_type_ids_, children, field_names, type_codes_)); ASSERT_OK(result->ValidateFull()); CheckUnionArray(checked_cast(*result), UnionMode::SPARSE, field_names, @@ -263,15 +263,15 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) { // Invalid type codes ASSERT_OK_AND_ASSIGN( - result, UnionArray::MakeSparse(*invalid_type_ids1_, children, type_codes_)); + result, SparseUnionArray::Make(*invalid_type_ids1_, children, type_codes_)); ASSERT_RAISES(Invalid, result->ValidateFull()); ASSERT_OK_AND_ASSIGN( - result, UnionArray::MakeSparse(*invalid_type_ids2_, children, type_codes_)); + result, SparseUnionArray::Make(*invalid_type_ids2_, children, type_codes_)); ASSERT_RAISES(Invalid, result->ValidateFull()); // Invalid child length ArrayFromVector({0, 0, 0, 0, 0, -12, 0, 0, 0}, &children[3]); - ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children)); + ASSERT_RAISES(Invalid, SparseUnionArray::Make(*type_ids_, children)); } template @@ -396,8 +396,8 @@ class SparseUnionBuilderTest : public UnionBuilderTest { TEST_F(DenseUnionBuilderTest, Basics) { union_builder.reset(new DenseUnionBuilder( default_memory_pool(), {i8_builder, str_builder, dbl_builder}, - union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, - {I8, STR, DBL}, UnionMode::DENSE))); + dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); AppendBasics(); auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); @@ -407,7 +407,7 @@ TEST_F(DenseUnionBuilderTest, Basics) { auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); ASSERT_OK_AND_ASSIGN(auto expected, - UnionArray::MakeDense(*expected_types, *expected_offsets, + DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_i8, expected_str, expected_dbl}, {"i8", "str", "dbl"}, {I8, STR, DBL})); @@ -425,7 +425,7 @@ TEST_F(DenseUnionBuilderTest, InferredType) { auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]"); ASSERT_OK_AND_ASSIGN(auto expected, - UnionArray::MakeDense(*expected_types, *expected_offsets, + DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_i8, expected_str, expected_dbl}, {"i8", "str", "dbl"}, {I8, STR, DBL})); @@ -437,17 +437,17 @@ TEST_F(DenseUnionBuilderTest, ListOfInferredType) { std::shared_ptr actual; AppendListOfInferred(&actual); - auto expected_type = - list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, - {I8, STR, DBL}, UnionMode::DENSE)); + auto expected_type = list( + dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL})); ASSERT_EQ(expected_type->ToString(), actual->type()->ToString()); } TEST_F(SparseUnionBuilderTest, Basics) { union_builder.reset(new SparseUnionBuilder( default_memory_pool(), {i8_builder, str_builder, dbl_builder}, - union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, - {I8, STR, DBL}, UnionMode::SPARSE))); + sparse_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())}, + {I8, STR, DBL}))); AppendBasics(); @@ -460,7 +460,7 @@ TEST_F(SparseUnionBuilderTest, Basics) { ASSERT_OK_AND_ASSIGN( auto expected, - UnionArray::MakeSparse(*expected_types, {expected_i8, expected_str, expected_dbl}, + SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl}, {"i8", "str", "dbl"}, {I8, STR, DBL})); ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); @@ -479,7 +479,7 @@ TEST_F(SparseUnionBuilderTest, InferredType) { ASSERT_OK_AND_ASSIGN( auto expected, - UnionArray::MakeSparse(*expected_types, {expected_i8, expected_str, expected_dbl}, + SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl}, {"i8", "str", "dbl"}, {I8, STR, DBL})); ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); @@ -491,8 +491,8 @@ TEST_F(SparseUnionBuilderTest, StructWithUnion) { StructBuilder builder(struct_({field("u", union_builder->type())}), default_memory_pool(), {union_builder}); ASSERT_EQ(union_builder->AppendChild(std::make_shared(), "i"), 0); - ASSERT_TRUE( - builder.type()->Equals(struct_({field("u", union_({field("i", int32())}, {0}))}))); + ASSERT_TRUE(builder.type()->Equals( + struct_({field("u", sparse_union({field("i", int32())}, {0}))}))); } } // namespace arrow diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index e9d87bd0430..dc61a0d4779 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -333,7 +333,7 @@ TEST(TestArrayView, SparseUnionAsStruct) { auto child1 = ArrayFromJSON(int16(), "[0, -1, 42]"); auto child2 = ArrayFromJSON(int32(), "[0, 1069547520, -1071644672]"); auto indices = ArrayFromJSON(int8(), "[0, 0, 1]"); - ASSERT_OK_AND_ASSIGN(auto arr, UnionArray::MakeSparse(*indices, {child1, child2})); + ASSERT_OK_AND_ASSIGN(auto arr, SparseUnionArray::Make(*indices, {child1, child2})); ASSERT_OK(arr->ValidateFull()); auto ty1 = struct_({field("a", int8()), field("b", uint16()), field("c", float32())}); @@ -343,7 +343,7 @@ TEST(TestArrayView, SparseUnionAsStruct) { // With nulls indices = ArrayFromJSON(int8(), "[null, 0, 1]"); - ASSERT_OK_AND_ASSIGN(arr, UnionArray::MakeSparse(*indices, {child1, child2})); + ASSERT_OK_AND_ASSIGN(arr, SparseUnionArray::Make(*indices, {child1, child2})); ASSERT_OK(arr->ValidateFull()); expected = ArrayFromJSON(ty1, "[null, [0, 65535, 1.5], [1, 42, -2.5]]"); CheckView(arr, expected); @@ -352,7 +352,7 @@ TEST(TestArrayView, SparseUnionAsStruct) { // With nested nulls child1 = ArrayFromJSON(int16(), "[0, -1, null]"); child2 = ArrayFromJSON(int32(), "[0, null, -1071644672]"); - ASSERT_OK_AND_ASSIGN(arr, UnionArray::MakeSparse(*indices, {child1, child2})); + ASSERT_OK_AND_ASSIGN(arr, SparseUnionArray::Make(*indices, {child1, child2})); ASSERT_OK(arr->ValidateFull()); expected = ArrayFromJSON(ty1, "[null, [0, 65535, null], [1, null, -2.5]]"); CheckView(arr, expected); diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index ace91de65b8..a5e4c135bb3 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -39,22 +39,25 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i])); } - *out = ArrayData::Make(type(), length(), {null_bitmap, types, nullptr}, null_count_); + *out = ArrayData::Make(type(), length(), {null_bitmap, types}, null_count_); (*out)->child_data = std::move(child_data); return Status::OK(); } +Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out)); + (*out)->buffers.resize(3); + ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&(*out)->buffers[2])); + return Status::OK(); +} + BasicUnionBuilder::BasicUnionBuilder( - MemoryPool* pool, UnionMode::type mode, - const std::vector>& children, + MemoryPool* pool, const std::vector>& children, const std::shared_ptr& type) - : ArrayBuilder(pool), - child_fields_(children.size()), - mode_(mode), - types_builder_(pool) { - DCHECK_EQ(type->id(), Type::UNION); + : ArrayBuilder(pool), child_fields_(children.size()), types_builder_(pool) { const auto& union_type = checked_cast(*type); - DCHECK_EQ(union_type.mode(), mode); + mode_ = union_type.mode(); + DCHECK_EQ(children.size(), union_type.type_codes().size()); type_codes_ = union_type.type_codes(); @@ -73,9 +76,6 @@ BasicUnionBuilder::BasicUnionBuilder( } } -BasicUnionBuilder::BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) - : BasicUnionBuilder(pool, mode, {}, union_(mode)) {} - int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_child, const std::string& field_name) { children_.push_back(new_child); @@ -96,7 +96,8 @@ std::shared_ptr BasicUnionBuilder::type() const { for (size_t i = 0; i < child_fields.size(); ++i) { child_fields[i] = child_fields_[i]->WithType(children_[i]->type()); } - return union_(std::move(child_fields), type_codes_, mode_); + return mode_ == UnionMode::SPARSE ? sparse_union(std::move(child_fields), type_codes_) + : dense_union(std::move(child_fields), type_codes_); } int8_t BasicUnionBuilder::NextTypeId() { diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index aba707e8383..e3376734d35 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -57,14 +57,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { std::shared_ptr type() const override; protected: - /// Use this constructor to initialize the UnionBuilder with no child builders, - /// allowing type to be inferred. You will need to call AppendChild for each of the - /// children builders you want to use. - BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode); - - /// Use this constructor to specify the type explicitly. - /// You can still add child builders to the union after using this constructor - BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode, + BasicUnionBuilder(MemoryPool* pool, const std::vector>& children, const std::shared_ptr& type); @@ -89,15 +82,14 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. explicit DenseUnionBuilder(MemoryPool* pool) - : BasicUnionBuilder(pool, UnionMode::DENSE), offsets_builder_(pool) {} + : BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor DenseUnionBuilder(MemoryPool* pool, const std::vector>& children, const std::shared_ptr& type) - : BasicUnionBuilder(pool, UnionMode::DENSE, children, type), - offsets_builder_(pool) {} + : BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {} Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); @@ -130,10 +122,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return AppendToBitmap(true); } - Status FinishInternal(std::shared_ptr* out) override { - ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out)); - return offsets_builder_.Finish(&(*out)->buffers[2]); - } + Status FinishInternal(std::shared_ptr* out) override; private: TypedBufferBuilder offsets_builder_; @@ -148,14 +137,14 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. explicit SparseUnionBuilder(MemoryPool* pool) - : BasicUnionBuilder(pool, UnionMode::SPARSE) {} + : BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {} /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor SparseUnionBuilder(MemoryPool* pool, const std::vector>& children, const std::shared_ptr& type) - : BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {} + : BasicUnionBuilder(pool, children, type) {} Status AppendNull() final { ARROW_RETURN_NOT_OK(types_builder_.Append(0)); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 04e3a8f4c2d..9c887eb25b4 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -218,7 +218,7 @@ TEST_F(ConcatenateTest, DISABLED_UnionType) { auto bar = this->GeneratePrimitive(size, null_probability); auto baz = this->GeneratePrimitive(size, null_probability); auto type_ids = rng_.Numeric(size, 0, 2, null_probability); - ASSERT_OK_AND_ASSIGN(*out, UnionArray::MakeSparse(*type_ids, {foo, bar, baz})); + ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz})); }); // dense mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { @@ -228,7 +228,7 @@ TEST_F(ConcatenateTest, DISABLED_UnionType) { auto type_ids = rng_.Numeric(size, 0, 2, null_probability); auto value_offsets = rng_.Numeric(size, 0, size, 0); ASSERT_OK_AND_ASSIGN( - *out, UnionArray::MakeDense(*type_ids, *value_offsets, {foo, bar, baz})); + *out, DenseUnionArray::Make(*type_ids, *value_offsets, {foo, bar, baz})); }); } diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 377f77a2792..6b98feba02a 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -574,7 +574,7 @@ class MakeFormatterImpl { using UnionImpl::UnionImpl; void operator()(const Array& array, int64_t index, std::ostream* os) { - const auto& union_array = checked_cast(array); + const auto& union_array = checked_cast(array); DoFormat(union_array, index, index, os); } }; @@ -583,7 +583,7 @@ class MakeFormatterImpl { using UnionImpl::UnionImpl; void operator()(const Array& array, int64_t index, std::ostream* os) { - const auto& union_array = checked_cast(array); + const auto& union_array = checked_cast(array); DoFormat(union_array, index, union_array.raw_value_offsets()[index], os); } }; diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 827c8894ac6..bfe46d4762c 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -350,7 +350,7 @@ TEST_F(DiffTest, BasicsWithStructs) { } TEST_F(DiffTest, BasicsWithUnions) { - auto type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5}); + auto type = sparse_union({field("foo", utf8()), field("bar", int32())}, {2, 5}); // insert one base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])"); @@ -509,8 +509,8 @@ TEST_F(DiffTest, UnifiedDiffFormatter) { )"); // unions - for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) { - type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5}, mode); + for (auto union_ : UnionTypeFactories()) { + type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5}); base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])"); target_ = ArrayFromJSON(type, R"([[2, "!"], [2, "3"], [5, 13]])"); AssertDiffAndFormat(R"( diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 2760258897d..a30849a8f34 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -233,12 +233,8 @@ class NullArrayFactory { } Status Visit(const UnionType& type) { - if (type.mode() == UnionMode::DENSE) { - out_->buffers.resize(3, buffer_); - } else { - out_->buffers = {buffer_, buffer_, nullptr}; - } - + auto n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3; + out_->buffers.resize(n_buffers, buffer_); for (int i = 0; i < type_->num_fields(); ++i) { ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(i, length_)); } diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index e17243e81ad..f4229616508 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -438,7 +438,8 @@ struct ValidateArrayDataVisitor { } // Check offsets - const int32_t* offsets = array.raw_value_offsets(); + const int32_t* offsets = + checked_cast(array).raw_value_offsets(); for (int64_t i = 0; i < array.length(); ++i) { if (array.IsNull(i)) { continue; diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f314c2908a6..6be07d6ca75 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -80,6 +80,19 @@ struct DictionaryBuilderCase { out->reset(new TYPE_CLASS##Builder(type, pool)); \ return Status::OK(); +Result>> FieldBuilders(const DataType& type, + MemoryPool* pool) { + std::vector> field_builders; + + for (const auto& field : type.fields()) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, field->type(), &builder)); + field_builders.emplace_back(std::move(builder)); + } + + return field_builders; +} + Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out) { switch (type->id()) { @@ -158,33 +171,20 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } case Type::STRUCT: { - const std::vector>& fields = type->fields(); - std::vector> field_builders; - - for (const auto& it : fields) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); - field_builders.emplace_back(std::move(builder)); - } + ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool)); out->reset(new StructBuilder(type, pool, std::move(field_builders))); return Status::OK(); } - case Type::UNION: { - const auto& union_type = internal::checked_cast(*type); - const std::vector>& fields = type->fields(); - std::vector> field_builders; - - for (const auto& it : fields) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); - field_builders.emplace_back(std::move(builder)); - } - if (union_type.mode() == UnionMode::DENSE) { - out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type)); - } else { - out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type)); - } + case Type::SPARSE_UNION: { + ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool)); + out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type)); + return Status::OK(); + } + + case Type::DENSE_UNION: { + ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool)); + out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type)); return Status::OK(); } diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index c0e8e1d2316..45355435aa7 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1067,7 +1067,11 @@ struct SchemaImporter { c_struct_->format, "'"); } } - type_ = union_(std::move(fields), std::move(type_codes), mode); + if (mode == UnionMode::SPARSE) { + type_ = sparse_union(std::move(fields), std::move(type_codes)); + } else { + type_ = dense_union(std::move(fields), std::move(type_codes)); + } return Status::OK(); } @@ -1322,14 +1326,16 @@ struct ArrayImporter { Status Visit(const UnionType& type) { auto mode = type.mode(); - RETURN_NOT_OK(CheckNumBuffers(3)); + if (mode == UnionMode::SPARSE) { + RETURN_NOT_OK(CheckNumBuffers(2)); + } else { + RETURN_NOT_OK(CheckNumBuffers(3)); + } RETURN_NOT_OK(AllocateArrayData()); RETURN_NOT_OK(ImportNullBitmap()); RETURN_NOT_OK(ImportFixedSizeBuffer(1, sizeof(int8_t))); if (mode == UnionMode::DENSE) { RETURN_NOT_OK(ImportFixedSizeBuffer(2, sizeof(int32_t))); - } else { - RETURN_NOT_OK(ImportUnusedBuffer(2)); } return Status::OK(); } @@ -1419,8 +1425,6 @@ struct ArrayImporter { return ImportBuffer(buffer_id, buffer_size); } - Status ImportUnusedBuffer(int32_t buffer_id) { return ImportBuffer(buffer_id, 0); } - Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) { // Compute visible size of buffer int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset); diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 33d49cc575c..43273a31c9a 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -347,13 +347,13 @@ TEST_F(TestSchemaExport, Union) { // Dense auto field_a = field("a", int8()); auto field_b = field("b", boolean(), /*nullable=*/false); - auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE); + auto type = dense_union({field_a, field_b}, {42, 43}); TestNested(type, {"+ud:42,43", "c", "b"}, {"", "a", "b"}, {ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, 0}); // Sparse field_a = field("a", int8(), /*nullable=*/false); field_b = field("b", boolean()); - type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE); + type = sparse_union({field_a, field_b}, {42, 43}); TestNested(type, {"+us:42,43", "c", "b"}, {"", "a", "b"}, {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}); } @@ -800,12 +800,12 @@ TEST_F(TestArrayExport, Union) { // Dense auto field_a = field("a", int8()); auto field_b = field("b", boolean(), /*nullable=*/false); - auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE); + auto type = dense_union({field_a, field_b}, {42, 43}); TestNested(type, data); // Sparse field_a = field("a", int8(), /*nullable=*/false); field_b = field("b", boolean()); - type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE); + type = sparse_union({field_a, field_b}, {42, 43}); TestNested(type, data); } @@ -1297,7 +1297,7 @@ TEST_F(TestSchemaImport, Union) { FillPrimitive(AddChild(), "c", "ints"); FillStructLike("+us:43,42", 2); auto expected = - union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE); + sparse_union({field("strs", utf8()), field("ints", int8())}, {43, 42}); CheckImport(expected); // Dense @@ -1305,7 +1305,7 @@ TEST_F(TestSchemaImport, Union) { FillPrimitive(AddChild(), "c", "ints"); FillStructLike("+ud:43,42", 2); expected = - union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE); + dense_union({field("strs", utf8()), field("ints", int8())}, {43, 42}); CheckImport(expected); } @@ -1677,13 +1677,13 @@ class TestArrayImport : public ::testing::Test { c->children = NLastChildren(c->n_children, c); } - void FillUnionLike(struct ArrowArray* c, int64_t length, + void FillUnionLike(struct ArrowArray* c, UnionMode::type mode, int64_t length, int64_t null_count, int64_t offset, int64_t n_children, const void** buffers) { c->length = length; c->null_count = null_count; c->offset = offset; - c->n_buffers = 3; + c->n_buffers = mode == UnionMode::SPARSE ? 2 : 3; c->buffers = buffers; c->n_children = n_children; c->children = NLastChildren(c->n_children, c); @@ -1717,10 +1717,10 @@ class TestArrayImport : public ::testing::Test { FillStructLike(&c_struct_, length, null_count, offset, n_children, buffers); } - void FillUnionLike(int64_t length, int64_t null_count, + void FillUnionLike(UnionMode::type mode, int64_t length, int64_t null_count, int64_t offset, int64_t n_children, const void** buffers) { - FillUnionLike(&c_struct_, length, null_count, offset, n_children, buffers); + FillUnionLike(&c_struct_, mode, length, null_count, offset, n_children, buffers); } void CheckImport(const std::shared_ptr& expected) { @@ -2021,9 +2021,9 @@ TEST_F(TestArrayImport, Union) { // Sparse FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); FillPrimitive(AddChild(), 4, -1, 0, primitive_buffers_nulls1_8); - FillUnionLike(4, 0, 0, 2, sparse_union_buffers_no_nulls1); + FillUnionLike(UnionMode::SPARSE, 4, 0, 0, 2, sparse_union_buffers_no_nulls1); auto type = - union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE); + sparse_union({field("strs", utf8()), field("ints", int8())}, {43, 42}); auto expected = ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "bar"], [43, "quux"]])"); CheckImport(expected); @@ -2031,9 +2031,9 @@ TEST_F(TestArrayImport, Union) { // Dense FillStringLike(AddChild(), 2, 0, 0, string_buffers_no_nulls1); FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1_8); - FillUnionLike(5, 0, 0, 2, dense_union_buffers_no_nulls1); + FillUnionLike(UnionMode::DENSE, 5, 0, 0, 2, dense_union_buffers_no_nulls1); type = - union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE); + dense_union({field("strs", utf8()), field("ints", int8())}, {43, 42}); expected = ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "foo"], [43, ""], [42, 3]])"); CheckImport(expected); @@ -2405,9 +2405,9 @@ TEST_F(TestSchemaRoundtrip, Union) { auto f2 = field("f2", list(decimal(19, 4))); auto type_codes = std::vector{42, 43}; - TestWithTypeFactory([&]() { return union_({f1, f2}, type_codes, UnionMode::SPARSE); }); + TestWithTypeFactory([&]() { return sparse_union({f1, f2}, type_codes); }); f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); - TestWithTypeFactory([&]() { return union_({f1, f2}, type_codes, UnionMode::DENSE); }); + TestWithTypeFactory([&]() { return dense_union({f1, f2}, type_codes); }); } TEST_F(TestSchemaRoundtrip, Dictionary) { diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index db4033863fc..7600746026e 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -262,8 +262,10 @@ class RangeEqualsVisitor { return false; } } else { - const int32_t offset = left.raw_value_offsets()[i]; - const int32_t o_offset = right.raw_value_offsets()[o_i]; + const int32_t offset = + checked_cast(left).raw_value_offsets()[i]; + const int32_t o_offset = + checked_cast(right).raw_value_offsets()[o_i]; if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset, right.field(child_num))) { return false; diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 3ea3570003f..427ad2b0c54 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -31,8 +31,8 @@ void ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) { ctx->SetStatus(Status::NotImplemented("This kernel is malformed")); } -void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec exec, - const ExecBatch& batch, Datum* out) { +void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec exec, const ExecBatch& batch, + Datum* out) { ExecBatch flipped_batch = batch; std::swap(flipped_batch.values[0], flipped_batch.values[1]); exec(ctx, flipped_batch, out); @@ -74,10 +74,15 @@ static void InitStaticData() { Extend(g_floating_types, &g_numeric_types); // Temporal types - g_temporal_types = {date32(), date64(), time32(TimeUnit::SECOND), - time32(TimeUnit::MILLI), time64(TimeUnit::MICRO), - time64(TimeUnit::NANO), timestamp(TimeUnit::SECOND), - timestamp(TimeUnit::MILLI), timestamp(TimeUnit::MICRO), + g_temporal_types = {date32(), + date64(), + time32(TimeUnit::SECOND), + time32(TimeUnit::MILLI), + time64(TimeUnit::MICRO), + time64(TimeUnit::NANO), + timestamp(TimeUnit::SECOND), + timestamp(TimeUnit::MILLI), + timestamp(TimeUnit::MICRO), timestamp(TimeUnit::NANO)}; // Base binary types (without FixedSizeBinary) @@ -142,19 +147,20 @@ const std::vector>& PrimitiveTypes() { const std::vector>& ExampleParametricTypes() { static DataTypeVector example_parametric_types = { - decimal(12, 2), - duration(TimeUnit::SECOND), - timestamp(TimeUnit::SECOND), - time32(TimeUnit::SECOND), - time64(TimeUnit::MICRO), - fixed_size_binary(0), - list(null()), - large_list(null()), - fixed_size_list(field("dummy", null()), 0), - struct_({}), - union_({}), - dictionary(int32(), null()), - map(null(), null())}; + decimal(12, 2), + duration(TimeUnit::SECOND), + timestamp(TimeUnit::SECOND), + time32(TimeUnit::SECOND), + time64(TimeUnit::MICRO), + fixed_size_binary(0), + list(null()), + large_list(null()), + fixed_size_list(field("dummy", null()), 0), + struct_({}), + sparse_union(FieldVector{}), + dense_union(FieldVector{}), + dictionary(int32(), null()), + map(null(), null())}; return example_parametric_types; } diff --git a/cpp/src/arrow/compute/kernels/vector_filter_test.cc b/cpp/src/arrow/compute/kernels/vector_filter_test.cc index a835417dd0f..327789127db 100644 --- a/cpp/src/arrow/compute/kernels/vector_filter_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_filter_test.cc @@ -488,8 +488,8 @@ TEST_F(TestFilterKernelWithStruct, FilterStruct) { class TestFilterKernelWithUnion : public TestFilterKernel {}; TEST_F(TestFilterKernelWithUnion, FilterUnion) { - for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) { - auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}, mode); + for (auto union_ : UnionTypeFactories()) { + auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}); auto union_json = R"([ null, [2, 222], diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index 6333ef3a671..8908b3b1628 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -494,28 +494,99 @@ class TakerImpl : public Taker { }; template -class TakerImpl : public Taker { +class TakerImpl : public Taker { public: using Taker::Taker; Status Init() override { - union_type_ = checked_cast(this->type_.get()); + union_type_ = checked_cast(this->type_.get()); + children_.resize(this->type_->num_fields()); - if (union_type_->mode() == UnionMode::SPARSE) { - sparse_children_.resize(this->type_->num_fields()); - } else { - dense_children_.resize(this->type_->num_fields()); - child_length_.resize(union_type_->max_type_code() + 1); + for (int i = 0; i < this->type_->num_fields(); ++i) { + RETURN_NOT_OK( + Taker::Make(this->type_->field(i)->type(), &children_[i])); } + return Status::OK(); + } + + Status SetContext(KernelContext* ctx) override { + pool_ = ctx->memory_pool(); + null_bitmap_builder_.reset(new TypedBufferBuilder(pool_)); + type_code_builder_.reset(new TypedBufferBuilder(pool_)); for (int i = 0; i < this->type_->num_fields(); ++i) { - if (union_type_->mode() == UnionMode::SPARSE) { - RETURN_NOT_OK(Taker::Make(this->type_->field(i)->type(), - &sparse_children_[i])); - } else { - RETURN_NOT_OK(Taker>::Make( - this->type_->field(i)->type(), &dense_children_[i])); - } + RETURN_NOT_OK(children_[i]->SetContext(ctx)); + } + return Status::OK(); + } + + Status Take(const Array& values, IndexSequence indices) override { + DCHECK(this->type_->Equals(values.type())); + const auto& union_array = checked_cast(values); + auto type_codes = union_array.raw_type_codes(); + + RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length())); + RETURN_NOT_OK(type_code_builder_->Reserve(indices.length())); + RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) { + null_bitmap_builder_->UnsafeAppend(is_valid); + type_code_builder_->UnsafeAppend(type_codes[index]); + return Status::OK(); + })); + + // bounds checking was done while appending to the null bitmap + indices.set_never_out_of_bounds(); + + for (int i = 0; i < this->type_->num_fields(); ++i) { + RETURN_NOT_OK(children_[i]->Take(*union_array.field(i), indices)); + } + return Status::OK(); + } + + Status Finish(std::shared_ptr* out) override { + auto null_count = null_bitmap_builder_->false_count(); + auto length = null_bitmap_builder_->length(); + std::shared_ptr null_bitmap, type_codes; + RETURN_NOT_OK(null_bitmap_builder_->Finish(&null_bitmap)); + RETURN_NOT_OK(type_code_builder_->Finish(&type_codes)); + + ArrayVector fields(this->type_->num_fields()); + for (int i = 0; i < this->type_->num_fields(); ++i) { + RETURN_NOT_OK(children_[i]->Finish(&fields[i])); + } + + out->reset(new SparseUnionArray(this->type_, length, std::move(fields), type_codes, + null_bitmap, null_count)); + return Status::OK(); + } + + protected: + int32_t* GetInt32(const std::shared_ptr& b) const { + return reinterpret_cast(b->mutable_data()); + } + + const SparseUnionType* union_type_ = nullptr; + MemoryPool* pool_ = nullptr; + std::unique_ptr> null_bitmap_builder_; + std::unique_ptr> type_code_builder_; + std::unique_ptr> offset_builder_; + std::vector>> children_; + std::vector child_length_; +}; + +template +class TakerImpl : public Taker { + public: + using Taker::Taker; + + Status Init() override { + union_type_ = checked_cast(this->type_.get()); + + dense_children_.resize(this->type_->num_fields()); + child_length_.resize(union_type_->max_type_code() + 1); + + for (int i = 0; i < this->type_->num_fields(); ++i) { + RETURN_NOT_OK(Taker>::Make( + this->type_->field(i)->type(), &dense_children_[i])); } return Status::OK(); @@ -525,18 +596,12 @@ class TakerImpl : public Taker { pool_ = ctx->memory_pool(); null_bitmap_builder_.reset(new TypedBufferBuilder(pool_)); type_code_builder_.reset(new TypedBufferBuilder(pool_)); + offset_builder_.reset(new TypedBufferBuilder(pool_)); - if (union_type_->mode() == UnionMode::DENSE) { - offset_builder_.reset(new TypedBufferBuilder(pool_)); - std::fill(child_length_.begin(), child_length_.end(), 0); - } + std::fill(child_length_.begin(), child_length_.end(), 0); for (int i = 0; i < this->type_->num_fields(); ++i) { - if (union_type_->mode() == UnionMode::SPARSE) { - RETURN_NOT_OK(sparse_children_[i]->SetContext(ctx)); - } else { - RETURN_NOT_OK(dense_children_[i]->SetContext(ctx)); - } + RETURN_NOT_OK(dense_children_[i]->SetContext(ctx)); } return Status::OK(); @@ -547,81 +612,65 @@ class TakerImpl : public Taker { const auto& union_array = checked_cast(values); auto type_codes = union_array.raw_type_codes(); - if (union_type_->mode() == UnionMode::SPARSE) { - RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length())); - RETURN_NOT_OK(type_code_builder_->Reserve(indices.length())); - RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) { - null_bitmap_builder_->UnsafeAppend(is_valid); - type_code_builder_->UnsafeAppend(type_codes[index]); - return Status::OK(); - })); + // Gathering from the offsets into child arrays is a bit tricky. + std::vector child_counts(union_type_->max_type_code() + 1); + RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length())); + RETURN_NOT_OK(type_code_builder_->Reserve(indices.length())); + RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) { + null_bitmap_builder_->UnsafeAppend(is_valid); + type_code_builder_->UnsafeAppend(type_codes[index]); + child_counts[type_codes[index]] += is_valid; + return Status::OK(); + })); - // bounds checking was done while appending to the null bitmap - indices.set_never_out_of_bounds(); + // bounds checking was done while appending to the null bitmap + indices.set_never_out_of_bounds(); - for (int i = 0; i < this->type_->num_fields(); ++i) { - RETURN_NOT_OK(sparse_children_[i]->Take(*union_array.field(i), indices)); - } - } else { - // Gathering from the offsets into child arrays is a bit tricky. - std::vector child_counts(union_type_->max_type_code() + 1); - RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length())); - RETURN_NOT_OK(type_code_builder_->Reserve(indices.length())); - RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) { - null_bitmap_builder_->UnsafeAppend(is_valid); - type_code_builder_->UnsafeAppend(type_codes[index]); - child_counts[type_codes[index]] += is_valid; - return Status::OK(); - })); - - // bounds checking was done while appending to the null bitmap - indices.set_never_out_of_bounds(); - - // Allocate temporary storage for the offsets of all valid slots - auto child_offsets_storage_size = - std::accumulate(child_counts.begin(), child_counts.end(), 0); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr child_offsets_storage, - AllocateBuffer(child_offsets_storage_size * sizeof(int32_t), pool_)); - - // Partition offsets by type_code: child_offset_partitions[type_code] will - // point to storage for child_counts[type_code] offsets - std::vector child_offset_partitions(child_counts.size()); - auto child_offsets_storage_data = GetInt32(child_offsets_storage); - for (auto type_code : union_type_->type_codes()) { - child_offset_partitions[type_code] = child_offsets_storage_data; - child_offsets_storage_data += child_counts[type_code]; - } - DCHECK_EQ(child_offsets_storage_data - GetInt32(child_offsets_storage), - child_offsets_storage_size); - - // Fill child_offsets_storage with the taken offsets - RETURN_NOT_OK(offset_builder_->Reserve(indices.length())); - RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) { - auto type_code = type_codes[index]; - if (is_valid) { - offset_builder_->UnsafeAppend(child_length_[type_code]++); - *child_offset_partitions[type_code] = union_array.value_offset(index); - ++child_offset_partitions[type_code]; - } else { - offset_builder_->UnsafeAppend(0); - } - return Status::OK(); - })); - - // Take from each child at those offsets - int64_t taken_offset_begin = 0; - for (int i = 0; i < this->type_->num_fields(); ++i) { - auto type_code = union_type_->type_codes()[i]; - auto length = child_counts[type_code]; - Int32Array taken_offsets(length, SliceBuffer(child_offsets_storage, - sizeof(int32_t) * taken_offset_begin, - sizeof(int32_t) * length)); - ArrayIndexSequence child_indices(taken_offsets); - child_indices.set_never_out_of_bounds(); - RETURN_NOT_OK(dense_children_[i]->Take(*union_array.field(i), child_indices)); - taken_offset_begin += length; + // Allocate temporary storage for the offsets of all valid slots + auto child_offsets_storage_size = + std::accumulate(child_counts.begin(), child_counts.end(), 0); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr child_offsets_storage, + AllocateBuffer(child_offsets_storage_size * sizeof(int32_t), pool_)); + + // Partition offsets by type_code: child_offset_partitions[type_code] will + // point to storage for child_counts[type_code] offsets + std::vector child_offset_partitions(child_counts.size()); + auto child_offsets_storage_data = GetInt32(child_offsets_storage); + for (auto type_code : union_type_->type_codes()) { + child_offset_partitions[type_code] = child_offsets_storage_data; + child_offsets_storage_data += child_counts[type_code]; + } + DCHECK_EQ(child_offsets_storage_data - GetInt32(child_offsets_storage), + child_offsets_storage_size); + + // Fill child_offsets_storage with the taken offsets + RETURN_NOT_OK(offset_builder_->Reserve(indices.length())); + RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) { + auto type_code = type_codes[index]; + if (is_valid) { + offset_builder_->UnsafeAppend(child_length_[type_code]++); + *child_offset_partitions[type_code] = + checked_cast(union_array).value_offset(index); + ++child_offset_partitions[type_code]; + } else { + offset_builder_->UnsafeAppend(0); } + return Status::OK(); + })); + + // Take from each child at those offsets + int64_t taken_offset_begin = 0; + for (int i = 0; i < this->type_->num_fields(); ++i) { + auto type_code = union_type_->type_codes()[i]; + auto length = child_counts[type_code]; + Int32Array taken_offsets( + length, SliceBuffer(child_offsets_storage, sizeof(int32_t) * taken_offset_begin, + sizeof(int32_t) * length)); + ArrayIndexSequence child_indices(taken_offsets); + child_indices.set_never_out_of_bounds(); + RETURN_NOT_OK(dense_children_[i]->Take(*union_array.field(i), child_indices)); + taken_offset_begin += length; } return Status::OK(); @@ -630,26 +679,18 @@ class TakerImpl : public Taker { Status Finish(std::shared_ptr* out) override { auto null_count = null_bitmap_builder_->false_count(); auto length = null_bitmap_builder_->length(); - std::shared_ptr null_bitmap, type_codes; + std::shared_ptr null_bitmap, type_codes, offsets; RETURN_NOT_OK(null_bitmap_builder_->Finish(&null_bitmap)); RETURN_NOT_OK(type_code_builder_->Finish(&type_codes)); - - std::shared_ptr offsets; - if (union_type_->mode() == UnionMode::DENSE) { - RETURN_NOT_OK(offset_builder_->Finish(&offsets)); - } + RETURN_NOT_OK(offset_builder_->Finish(&offsets)); ArrayVector fields(this->type_->num_fields()); for (int i = 0; i < this->type_->num_fields(); ++i) { - if (union_type_->mode() == UnionMode::SPARSE) { - RETURN_NOT_OK(sparse_children_[i]->Finish(&fields[i])); - } else { - RETURN_NOT_OK(dense_children_[i]->Finish(&fields[i])); - } + RETURN_NOT_OK(dense_children_[i]->Finish(&fields[i])); } - out->reset(new UnionArray(this->type_, length, std::move(fields), type_codes, offsets, - null_bitmap, null_count)); + out->reset(new DenseUnionArray(this->type_, length, std::move(fields), type_codes, + offsets, null_bitmap, null_count)); return Status::OK(); } @@ -658,7 +699,7 @@ class TakerImpl : public Taker { return reinterpret_cast(b->mutable_data()); } - const UnionType* union_type_ = nullptr; + const DenseUnionType* union_type_ = nullptr; MemoryPool* pool_ = nullptr; std::unique_ptr> null_bitmap_builder_; std::unique_ptr> type_code_builder_; diff --git a/cpp/src/arrow/compute/kernels/vector_take_test.cc b/cpp/src/arrow/compute/kernels/vector_take_test.cc index 51d183c6013..a7568dfd7c5 100644 --- a/cpp/src/arrow/compute/kernels/vector_take_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_take_test.cc @@ -514,10 +514,9 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) { class TestTakeKernelWithUnion : public TestTakeKernel {}; // TODO: Restore Union take functionality - TEST_F(TestTakeKernelWithUnion, DISABLED_TakeUnion) { - for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) { - auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}, mode); + for (auto union_ : UnionTypeFactories()) { + auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}); auto union_json = R"([ null, [2, 222], diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index f658c3391ef..9d66002eee4 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -668,7 +668,8 @@ class ArrayWriter { WriteIntegerField("TYPE_ID", array.raw_type_codes(), array.length()); if (type.mode() == UnionMode::DENSE) { - WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length()); + auto offsets = checked_cast(array).raw_value_offsets(); + WriteIntegerField("OFFSET", offsets, array.length()); } std::vector> children; children.reserve(array.num_fields()); @@ -945,7 +946,11 @@ static Status GetUnion(const RjObject& json_type, type_codes.push_back(static_cast(val.GetInt())); } - *type = union_(children, type_codes, mode); + if (mode == UnionMode::SPARSE) { + *type = sparse_union(std::move(children), std::move(type_codes)); + } else { + *type = dense_union(std::move(children), std::move(type_codes)); + } return Status::OK(); } @@ -1471,7 +1476,6 @@ class ArrayReader { std::shared_ptr validity_buffer; std::shared_ptr type_id_buffer; - std::shared_ptr offsets_buffer; RETURN_NOT_OK(GetValidityBuffer(is_valid_, &null_count, &validity_buffer)); @@ -1480,18 +1484,24 @@ class ArrayReader { RETURN_NOT_OK( GetIntArray(json_type_ids->value.GetArray(), length_, &type_id_buffer)); - if (type.mode() == UnionMode::DENSE) { + std::vector> children; + RETURN_NOT_OK(GetChildren(obj_, type, &children)); + + if (type.mode() == UnionMode::SPARSE) { + result_ = std::make_shared( + type_, length_, children, type_id_buffer, validity_buffer, null_count); + } else { const auto& json_offsets = obj_.FindMember("OFFSET"); RETURN_NOT_ARRAY("OFFSET", json_offsets, obj_); + + std::shared_ptr offsets_buffer; RETURN_NOT_OK( GetIntArray(json_offsets->value.GetArray(), length_, &offsets_buffer)); - } - std::vector> children; - RETURN_NOT_OK(GetChildren(obj_, type, &children)); - - result_ = std::make_shared(type_, length_, children, type_id_buffer, - offsets_buffer, validity_buffer, null_count); + result_ = + std::make_shared(type_, length_, children, type_id_buffer, + offsets_buffer, validity_buffer, null_count); + } return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index e189fd166c0..1ca69bf85e4 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -805,7 +805,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) - SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::SPARSE_UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::DENSE_UNION, UnionConverter) SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTHS, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INTERVAL_DAY_TIME, DayTimeIntervalConverter) default: diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 5b21fd598b3..bf6098dfbb3 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -994,8 +994,8 @@ TEST(TestDenseUnion, Basics) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); - auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); - auto array = checked_pointer_cast( + auto type = dense_union({field_a, field_b}, {4, 8}); + auto array = checked_pointer_cast( ArrayFromJSON(type, "[null, [4, 122], [8, true], [4, null], null, [8, false]]")); auto expected_types = ArrayFromJSON(int8(), "[null, 4, 8, 4, null, 8]"); @@ -1004,7 +1004,7 @@ TEST(TestDenseUnion, Basics) { auto expected_b = ArrayFromJSON(boolean(), "[true, false]"); ASSERT_OK_AND_ASSIGN( - auto expected, UnionArray::MakeDense(*expected_types, *expected_offsets, + auto expected, DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_a, expected_b}, {"a", "b"}, {4, 8})); ASSERT_ARRAYS_EQUAL(*expected, *array); @@ -1019,7 +1019,7 @@ TEST(TestSparseUnion, Basics) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); - auto type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + auto type = sparse_union({field_a, field_b}, {4, 8}); auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]"); auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]"); @@ -1027,7 +1027,7 @@ TEST(TestSparseUnion, Basics) { auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]"); ASSERT_OK_AND_ASSIGN(auto expected, - UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, + SparseUnionArray::Make(*expected_types, {expected_a, expected_b}, {"a", "b"}, {4, 8})); ASSERT_ARRAYS_EQUAL(*expected, *array); @@ -1036,7 +1036,7 @@ TEST(TestSparseUnion, Basics) { TEST(TestDenseUnion, ListOfUnion) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); - auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + auto union_type = dense_union({field_a, field_b}, {4, 8}); auto list_type = list(union_type); auto array = checked_pointer_cast(ArrayFromJSON(list_type, @@ -1052,7 +1052,7 @@ TEST(TestDenseUnion, ListOfUnion) { ASSERT_OK_AND_ASSIGN( auto expected_values, - UnionArray::MakeDense(*expected_types, *expected_offsets, {expected_a, expected_b}, + DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_a, expected_b}, {"a", "b"}, {4, 8})); auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]"); ASSERT_OK_AND_ASSIGN(auto expected, @@ -1061,9 +1061,9 @@ TEST(TestDenseUnion, ListOfUnion) { ASSERT_ARRAYS_EQUAL(*expected, *array); // ensure that the array is as dense as we expect - auto array_values = checked_pointer_cast(array->values()); + auto array_values = checked_pointer_cast(array->values()); ASSERT_TRUE(array_values->value_offsets()->Equals( - *checked_pointer_cast(expected_values)->value_offsets())); + *checked_pointer_cast(expected_values)->value_offsets())); ASSERT_ARRAYS_EQUAL(*expected_a, *array_values->field(0)); ASSERT_ARRAYS_EQUAL(*expected_b, *array_values->field(1)); } @@ -1071,7 +1071,7 @@ TEST(TestDenseUnion, ListOfUnion) { TEST(TestSparseUnion, ListOfUnion) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); - auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + auto union_type = sparse_union({field_a, field_b}, {4, 8}); auto list_type = list(union_type); auto array = ArrayFromJSON(list_type, "[" @@ -1084,7 +1084,7 @@ TEST(TestSparseUnion, ListOfUnion) { auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]"); ASSERT_OK_AND_ASSIGN(auto expected_values, - UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, + SparseUnionArray::Make(*expected_types, {expected_a, expected_b}, {"a", "b"}, {4, 8})); auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]"); ASSERT_OK_AND_ASSIGN(auto expected, @@ -1099,8 +1099,8 @@ TEST(TestDenseUnion, UnionOfStructs) { field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), field("foxtrot", list(int8()))})), field("q", struct_({field("quebec", utf8())}))}; - auto type = union_(fields, {0, 23, 47}, UnionMode::DENSE); - auto array = checked_pointer_cast(ArrayFromJSON(type, R"([ + auto type = dense_union(fields, {0, 23, 47}); + auto array = checked_pointer_cast(ArrayFromJSON(type, R"([ [0, {"alpha": 0.0, "bravo": "charlie"}], [23, {"whiskey": 99}], [0, {"bravo": "mike"}], @@ -1122,7 +1122,7 @@ TEST(TestDenseUnion, UnionOfStructs) { ASSERT_OK_AND_ASSIGN( auto expected, - UnionArray::MakeDense(*expected_types, *expected_offsets, expected_fields, + DenseUnionArray::Make(*expected_types, *expected_offsets, expected_fields, {"ab", "wtf", "q"}, {0, 23, 47})); ASSERT_ARRAYS_EQUAL(*expected, *array); @@ -1141,7 +1141,7 @@ TEST(TestSparseUnion, UnionOfStructs) { field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), field("foxtrot", list(int8()))})), field("q", struct_({field("quebec", utf8())}))}; - auto type = union_(fields, {0, 23, 47}, UnionMode::SPARSE); + auto type = sparse_union(fields, {0, 23, 47}); auto array = ArrayFromJSON(type, R"([ [0, {"alpha": 0.0, "bravo": "charlie"}], [23, {"whiskey": 99}], @@ -1169,7 +1169,7 @@ TEST(TestSparseUnion, UnionOfStructs) { ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")}; ASSERT_OK_AND_ASSIGN(auto expected, - UnionArray::MakeSparse(*expected_types, expected_fields, + SparseUnionArray::Make(*expected_types, expected_fields, {"ab", "wtf", "q"}, {0, 23, 47})); ASSERT_ARRAYS_EQUAL(*expected, *array); @@ -1178,7 +1178,7 @@ TEST(TestSparseUnion, UnionOfStructs) { TEST(TestDenseUnion, Errors) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); - std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); + std::shared_ptr type = dense_union({field_a, field_b}, {4, 8}); std::shared_ptr array; ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]", &array)); @@ -1195,7 +1195,7 @@ TEST(TestDenseUnion, Errors) { TEST(TestSparseUnion, Errors) { auto field_a = field("a", int8()); auto field_b = field("b", boolean()); - std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); + std::shared_ptr type = sparse_union({field_a, field_b}, {4, 8}); std::shared_ptr array; ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]", &array)); diff --git a/cpp/src/arrow/ipc/json_test.cc b/cpp/src/arrow/ipc/json_test.cc index 944d3521cb9..e55a448b95e 100644 --- a/cpp/src/arrow/ipc/json_test.cc +++ b/cpp/src/arrow/ipc/json_test.cc @@ -399,8 +399,9 @@ TEST(TestJsonSchemaWriter, FlatTypes) { field("f15", date64()), field("f16", timestamp(TimeUnit::NANO)), field("f17", time64(TimeUnit::MICRO)), - field("f18", union_({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))}, - {0, 1}, UnionMode::DENSE)), + field("f18", + dense_union({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))}, + {0, 1})), field("f19", large_list(uint8())), field("f20", null()), }; diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 4ac20dbe11a..e1b14e227e5 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -172,7 +172,14 @@ Status UnionFromFlatbuffer(const flatbuf::Union* union_data, } } - return UnionType::Make(children, type_codes, mode).Value(out); + if (mode == UnionMode::SPARSE) { + ARROW_ASSIGN_OR_RAISE( + *out, SparseUnionType::Make(std::move(children), std::move(type_codes))); + } else { + ARROW_ASSIGN_OR_RAISE( + *out, DenseUnionType::Make(std::move(children), std::move(type_codes))); + } + return Status::OK(); } #define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED) \ diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 9393c442d76..92367083464 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -629,24 +629,24 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { CheckArray(a1); // Sparse Union - auto union_type = union_({field("f0", a0->type())}, {0}); + auto union_type = sparse_union({field("f0", a0->type())}, {0}); std::vector type_ids(a0->length()); std::shared_ptr ids_buffer; ASSERT_OK(CopyBufferFromVector(type_ids, default_memory_pool(), &ids_buffer)); - a1 = - std::make_shared(union_type, a0->length(), struct_children, ids_buffer); + a1 = std::make_shared(union_type, a0->length(), struct_children, + ids_buffer); CheckArray(a1); // Dense union - auto dense_union_type = union_({field("f0", a0->type())}, {0}, UnionMode::DENSE); + auto dense_union_type = dense_union({field("f0", a0->type())}, {0}); std::vector type_offsets; for (int32_t i = 0; i < a0->length(); ++i) { type_offsets.push_back(i); } std::shared_ptr offsets_buffer; ASSERT_OK(CopyBufferFromVector(type_offsets, default_memory_pool(), &offsets_buffer)); - a1 = std::make_shared(dense_union_type, a0->length(), struct_children, - ids_buffer, offsets_buffer); + a1 = std::make_shared(dense_union_type, a0->length(), struct_children, + ids_buffer, offsets_buffer); CheckArray(a1); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index dbd476e8caf..e20ee06f252 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -303,7 +303,8 @@ class ArrayLoader { } Status Visit(const UnionType& type) { - out_->buffers.resize(3); + int n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3; + out_->buffers.resize(n_buffers); RETURN_NOT_OK(LoadCommon()); if (out_->length > 0) { @@ -312,7 +313,7 @@ class ArrayLoader { RETURN_NOT_OK(GetBuffer(buffer_index_ + 1, &out_->buffers[2])); } } - buffer_index_ += type.mode() == UnionMode::DENSE ? 2 : 1; + buffer_index_ += n_buffers - 1; return LoadChildren(type.fields()); } diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 00c39420f7d..608c2f1548e 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -435,15 +435,12 @@ Status MakeStruct(std::shared_ptr* out) { Status MakeUnion(std::shared_ptr* out) { // Define schema - std::vector> union_types( + std::vector> union_fields( {field("u0", int32()), field("u1", uint8())}); std::vector type_codes = {5, 10}; - auto sparse_type = - std::make_shared(union_types, type_codes, UnionMode::SPARSE); - - auto dense_type = - std::make_shared(union_types, type_codes, UnionMode::DENSE); + auto sparse_type = sparse_union(union_fields, type_codes); + auto dense_type = dense_union(union_fields, type_codes); auto f0 = field("sparse_nonnull", sparse_type, false); auto f1 = field("sparse", sparse_type); @@ -483,14 +480,14 @@ Status MakeUnion(std::shared_ptr* out) { ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); // construct individual nullable/non-nullable struct arrays - auto sparse_no_nulls = - std::make_shared(sparse_type, length, sparse_children, type_ids_buffer); - auto sparse = std::make_shared(sparse_type, length, sparse_children, - type_ids_buffer, NULLPTR, null_bitmap, 1); + auto sparse_no_nulls = std::make_shared( + sparse_type, length, sparse_children, type_ids_buffer); + auto sparse = std::make_shared(sparse_type, length, sparse_children, + type_ids_buffer, null_bitmap, 1); auto dense = - std::make_shared(dense_type, length, dense_children, type_ids_buffer, - offsets_buffer, null_bitmap, 1); + std::make_shared(dense_type, length, dense_children, + type_ids_buffer, offsets_buffer, null_bitmap, 1); // construct batch std::vector> arrays = {sparse_no_nulls, sparse, dense}; diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 76c5ad4cd25..d0d7deb65b4 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -398,7 +398,7 @@ class RecordBatchSerializer { return Status::OK(); } - Status Visit(const UnionArray& array) { + Status Visit(const SparseUnionArray& array) { const int64_t offset = array.offset(); const int64_t length = array.length(); @@ -409,91 +409,103 @@ class RecordBatchSerializer { out_->body_buffers.emplace_back(type_codes); --max_recursion_depth_; - if (array.mode() == UnionMode::DENSE) { - const auto& type = checked_cast(*array.type()); - - std::shared_ptr value_offsets; - RETURN_NOT_OK(GetTruncatedBuffer( - offset, length, static_cast(sizeof(int32_t)), array.value_offsets(), - options_.memory_pool, &value_offsets)); - - // The Union type codes are not necessary 0-indexed - int8_t max_code = 0; - for (int8_t code : type.type_codes()) { - if (code > max_code) { - max_code = code; - } - } + for (int i = 0; i < array.num_fields(); ++i) { + // Sparse union, slicing is done for us by field() + RETURN_NOT_OK(VisitArray(*array.field(i))); + } + ++max_recursion_depth_; + return Status::OK(); + } - // Allocate an array of child offsets. Set all to -1 to indicate that we - // haven't observed a first occurrence of a particular child yet - std::vector child_offsets(max_code + 1, -1); - std::vector child_lengths(max_code + 1, 0); + Status Visit(const DenseUnionArray& array) { + const int64_t offset = array.offset(); + const int64_t length = array.length(); - if (offset != 0) { - // This is an unpleasant case. Because the offsets are different for - // each child array, when we have a sliced array, we need to "rebase" - // the value_offsets for each array - - const int32_t* unshifted_offsets = array.raw_value_offsets(); - const int8_t* type_codes = array.raw_type_codes(); - - // Allocate the shifted offsets - ARROW_ASSIGN_OR_RAISE( - auto shifted_offsets_buffer, - AllocateBuffer(length * sizeof(int32_t), options_.memory_pool)); - int32_t* shifted_offsets = - reinterpret_cast(shifted_offsets_buffer->mutable_data()); - - // Offsets may not be ascending, so we need to find out the start offset - // for each child - for (int64_t i = 0; i < length; ++i) { - const uint8_t code = type_codes[i]; - if (child_offsets[code] == -1) { - child_offsets[code] = unshifted_offsets[i]; - } else { - child_offsets[code] = std::min(child_offsets[code], unshifted_offsets[i]); - } - } + std::shared_ptr type_codes; + RETURN_NOT_OK(GetTruncatedBuffer( + offset, length, static_cast(sizeof(UnionArray::type_code_t)), + array.type_codes(), options_.memory_pool, &type_codes)); + out_->body_buffers.emplace_back(type_codes); - // Now compute shifted offsets by subtracting child offset - for (int64_t i = 0; i < length; ++i) { - const int8_t code = type_codes[i]; - shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code]; - // Update the child length to account for observed value - child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1); - } + --max_recursion_depth_; + const auto& type = checked_cast(*array.type()); - value_offsets = std::move(shifted_offsets_buffer); + std::shared_ptr value_offsets; + RETURN_NOT_OK( + GetTruncatedBuffer(offset, length, static_cast(sizeof(int32_t)), + array.value_offsets(), options_.memory_pool, &value_offsets)); + + // The Union type codes are not necessary 0-indexed + int8_t max_code = 0; + for (int8_t code : type.type_codes()) { + if (code > max_code) { + max_code = code; } - out_->body_buffers.emplace_back(value_offsets); - - // Visit children and slice accordingly - for (int i = 0; i < type.num_fields(); ++i) { - std::shared_ptr child = array.field(i); - - // TODO: ARROW-809, for sliced unions, tricky to know how much to - // truncate the children. For now, we are truncating the children to be - // no longer than the parent union. - if (offset != 0) { - const int8_t code = type.type_codes()[i]; - const int64_t child_offset = child_offsets[code]; - const int64_t child_length = child_lengths[code]; - - if (child_offset > 0) { - child = child->Slice(child_offset, child_length); - } else if (child_length < child->length()) { - // This case includes when child is not encountered at all - child = child->Slice(0, child_length); - } + } + + // Allocate an array of child offsets. Set all to -1 to indicate that we + // haven't observed a first occurrence of a particular child yet + std::vector child_offsets(max_code + 1, -1); + std::vector child_lengths(max_code + 1, 0); + + if (offset != 0) { + // This is an unpleasant case. Because the offsets are different for + // each child array, when we have a sliced array, we need to "rebase" + // the value_offsets for each array + + const int32_t* unshifted_offsets = array.raw_value_offsets(); + const int8_t* type_codes = array.raw_type_codes(); + + // Allocate the shifted offsets + ARROW_ASSIGN_OR_RAISE( + auto shifted_offsets_buffer, + AllocateBuffer(length * sizeof(int32_t), options_.memory_pool)); + int32_t* shifted_offsets = + reinterpret_cast(shifted_offsets_buffer->mutable_data()); + + // Offsets may not be ascending, so we need to find out the start offset + // for each child + for (int64_t i = 0; i < length; ++i) { + const uint8_t code = type_codes[i]; + if (child_offsets[code] == -1) { + child_offsets[code] = unshifted_offsets[i]; + } else { + child_offsets[code] = std::min(child_offsets[code], unshifted_offsets[i]); } - RETURN_NOT_OK(VisitArray(*child)); } - } else { - for (int i = 0; i < array.num_fields(); ++i) { - // Sparse union, slicing is done for us by field() - RETURN_NOT_OK(VisitArray(*array.field(i))); + + // Now compute shifted offsets by subtracting child offset + for (int64_t i = 0; i < length; ++i) { + const int8_t code = type_codes[i]; + shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code]; + // Update the child length to account for observed value + child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1); + } + + value_offsets = std::move(shifted_offsets_buffer); + } + out_->body_buffers.emplace_back(value_offsets); + + // Visit children and slice accordingly + for (int i = 0; i < type.num_fields(); ++i) { + std::shared_ptr child = array.field(i); + + // TODO: ARROW-809, for sliced unions, tricky to know how much to + // truncate the children. For now, we are truncating the children to be + // no longer than the parent union. + if (offset != 0) { + const int8_t code = type.type_codes()[i]; + const int64_t child_offset = child_offsets[code]; + const int64_t child_length = child_lengths[code]; + + if (child_offset > 0) { + child = child->Slice(child_offset, child_length); + } else if (child_length < child->length()) { + // This case includes when child is not encountered at all + child = child->Slice(0, child_length); + } } + RETURN_NOT_OK(VisitArray(*child)); } ++max_recursion_depth_; return Status::OK(); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index d47f2d75391..70b6fd0419b 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -353,8 +353,9 @@ class ArrayPrinter : public PrettyPrinter { if (array.mode() == UnionMode::DENSE) { Newline(); Write("-- value_offsets: "); - Int32Array value_offsets(array.length(), array.value_offsets(), nullptr, 0, - array.offset()); + Int32Array value_offsets( + array.length(), checked_cast(array).value_offsets(), + nullptr, 0, array.offset()); RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + options_.indent_size, sink_)); } diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 261c06bcb32..dc0a287b067 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -981,7 +981,7 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_base_of::value || std::is_same::value || - std::is_same::value, + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc index 97080df4859..c7d99d23268 100644 --- a/cpp/src/arrow/python/deserialize.cc +++ b/cpp/src/arrow/python/deserialize.cc @@ -249,7 +249,7 @@ Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_ const SerializedPyObject& blobs, CreateSequenceFn&& create_sequence, SetItemFn&& set_item, PyObject** out) { - const auto& data = checked_cast(array); + const auto& data = checked_cast(array); OwnedRef result(create_sequence(stop_idx - start_idx)); RETURN_IF_PYERROR(); const int8_t* type_codes = data.raw_type_codes(); diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 89312bb08fb..9d10041e3d2 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -301,8 +301,8 @@ class SequenceBuilder { class DictBuilder { public: explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) { - builder_.reset(new StructBuilder(struct_({field("keys", union_(UnionMode::DENSE)), - field("vals", union_(UnionMode::DENSE))}), + builder_.reset(new StructBuilder(struct_({field("keys", dense_union(FieldVector{})), + field("vals", dense_union(FieldVector{}))}), pool, {keys_.builder(), vals_.builder()})); } diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index cc32d8aa549..030b2cc2a5a 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -470,7 +470,12 @@ struct FromTypeVisitor { return Status::Invalid("attempting to cast scalar of type null to ", *to_type_); } - Status Visit(const UnionType&) { return Status::NotImplemented("cast to ", *to_type_); } + Status Visit(const SparseUnionType&) { + return Status::NotImplemented("cast to ", *to_type_); + } + Status Visit(const DenseUnionType&) { + return Status::NotImplemented("cast to ", *to_type_); + } Status Visit(const DictionaryType&) { return Status::NotImplemented("cast to ", *to_type_); } @@ -499,7 +504,10 @@ struct ToTypeVisitor { return Status::OK(); } - Status Visit(const UnionType&) { + Status Visit(const SparseUnionType&) { + return Status::NotImplemented("cast from ", *from_.type); + } + Status Visit(const DenseUnionType&) { return Status::NotImplemented("cast from ", *from_.type); } Status Visit(const DictionaryType&) { diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 5caf04d86d8..57751e7a077 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -373,7 +373,16 @@ struct ARROW_EXPORT StructScalar : public Scalar { struct ARROW_EXPORT UnionScalar : public Scalar { using Scalar::Scalar; - using TypeClass = UnionType; +}; + +struct ARROW_EXPORT SparseUnionScalar : public UnionScalar { + using UnionScalar::UnionScalar; + using TypeClass = SparseUnionType; +}; + +struct ARROW_EXPORT DenseUnionScalar : public UnionScalar { + using UnionScalar::UnionScalar; + using TypeClass = DenseUnionType; }; struct ARROW_EXPORT DictionaryScalar : public Scalar { diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 94cd8679450..6022f7eda04 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -169,6 +169,11 @@ Result> ArrayFromBuilderVisitor( return ArrayFromBuilderVisitor(type, length, length, std::forward(fn)); } +static inline std::vector (*)(FieldVector, std::vector)> +UnionTypeFactories() { + return {sparse_union, dense_union}; +} + // Get a TCP port number to listen on. This is a different number every time, // as reusing the same port across tests can produce spurious bind errors on // Windows. diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5aef34b1be7..a6b383b0c5f 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -40,6 +40,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" +#include "arrow/util/range.h" #include "arrow/util/vector.h" #include "arrow/visitor_inline.h" @@ -67,7 +68,9 @@ constexpr Type::type StructType::type_id; constexpr Type::type Decimal128Type::type_id; -constexpr Type::type UnionType::type_id; +constexpr Type::type SparseUnionType::type_id; + +constexpr Type::type DenseUnionType::type_id; constexpr Type::type Date32Type::type_id; @@ -141,8 +144,10 @@ std::string ToString(Type::type id) { return "LIST"; case Type::STRUCT: return "STRUCT"; - case Type::UNION: - return "UNION"; + case Type::SPARSE_UNION: + return "SPARSE_UNION"; + case Type::DENSE_UNION: + return "DENSE_UNION"; case Type::DICTIONARY: return "DICTIONARY"; case Type::MAP: @@ -553,27 +558,23 @@ std::string DurationType::ToString() const { constexpr int8_t UnionType::kMaxTypeCode; constexpr int UnionType::kInvalidChildId; -UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode::type mode) - : NestedType(Type::UNION), - mode_(mode), - type_codes_(type_codes), +UnionMode::type UnionType::mode() const { + return id_ == Type::SPARSE_UNION ? UnionMode::SPARSE : UnionMode::DENSE; +} + +UnionType::UnionType(std::vector> fields, + std::vector type_codes, Type::type id) + : NestedType(id), + type_codes_(std::move(type_codes)), child_ids_(kMaxTypeCode + 1, kInvalidChildId) { - DCHECK_OK(ValidateParameters(fields, type_codes, mode)); - children_ = fields; + children_ = std::move(fields); + DCHECK_OK(ValidateParameters(children_, type_codes_, mode())); for (int child_id = 0; child_id < static_cast(type_codes_.size()); ++child_id) { const auto type_code = type_codes_[child_id]; child_ids_[type_code] = child_id; } } -Result> UnionType::Make( - const std::vector>& fields, - const std::vector& type_codes, UnionMode::type mode) { - RETURN_NOT_OK(ValidateParameters(fields, type_codes, mode)); - return std::make_shared(fields, type_codes, mode); -} - Status UnionType::ValidateParameters(const std::vector>& fields, const std::vector& type_codes, UnionMode::type mode) { @@ -589,10 +590,9 @@ Status UnionType::ValidateParameters(const std::vector>& } DataTypeLayout UnionType::layout() const { - if (mode_ == UnionMode::SPARSE) { - return DataTypeLayout({DataTypeLayout::Bitmap(), - DataTypeLayout::FixedWidth(sizeof(uint8_t)), - DataTypeLayout::AlwaysNull()}); + if (mode() == UnionMode::SPARSE) { + return DataTypeLayout( + {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(uint8_t))}); } else { return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(uint8_t)), @@ -609,11 +609,7 @@ uint8_t UnionType::max_type_code() const { std::string UnionType::ToString() const { std::stringstream s; - if (mode_ == UnionMode::SPARSE) { - s << "union[sparse]<"; - } else { - s << "union[dense]<"; - } + s << name() << "<"; for (size_t i = 0; i < children_.size(); ++i) { if (i) { @@ -625,6 +621,26 @@ std::string UnionType::ToString() const { return s.str(); } +SparseUnionType::SparseUnionType(std::vector> fields, + std::vector type_codes) + : UnionType(fields, type_codes, Type::SPARSE_UNION) {} + +Result> SparseUnionType::Make( + std::vector> fields, std::vector type_codes) { + RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::SPARSE)); + return std::make_shared(fields, type_codes); +} + +DenseUnionType::DenseUnionType(std::vector> fields, + std::vector type_codes) + : UnionType(fields, type_codes, Type::DENSE_UNION) {} + +Result> DenseUnionType::Make( + std::vector> fields, std::vector type_codes) { + RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::DENSE)); + return std::make_shared(fields, type_codes); +} + // ---------------------------------------------------------------------- // Struct type @@ -1867,7 +1883,7 @@ std::string StructType::ComputeFingerprint() const { std::string UnionType::ComputeFingerprint() const { std::stringstream ss; ss << TypeIdFingerprint(*this); - switch (mode_) { + switch (mode()) { case UnionMode::SPARSE: ss << "[s"; break; @@ -2025,45 +2041,59 @@ std::shared_ptr struct_(const std::vector>& fie return std::make_shared(fields); } -std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, - UnionMode::type mode) { - return std::make_shared(child_fields, type_codes, mode); +std::shared_ptr sparse_union(FieldVector child_fields, + std::vector type_codes) { + if (type_codes.empty()) { + type_codes = internal::Iota(static_cast(child_fields.size())); + } + return std::make_shared(std::move(child_fields), + std::move(type_codes)); +} +std::shared_ptr dense_union(FieldVector child_fields, + std::vector type_codes) { + if (type_codes.empty()) { + type_codes = internal::Iota(static_cast(child_fields.size())); + } + return std::make_shared(std::move(child_fields), std::move(type_codes)); } -std::shared_ptr union_(const std::vector>& child_fields, - UnionMode::type mode) { - std::vector type_codes(child_fields.size()); - for (int i = 0; i < static_cast(child_fields.size()); ++i) { - type_codes[i] = static_cast(i); +FieldVector FieldsFromArraysAndNames(std::vector names, + const ArrayVector& arrays) { + FieldVector fields(arrays.size()); + int i = 0; + if (names.empty()) { + for (const auto& array : arrays) { + fields[i] = field(std::to_string(i), array->type()); + ++i; + } + } else { + DCHECK_EQ(names.size(), arrays.size()); + for (const auto& array : arrays) { + fields[i] = field(std::move(names[i]), array->type()); + ++i; + } } - return std::make_shared(child_fields, type_codes, mode); + return fields; } -std::shared_ptr union_(UnionMode::type mode) { - std::vector> child_fields; - return union_(child_fields, mode); +std::shared_ptr sparse_union(const ArrayVector& children, + std::vector field_names, + std::vector type_codes) { + if (type_codes.empty()) { + type_codes = internal::Iota(static_cast(children.size())); + } + auto fields = FieldsFromArraysAndNames(std::move(field_names), children); + return sparse_union(std::move(fields), std::move(type_codes)); } -std::shared_ptr union_(const std::vector>& children, - const std::vector& field_names, - const std::vector& given_type_codes, - UnionMode::type mode) { - std::vector> fields; - std::vector type_codes(given_type_codes); - int8_t counter = 0; - for (const auto& child : children) { - if (field_names.size() == 0) { - fields.push_back(field(std::to_string(counter), child->type())); - } else { - fields.push_back(field(std::move(field_names[counter]), child->type())); - } - if (given_type_codes.size() == 0) { - type_codes.push_back(counter); - } - counter++; +std::shared_ptr dense_union(const ArrayVector& children, + std::vector field_names, + std::vector type_codes) { + if (type_codes.empty()) { + type_codes = internal::Iota(static_cast(children.size())); } - return union_(fields, std::move(type_codes), mode); + auto fields = FieldsFromArraysAndNames(std::move(field_names), children); + return dense_union(std::move(fields), std::move(type_codes)); } std::shared_ptr dictionary(const std::shared_ptr& index_type, diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8bc75322b1e..ad4c91a80cf 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -127,8 +127,11 @@ struct Type { /// Struct of logical types STRUCT, - /// Unions of logical types - UNION, + /// Sparse unions of logical types + SPARSE_UNION, + + /// Dense unions of logical types + DENSE_UNION, /// Dictionary-encoded type, also called "categorical" or "factor" /// in other programming languages. Holds the dictionary value @@ -1015,25 +1018,12 @@ class ARROW_EXPORT Decimal128Type : public DecimalType { /// \brief Concrete type class for union data class ARROW_EXPORT UnionType : public NestedType { public: - static constexpr Type::type type_id = Type::UNION; static constexpr int8_t kMaxTypeCode = 127; static constexpr int kInvalidChildId = -1; - static constexpr const char* type_name() { return "union"; } - - UnionType(const std::vector>& fields, - const std::vector& type_codes, - UnionMode::type mode = UnionMode::SPARSE); - - // A constructor variant that validates input parameters - static Result> Make( - const std::vector>& fields, - const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); - DataTypeLayout layout() const override; std::string ToString() const override; - std::string name() const override { return "union"; } /// The array of logical type ids. /// @@ -1046,21 +1036,55 @@ class ARROW_EXPORT UnionType : public NestedType { uint8_t max_type_code() const; - UnionMode::type mode() const { return mode_; } + UnionMode::type mode() const; - private: - std::string ComputeFingerprint() const override; + protected: + UnionType(std::vector> fields, std::vector type_codes, + Type::type id); static Status ValidateParameters(const std::vector>& fields, const std::vector& type_codes, UnionMode::type mode); - UnionMode::type mode_; + private: + std::string ComputeFingerprint() const override; std::vector type_codes_; std::vector child_ids_; }; +class ARROW_EXPORT SparseUnionType : public UnionType { + public: + static constexpr Type::type type_id = Type::SPARSE_UNION; + + static constexpr const char* type_name() { return "sparse_union"; } + + SparseUnionType(std::vector> fields, + std::vector type_codes); + + // A constructor variant that validates input parameters + static Result> Make( + std::vector> fields, std::vector type_codes); + + std::string name() const override { return "sparse_union"; } +}; + +class ARROW_EXPORT DenseUnionType : public UnionType { + public: + static constexpr Type::type type_id = Type::DENSE_UNION; + + static constexpr const char* type_name() { return "dense_union"; } + + DenseUnionType(std::vector> fields, + std::vector type_codes); + + // A constructor variant that validates input parameters + static Result> Make( + std::vector> fields, std::vector type_codes); + + std::string name() const override { return "dense_union"; } +}; + // ---------------------------------------------------------------------- // Date and time types diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index db514477191..a56e27bf1ce 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -148,13 +148,20 @@ class Decimal128Array; class Decimal128Builder; struct Decimal128Scalar; -class UnionType; -class UnionArray; -struct UnionScalar; struct UnionMode { enum type { SPARSE, DENSE }; }; +class SparseUnionType; +class SparseUnionArray; +class SparseUnionBuilder; +struct SparseUnionScalar; + +class DenseUnionType; +class DenseUnionArray; +class DenseUnionBuilder; +struct DenseUnionScalar; + template class NumericArray; @@ -363,39 +370,21 @@ std::shared_ptr ARROW_EXPORT time64(TimeUnit::type unit); std::shared_ptr ARROW_EXPORT struct_(const std::vector>& fields); -/// \brief Create a UnionType instance -std::shared_ptr ARROW_EXPORT -union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); +/// \brief Create a SparseUnionType instance +std::shared_ptr ARROW_EXPORT sparse_union(FieldVector child_fields, + std::vector type_codes = {}); +/// \brief Create a DenseUnionType instance +std::shared_ptr ARROW_EXPORT dense_union(FieldVector child_fields, + std::vector type_codes = {}); -/// \brief Create a UnionType instance +/// \brief Create a SparseUnionType instance std::shared_ptr ARROW_EXPORT -union_(const std::vector>& child_fields, - UnionMode::type mode = UnionMode::SPARSE); - -/// \brief Create a UnionType instance -std::shared_ptr ARROW_EXPORT union_(UnionMode::type mode = UnionMode::SPARSE); - -/// \brief Create a UnionType instance +sparse_union(const ArrayVector& children, std::vector field_names = {}, + std::vector type_codes = {}); +/// \brief Create a DenseUnionType instance std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, - const std::vector& field_names, const std::vector& type_codes, - UnionMode::type mode = UnionMode::SPARSE); - -/// \brief Create a UnionType instance -inline std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, - const std::vector& field_names, - UnionMode::type mode = UnionMode::SPARSE) { - return union_(children, field_names, {}, mode); -} - -/// \brief Create a UnionType instance -inline std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, - UnionMode::type mode = UnionMode::SPARSE) { - return union_(children, {}, {}, mode); -} +dense_union(const ArrayVector& children, std::vector field_names = {}, + std::vector type_codes = {}); /// \brief Create a DictionaryType instance /// \param[in] index_type the type of the dictionary indices (must be diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 159913e0a66..7cfde3461f6 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -615,8 +615,10 @@ TEST_F(TestSchema, TestDeeplyNestedMetadataComparison) { auto item0 = field("item", int32(), true); auto item1 = field("item", int32(), true, key_value_metadata({{"foo", "baz"}})); - Schema schema0({field("f", list(list(union_({field("struct", struct_({item0}))}))))}); - Schema schema1({field("f", list(list(union_({field("struct", struct_({item1}))}))))}); + Schema schema0( + {field("f", list(list(sparse_union({field("struct", struct_({item0}))}))))}); + Schema schema1( + {field("f", list(list(sparse_union({field("struct", struct_({item1}))}))))}); ASSERT_EQ(schema0.fingerprint(), schema1.fingerprint()); ASSERT_NE(schema0.metadata_fingerprint(), schema1.metadata_fingerprint()); @@ -1368,8 +1370,7 @@ TEST(TestNestedType, Equals) { auto f_type = field(inner_name, int32()); std::vector> fields = {f_type}; std::vector codes = {42}; - auto u_type = std::make_shared(fields, codes, UnionMode::SPARSE); - return field(union_name, u_type); + return field(union_name, sparse_union(fields, codes)); }; auto s0 = create_struct("f0", "s0"); @@ -1513,16 +1514,12 @@ TEST(TestUnionType, Basics) { child_ids2[11] = 1; child_ids2[12] = 2; - auto ty1 = checked_pointer_cast(union_(fields, UnionMode::DENSE)); - auto ty2 = - checked_pointer_cast(union_(fields, type_codes1, UnionMode::DENSE)); - auto ty3 = - checked_pointer_cast(union_(fields, type_codes2, UnionMode::DENSE)); - auto ty4 = checked_pointer_cast(union_(fields, UnionMode::SPARSE)); - auto ty5 = - checked_pointer_cast(union_(fields, type_codes1, UnionMode::SPARSE)); - auto ty6 = - checked_pointer_cast(union_(fields, type_codes2, UnionMode::SPARSE)); + auto ty1 = checked_pointer_cast(dense_union(fields)); + auto ty2 = checked_pointer_cast(dense_union(fields, type_codes1)); + auto ty3 = checked_pointer_cast(dense_union(fields, type_codes2)); + auto ty4 = checked_pointer_cast(sparse_union(fields)); + auto ty5 = checked_pointer_cast(sparse_union(fields, type_codes1)); + auto ty6 = checked_pointer_cast(sparse_union(fields, type_codes2)); ASSERT_EQ(ty1->type_codes(), type_codes1); ASSERT_EQ(ty2->type_codes(), type_codes1); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index d59af0dfe90..dbe584426e8 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -358,9 +358,18 @@ struct TypeTraits { }; template <> -struct TypeTraits { - using ArrayType = UnionArray; - using ScalarType = UnionScalar; +struct TypeTraits { + using ArrayType = SparseUnionArray; + using BuilderType = SparseUnionBuilder; + using ScalarType = SparseUnionScalar; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = DenseUnionArray; + using BuilderType = DenseUnionBuilder; + using ScalarType = DenseUnionScalar; constexpr static bool is_parameter_free = false; }; @@ -784,7 +793,8 @@ static inline bool is_nested(Type::type type_id) { case Type::FIXED_SIZE_LIST: case Type::MAP: case Type::STRUCT: - case Type::UNION: + case Type::SPARSE_UNION: + case Type::DENSE_UNION: return true; default: break; diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index 111b9d9d0d8..0a452d5c594 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -63,7 +63,8 @@ ARRAY_VISITOR_DEFAULT(LargeListArray) ARRAY_VISITOR_DEFAULT(MapArray) ARRAY_VISITOR_DEFAULT(FixedSizeListArray) ARRAY_VISITOR_DEFAULT(StructArray) -ARRAY_VISITOR_DEFAULT(UnionArray) +ARRAY_VISITOR_DEFAULT(SparseUnionArray) +ARRAY_VISITOR_DEFAULT(DenseUnionArray) ARRAY_VISITOR_DEFAULT(DictionaryArray) ARRAY_VISITOR_DEFAULT(Decimal128Array) ARRAY_VISITOR_DEFAULT(ExtensionArray) @@ -110,7 +111,8 @@ TYPE_VISITOR_DEFAULT(LargeListType) TYPE_VISITOR_DEFAULT(MapType) TYPE_VISITOR_DEFAULT(FixedSizeListType) TYPE_VISITOR_DEFAULT(StructType) -TYPE_VISITOR_DEFAULT(UnionType) +TYPE_VISITOR_DEFAULT(SparseUnionType) +TYPE_VISITOR_DEFAULT(DenseUnionType) TYPE_VISITOR_DEFAULT(DictionaryType) TYPE_VISITOR_DEFAULT(ExtensionType) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 5e83d35c524..7ab136c066f 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -59,7 +59,8 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const MapArray& array); virtual Status Visit(const FixedSizeListArray& array); virtual Status Visit(const StructArray& array); - virtual Status Visit(const UnionArray& array); + virtual Status Visit(const SparseUnionArray& array); + virtual Status Visit(const DenseUnionArray& array); virtual Status Visit(const DictionaryArray& array); virtual Status Visit(const ExtensionArray& array); }; @@ -100,7 +101,8 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const MapType& type); virtual Status Visit(const FixedSizeListType& type); virtual Status Visit(const StructType& type); - virtual Status Visit(const UnionType& type); + virtual Status Visit(const SparseUnionType& type); + virtual Status Visit(const DenseUnionType& type); virtual Status Visit(const DictionaryType& type); virtual Status Visit(const ExtensionType& type); }; diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 1600ad95e3b..e1023f1bdcf 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -74,7 +74,8 @@ namespace arrow { ACTION(Map); \ ACTION(FixedSizeList); \ ACTION(Struct); \ - ACTION(Union); \ + ACTION(SparseUnion); \ + ACTION(DenseUnion); \ ACTION(Dictionary); \ ACTION(Extension) diff --git a/cpp/src/gandiva/llvm_types_test.cc b/cpp/src/gandiva/llvm_types_test.cc index cbcad352991..66696830618 100644 --- a/cpp/src/gandiva/llvm_types_test.cc +++ b/cpp/src/gandiva/llvm_types_test.cc @@ -53,7 +53,8 @@ TEST_F(TestLLVMTypes, TestFound) { } TEST_F(TestLLVMTypes, TestNotFound) { - EXPECT_EQ(types_->IRType(arrow::Type::type::UNION), nullptr); + EXPECT_EQ(types_->IRType(arrow::Type::SPARSE_UNION), nullptr); + EXPECT_EQ(types_->IRType(arrow::Type::DENSE_UNION), nullptr); EXPECT_EQ(types_->DataVecType(arrow::null()), nullptr); } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8c9b9ce14ba..1fa57b9d8a6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1590,7 +1590,7 @@ cdef class UnionArray(Array): """ if self.type.mode != "dense": raise ArrowTypeError("Can only get value offsets for dense arrays") - buf = pyarrow_wrap_buffer(( self.ap).value_offsets()) + buf = pyarrow_wrap_buffer(( self.ap).value_offsets()) return Array.from_buffers(int32(), len(self), [None, buf]) @staticmethod @@ -1626,7 +1626,7 @@ cdef class UnionArray(Array): for x in type_codes: c_type_codes.push_back(x) with nogil: - out = GetResultValue(CUnionArray.MakeDense( + out = GetResultValue(CDenseUnionArray.Make( deref(types.ap), deref(value_offsets.ap), c, c_field_names, c_type_codes)) cdef Array result = pyarrow_wrap_array(out) @@ -1665,7 +1665,7 @@ cdef class UnionArray(Array): for x in type_codes: c_type_codes.push_back(x) with nogil: - out = GetResultValue(CUnionArray.MakeSparse( + out = GetResultValue(CSparseUnionArray.Make( deref(types.ap), c, c_field_names, c_type_codes)) cdef Array result = pyarrow_wrap_array(out) result.validate() @@ -2055,7 +2055,8 @@ cdef dict _array_classes = { _Type_LARGE_LIST: LargeListArray, _Type_MAP: MapArray, _Type_FIXED_SIZE_LIST: FixedSizeListArray, - _Type_UNION: UnionArray, + _Type_SPARSE_UNION: UnionArray, + _Type_DENSE_UNION: UnionArray, _Type_BINARY: BinaryArray, _Type_STRING: StringArray, _Type_LARGE_BINARY: LargeBinaryArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9ed9af35993..450560a1178 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -83,7 +83,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LARGE_LIST" arrow::Type::LARGE_LIST" _Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST" _Type_STRUCT" arrow::Type::STRUCT" - _Type_UNION" arrow::Type::UNION" + _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION" + _Type_DENSE_UNION" arrow::Type::DENSE_UNION" _Type_DICTIONARY" arrow::Type::DICTIONARY" _Type_MAP" arrow::Type::MAP" @@ -368,12 +369,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: vector[int] GetAllFieldIndices(const c_string& name) cdef cppclass CUnionType" arrow::UnionType"(CDataType): - CUnionType(const vector[shared_ptr[CField]]& fields, - const vector[int8_t]& type_codes, UnionMode mode) UnionMode mode() const vector[int8_t]& type_codes() const vector[int]& child_ids() + cdef shared_ptr[CDataType] CMakeSparseUnionType" arrow::sparse_union"( + vector[shared_ptr[CField]] fields, + vector[int8_t] type_codes) + + cdef shared_ptr[CDataType] CMakeDenseUnionType" arrow::dense_union"( + vector[shared_ptr[CField]] fields, + vector[int8_t] type_codes) + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -548,28 +555,32 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() cdef cppclass CUnionArray" arrow::UnionArray"(CArray): + shared_ptr[CBuffer] type_codes() + int8_t* raw_type_codes() + int child_id(int64_t index) + shared_ptr[CArray] field(int pos) + const CArray* UnsafeField(int pos) + UnionMode mode() + + cdef cppclass CSparseUnionArray" arrow::SparseUnionArray"(CUnionArray): @staticmethod - CResult[shared_ptr[CArray]] MakeSparse( + CResult[shared_ptr[CArray]] Make( const CArray& type_codes, const vector[shared_ptr[CArray]]& children, const vector[c_string]& field_names, const vector[int8_t]& type_codes) + cdef cppclass CDenseUnionArray" arrow::DenseUnionArray"(CUnionArray): @staticmethod - CResult[shared_ptr[CArray]] MakeDense( + CResult[shared_ptr[CArray]] Make( const CArray& type_codes, const CArray& value_offsets, const vector[shared_ptr[CArray]]& children, const vector[c_string]& field_names, const vector[int8_t]& type_codes) - shared_ptr[CBuffer] type_codes() - int8_t* raw_type_codes() int32_t value_offset(int i) shared_ptr[CBuffer] value_offsets() - int child_id(int64_t index) - shared_ptr[CArray] field(int pos) - UnionMode mode() cdef cppclass CBinaryArray" arrow::BinaryArray"(CArray): const uint8_t* GetValue(int i, int32_t* length) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index e33f5d01565..cd5a2527dd4 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -93,7 +93,8 @@ Type_LARGE_LIST = _Type_LARGE_LIST Type_MAP = _Type_MAP Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST Type_STRUCT = _Type_STRUCT -Type_UNION = _Type_UNION +Type_SPARSE_UNION = _Type_SPARSE_UNION +Type_DENSE_UNION = _Type_DENSE_UNION Type_DICTIONARY = _Type_DICTIONARY UnionMode_SPARSE = _UnionMode_SPARSE diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 20898ffec02..fc400d7a08c 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -93,7 +93,9 @@ cdef api object pyarrow_wrap_data_type( out = FixedSizeListType.__new__(FixedSizeListType) elif type.get().id() == _Type_STRUCT: out = StructType.__new__(StructType) - elif type.get().id() == _Type_UNION: + elif type.get().id() == _Type_SPARSE_UNION: + out = UnionType.__new__(UnionType) + elif type.get().id() == _Type_DENSE_UNION: out = UnionType.__new__(UnionType) elif type.get().id() == _Type_TIMESTAMP: out = TimestampType.__new__(TimestampType) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 128bc7ca097..606ae87d4fd 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -850,11 +850,13 @@ cdef class UnionValue(ArrayValue): cdef getitem(self, int64_t i): cdef int child_id = self.ap.child_id(i) cdef shared_ptr[CArray] child = self.ap.field(child_id) + cdef CDenseUnionArray* dense if self.ap.mode() == _UnionMode_SPARSE: return box_scalar(self.type[child_id].type, child, i) else: + dense = self.ap return box_scalar(self.type[child_id].type, child, - self.ap.value_offset(i)) + dense.value_offset(i)) def as_py(self): """ @@ -984,7 +986,8 @@ cdef dict _array_value_classes = { _Type_LARGE_LIST: LargeListValue, _Type_MAP: MapValue, _Type_FIXED_SIZE_LIST: FixedSizeListValue, - _Type_UNION: UnionValue, + _Type_SPARSE_UNION: UnionValue, + _Type_DENSE_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, _Type_LARGE_BINARY: LargeBinaryValue, diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 8c59b907bb5..0c3cc070473 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2429,11 +2429,9 @@ def union(children_fields, mode, type_codes=None): c_type_codes = range(c_fields.size()) if mode == UnionMode_SPARSE: - union_type.reset(new CUnionType(c_fields, c_type_codes, - _UnionMode_SPARSE)) + union_type = CMakeSparseUnionType(c_fields, c_type_codes) else: - union_type.reset(new CUnionType(c_fields, c_type_codes, - _UnionMode_DENSE)) + union_type = CMakeDenseUnionType(c_fields, c_type_codes) return pyarrow_wrap_data_type(union_type) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index cc34cb6a684..66791543fec 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -35,8 +35,9 @@ _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64} _TEMPORAL_TYPES = {lib.Type_TIMESTAMP, lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES +_UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} _NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT, - lib.Type_UNION, lib.Type_MAP} + lib.Type_MAP} | _UNION_TYPES def is_null(t): @@ -190,7 +191,7 @@ def is_union(t): """ Return True if value is an instance of a union type. """ - return t.id == lib.Type_UNION + return t.id in _UNION_TYPES def is_nested(t): From a9922a104a31b12bf74a8e8a7155545e7b7b29c8 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 8 Jun 2020 17:31:52 -0400 Subject: [PATCH 2/7] lint fixes --- c_glib/arrow-glib/basic-array.cpp | 15 +++++---------- c_glib/arrow-glib/basic-data-type.cpp | 15 +++++---------- c_glib/arrow-glib/composite-array.cpp | 10 ---------- c_glib/arrow-glib/type.cpp | 6 ++++-- c_glib/arrow-glib/type.h | 6 ++++-- c_glib/test/test-dense-union-data-type.rb | 4 ++-- c_glib/test/test-sparse-union-data-type.rb | 4 ++-- python/pyarrow/array.pxi | 3 ++- python/pyarrow/includes/libarrow.pxd | 8 ++++---- r/R/enums.R | 7 ++++--- r/R/type.R | 3 ++- 11 files changed, 34 insertions(+), 47 deletions(-) diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index dc45db7401b..edce2ed840d 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -2649,16 +2649,11 @@ garrow_array_new_raw_valist(std::shared_ptr *arrow_array, case arrow::Type::type::MAP: type = GARROW_TYPE_MAP_ARRAY; break; - case arrow::Type::type::UNION: - { - auto arrow_union_array = - std::static_pointer_cast(*arrow_array); - if (arrow_union_array->mode() == arrow::UnionMode::SPARSE) { - type = GARROW_TYPE_SPARSE_UNION_ARRAY; - } else { - type = GARROW_TYPE_DENSE_UNION_ARRAY; - } - } + case arrow::Type::type::SPARSE_UNION: + type = GARROW_TYPE_SPARSE_UNION_ARRAY; + break; + case arrow::Type::type::DENSE_UNION: + type = GARROW_TYPE_DENSE_UNION_ARRAY; break; case arrow::Type::type::DICTIONARY: type = GARROW_TYPE_DICTIONARY_ARRAY; diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 0febfe1f1ff..75984cdc9ce 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1376,16 +1376,11 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::STRUCT: type = GARROW_TYPE_STRUCT_DATA_TYPE; break; - case arrow::Type::type::UNION: - { - auto arrow_union_data_type = - std::static_pointer_cast(*arrow_data_type); - if (arrow_union_data_type->mode() == arrow::UnionMode::SPARSE) { - type = GARROW_TYPE_SPARSE_UNION_DATA_TYPE; - } else { - type = GARROW_TYPE_DENSE_UNION_DATA_TYPE; - } - } + case arrow::Type::type::SPARSE_UNION: + type = GARROW_TYPE_SPARSE_UNION_DATA_TYPE; + break; + case arrow::Type::type::DENSE_UNION: + type = GARROW_TYPE_DENSE_UNION_DATA_TYPE; break; case arrow::Type::type::DICTIONARY: type = GARROW_TYPE_DICTIONARY_DATA_TYPE; diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp index 58bebb8179c..14dda373575 100644 --- a/c_glib/arrow-glib/composite-array.cpp +++ b/c_glib/arrow-glib/composite-array.cpp @@ -1345,7 +1345,6 @@ garrow_dictionary_array_dispose(GObject *object) g_object_unref(priv->dictionary); priv->dictionary = NULL; } -<<<<<<< HEAD G_OBJECT_CLASS(garrow_dictionary_array_parent_class)->dispose(object); } @@ -1368,15 +1367,6 @@ garrow_dictionary_array_set_property(GObject *object, default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; -======= - auto arrow_union_array = arrow::DenseUnionArray::Make( - *arrow_type_ids, *arrow_value_offsets, arrow_fields, arrow_field_names, - arrow_union_data_type->type_codes()); - if (garrow::check(error, arrow_union_array, "[dense-union-array][new][data-type]")) { - return GARROW_DENSE_UNION_ARRAY(garrow_array_new_raw(&(*arrow_union_array))); - } else { - return NULL; ->>>>>>> ARROW-8866: [C++] Split UNION into SPARSE_UNION and DENSE_UNION } } diff --git a/c_glib/arrow-glib/type.cpp b/c_glib/arrow-glib/type.cpp index 8c1667f7d22..6d46845c53e 100644 --- a/c_glib/arrow-glib/type.cpp +++ b/c_glib/arrow-glib/type.cpp @@ -96,8 +96,10 @@ garrow_type_from_raw(arrow::Type::type type) return GARROW_TYPE_STRUCT; case arrow::Type::type::MAP: return GARROW_TYPE_MAP; - case arrow::Type::type::UNION: - return GARROW_TYPE_UNION; + case arrow::Type::type::SPARSE_UNION: + return GARROW_TYPE_SPARSE_UNION; + case arrow::Type::type::DENSE_UNION: + return GARROW_TYPE_DENSE_UNION; case arrow::Type::type::DICTIONARY: return GARROW_TYPE_DICTIONARY; default: diff --git a/c_glib/arrow-glib/type.h b/c_glib/arrow-glib/type.h index 9c9974c28b0..ac327072c2a 100644 --- a/c_glib/arrow-glib/type.h +++ b/c_glib/arrow-glib/type.h @@ -54,7 +54,8 @@ G_BEGIN_DECLS * type. Storage type depends on the parameters. * @GARROW_TYPE_LIST: A list of some logical data type. * @GARROW_TYPE_STRUCT: Struct of logical types. - * @GARROW_TYPE_UNION: Unions of logical types. + * @GARROW_TYPE_SPARSE_UNION: Sparse unions of logical types. + * @GARROW_TYPE_DENSE_UNION: Dense unions of logical types. * @GARROW_TYPE_DICTIONARY: Dictionary aka Category type. * @GARROW_TYPE_MAP: A repeated struct logical type. * @GARROW_TYPE_EXTENSION: Custom data type, implemented by user. @@ -94,7 +95,8 @@ typedef enum { GARROW_TYPE_DECIMAL, GARROW_TYPE_LIST, GARROW_TYPE_STRUCT, - GARROW_TYPE_UNION, + GARROW_TYPE_SPARSE_UNION, + GARROW_TYPE_DENSE_UNION, GARROW_TYPE_DICTIONARY, GARROW_TYPE_MAP, GARROW_TYPE_EXTENSION, diff --git a/c_glib/test/test-dense-union-data-type.rb b/c_glib/test/test-dense-union-data-type.rb index 231767f8a54..85d1d8562ea 100644 --- a/c_glib/test/test-dense-union-data-type.rb +++ b/c_glib/test/test-dense-union-data-type.rb @@ -33,11 +33,11 @@ def setup end def test_type - assert_equal(Arrow::Type::UNION, @data_type.id) + assert_equal(Arrow::Type::DENSE_UNION, @data_type.id) end def test_to_s - assert_equal("union[dense]", + assert_equal("dense_union", @data_type.to_s) end diff --git a/c_glib/test/test-sparse-union-data-type.rb b/c_glib/test/test-sparse-union-data-type.rb index 30e24f7a11c..34626bb9368 100644 --- a/c_glib/test/test-sparse-union-data-type.rb +++ b/c_glib/test/test-sparse-union-data-type.rb @@ -33,11 +33,11 @@ def setup end def test_type - assert_equal(Arrow::Type::UNION, @data_type.id) + assert_equal(Arrow::Type::SPARSE_UNION, @data_type.id) end def test_to_s - assert_equal("union[sparse]", + assert_equal("sparse_union", @data_type.to_s) end diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1fa57b9d8a6..b0244237421 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1590,7 +1590,8 @@ cdef class UnionArray(Array): """ if self.type.mode != "dense": raise ArrowTypeError("Can only get value offsets for dense arrays") - buf = pyarrow_wrap_buffer(( self.ap).value_offsets()) + cdef CDenseUnionArray* dense = self.ap + buf = pyarrow_wrap_buffer(dense.value_offsets()) return Array.from_buffers(int32(), len(self), [None, buf]) @staticmethod diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 450560a1178..af434d51edf 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -374,12 +374,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const vector[int]& child_ids() cdef shared_ptr[CDataType] CMakeSparseUnionType" arrow::sparse_union"( - vector[shared_ptr[CField]] fields, - vector[int8_t] type_codes) + vector[shared_ptr[CField]] fields, + vector[int8_t] type_codes) cdef shared_ptr[CDataType] CMakeDenseUnionType" arrow::dense_union"( - vector[shared_ptr[CField]] fields, - vector[int8_t] type_codes) + vector[shared_ptr[CField]] fields, + vector[int8_t] type_codes) cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) diff --git a/r/R/enums.R b/r/R/enums.R index a818a3575c7..223d5a799a4 100644 --- a/r/R/enums.R +++ b/r/R/enums.R @@ -68,9 +68,10 @@ Type <- enum("Type::type", DECIMAL = 23L, LIST = 24L, STRUCT = 25L, - UNION = 26L, - DICTIONARY = 27L, - MAP = 28L + SPARSE_UNION = 26L, + DENSE_UNION = 27L, + DICTIONARY = 28L, + MAP = 29L ) #' @rdname enums diff --git a/r/R/type.R b/r/R/type.R index 4b0db1dbb12..ab30524ed3f 100644 --- a/r/R/type.R +++ b/r/R/type.R @@ -70,7 +70,8 @@ DataType <- R6Class("DataType", DECIMAL = shared_ptr(Decimal128Type, self$pointer()), LIST = shared_ptr(ListType, self$pointer()), STRUCT = shared_ptr(StructType, self$pointer()), - UNION = stop("Type UNION not implemented yet"), + SPARSE_UNION = stop("Type SPARSE_UNION not implemented yet"), + DENSE_UNION = stop("Type DENSE_UNION not implemented yet"), DICTIONARY = shared_ptr(DictionaryType, self$pointer()), MAP = stop("Type MAP not implemented yet") ) From 5ee17ef0fd1ff04842a04bf363882927b0d87f56 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 9 Jun 2020 13:49:47 -0400 Subject: [PATCH 3/7] fix ruby binding --- ruby/red-arrow/ext/arrow/converters.hpp | 17 ++++++++++------- ruby/red-arrow/ext/arrow/raw-records.cpp | 3 ++- ruby/red-arrow/ext/arrow/values.cpp | 3 ++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/ruby/red-arrow/ext/arrow/converters.hpp b/ruby/red-arrow/ext/arrow/converters.hpp index e357eda6ed6..fd180501f37 100644 --- a/ruby/red-arrow/ext/arrow/converters.hpp +++ b/ruby/red-arrow/ext/arrow/converters.hpp @@ -285,7 +285,8 @@ namespace red_arrow { // VISIT(Interval) VISIT(List) VISIT(Struct) - VISIT(Union) + VISIT(SparseUnion) + VISIT(DenseUnion) VISIT(Dictionary) VISIT(Decimal128) // TODO @@ -388,7 +389,8 @@ namespace red_arrow { // VISIT(Interval) VISIT(List) VISIT(Struct) - VISIT(Union) + VISIT(SparseUnion) + VISIT(DenseUnion) VISIT(Dictionary) VISIT(Decimal128) // TODO @@ -432,10 +434,10 @@ namespace red_arrow { index_ = index; switch (array.mode()) { case arrow::UnionMode::SPARSE: - convert_sparse(array); + convert_sparse(static_cast(array)); break; case arrow::UnionMode::DENSE: - convert_dense(array); + convert_dense(static_cast(array)); break; default: rb_raise(rb_eArgError, "Invalid union mode"); @@ -479,7 +481,8 @@ namespace red_arrow { // VISIT(Interval) VISIT(List) VISIT(Struct) - VISIT(Union) + VISIT(SparseUnion) + VISIT(DenseUnion) VISIT(Dictionary) VISIT(Decimal128) // TODO @@ -516,7 +519,7 @@ namespace red_arrow { return 0; } - void convert_sparse(const arrow::UnionArray& array) { + void convert(const arrow::SparseUnionArray& array) { const auto type = std::static_pointer_cast(array.type()).get(); const auto tag = "[raw-records][union-sparse-array]"; @@ -530,7 +533,7 @@ namespace red_arrow { field_name_ = field_name_keep; } - void convert_dense(const arrow::UnionArray& array) { + void convert_dense(const arrow::DenseUnionArray& array) { const auto type = std::static_pointer_cast(array.type()).get(); const auto tag = "[raw-records][union-dense-array]"; diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 28662632b65..28e857f897b 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -100,7 +100,8 @@ namespace red_arrow { // VISIT(Interval) VISIT(List) VISIT(Struct) - VISIT(Union) + VISIT(SparseUnion) + VISIT(DenseUnion) VISIT(Dictionary) VISIT(Decimal128) // TODO diff --git a/ruby/red-arrow/ext/arrow/values.cpp b/ruby/red-arrow/ext/arrow/values.cpp index 624d265370c..fd2f1272a54 100644 --- a/ruby/red-arrow/ext/arrow/values.cpp +++ b/ruby/red-arrow/ext/arrow/values.cpp @@ -81,7 +81,8 @@ namespace red_arrow { // VISIT(Interval) VISIT(List) VISIT(Struct) - VISIT(Union) + VISIT(SparseUnion) + VISIT(DenseUnion) VISIT(Dictionary) VISIT(Decimal128) // TODO From f1612d09c82cbf6091697a8bd03fe8b944ea2b1a Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 12 Jun 2020 11:15:11 -0400 Subject: [PATCH 4/7] review comments --- cpp/src/arrow/array/array_nested.h | 53 ++++++++++++++----------- ruby/red-arrow/ext/arrow/converters.hpp | 2 +- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 81bcbbab1c1..666b00f7933 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -335,7 +335,7 @@ class ARROW_EXPORT StructArray : public Array { // ---------------------------------------------------------------------- // Union -/// Concrete Array class for union data +/// Base class for SparseUnionArray and DenseUnionArray class ARROW_EXPORT UnionArray : public Array { public: using type_code_t = int8_t; @@ -376,7 +376,7 @@ class ARROW_EXPORT UnionArray : public Array { mutable std::vector> boxed_fields_; }; -/// Concrete Array class for union data +/// Concrete Array class for sparse union data class ARROW_EXPORT SparseUnionArray : public UnionArray { public: using TypeClass = SparseUnionType; @@ -390,30 +390,28 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { /// \brief Construct SparseUnionArray from type_ids and children /// - /// This function does the bare minimum of validation of the offsets and - /// input types. + /// This function does the bare minimum of validation of the input types. /// /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. static Result> Make(const Array& type_ids, ArrayVector children, - std::vector field_names = {}, - std::vector type_codes = {}); + std::vector type_codes) { + return Make(std::move(type_ids), std::move(children), std::vector{}, + std::move(type_codes)); + } - /// \brief Construct Sparse UnionArray from type_ids and children + /// \brief Construct SparseUnionArray with custom field names from type_ids and children /// - /// This function does the bare minimum of validation of the offsets and - /// input types. + /// This function does the bare minimum of validation of the input types. /// /// \param[in] type_ids An array of logical type ids for the union type /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. static Result> Make(const Array& type_ids, ArrayVector children, - std::vector type_codes) { - return Make(std::move(type_ids), std::move(children), std::vector{}, - std::move(type_codes)); - } + std::vector field_names = {}, + std::vector type_codes = {}); const SparseUnionType* union_type() const { return internal::checked_cast(union_type_); @@ -423,7 +421,7 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { void SetData(std::shared_ptr data); }; -/// Concrete Array class for union data +/// Concrete Array class for dense union data class ARROW_EXPORT DenseUnionArray : public UnionArray { public: using TypeClass = DenseUnionType; @@ -436,36 +434,43 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); - /// \brief Construct DenseUnionArray from type_ids and children + /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children /// /// This function does the bare minimum of validation of the offsets and /// input types. /// /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. static Result> Make(const Array& type_ids, const Array& value_offsets, ArrayVector children, - std::vector field_names = {}, - std::vector type_codes = {}); + std::vector type_codes) { + return Make(type_ids, value_offsets, std::move(children), std::vector{}, + std::move(type_codes)); + } - /// \brief Construct Dense UnionArray from type_ids and children + /// \brief Construct DenseUnionArray with custom field names from type_ids, + /// value_offsets, and children /// /// This function does the bare minimum of validation of the offsets and /// input types. /// /// \param[in] type_ids An array of logical type ids for the union type + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. static Result> Make(const Array& type_ids, const Array& value_offsets, ArrayVector children, - std::vector type_codes) { - return Make(type_ids, value_offsets, std::move(children), std::vector{}, - std::move(type_codes)); - } + std::vector field_names = {}, + std::vector type_codes = {}); const DenseUnionType* union_type() const { return internal::checked_cast(union_type_); diff --git a/ruby/red-arrow/ext/arrow/converters.hpp b/ruby/red-arrow/ext/arrow/converters.hpp index fd180501f37..3a19c8ca0ea 100644 --- a/ruby/red-arrow/ext/arrow/converters.hpp +++ b/ruby/red-arrow/ext/arrow/converters.hpp @@ -519,7 +519,7 @@ namespace red_arrow { return 0; } - void convert(const arrow::SparseUnionArray& array) { + void convert_sparse(const arrow::SparseUnionArray& array) { const auto type = std::static_pointer_cast(array.type()).get(); const auto tag = "[raw-records][union-sparse-array]"; From 0961cd43553cb9c5806f68ba539851b27475cc6e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 Jun 2020 12:03:52 -0500 Subject: [PATCH 5/7] Misc changes per code reviews. Try to fix Ruby unit tests --- cpp/src/arrow/array/array_primitive.h | 4 +- cpp/src/arrow/type.h | 10 ++++ cpp/src/arrow/type_fwd.h | 52 +++++++++++++++++++ .../test/test-dense-union-data-type.rb | 4 +- .../test/test-sparse-union-data-type.rb | 4 +- 5 files changed, 68 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index 2716573ee93..e58f5f4c8b6 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -108,11 +108,11 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { // For compatibility with Take kernel. TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } + int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } protected: - static constexpr int32_t byte_width() { return sizeof(TypeClass::DayMilliseconds); } - inline void SetData(const std::shared_ptr& data) { this->PrimitiveArray::SetData(data); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index ad4c91a80cf..df830b6ec26 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1021,6 +1021,16 @@ class ARROW_EXPORT UnionType : public NestedType { static constexpr int8_t kMaxTypeCode = 127; static constexpr int kInvalidChildId = -1; + static Result> Make( + const std::vector>& fields, + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE) { + if (mode == UnionMode::SPARSE) { + return sparse_union(fields, type_codes); + } else { + return dense_union(fields, type_codes); + } + } + DataTypeLayout layout() const override; std::string ToString() const override; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index a56e27bf1ce..05995e4c54a 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -386,6 +386,58 @@ std::shared_ptr ARROW_EXPORT dense_union(const ArrayVector& children, std::vector field_names = {}, std::vector type_codes = {}); +/// \brief Create a UnionType instance +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& child_fields, + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE) { + if (mode == UnionMode::SPARSE) { + return sparse_union(child_fields, type_codes); + } else { + return dense_union(child_fields, type_codes); + } +} + +/// \brief Create a UnionType instance +ARROW_DEPRECATED("Deprecated in 1.0.0") +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& child_fields, + UnionMode::type mode = UnionMode::SPARSE) { + if (mode == UnionMode::SPARSE) { + return sparse_union(child_fields); + } else { + return dense_union(child_fields); + } +} + +/// \brief Create a UnionType instance +ARROW_DEPRECATED("Deprecated in 1.0.0") +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + const std::vector& field_names, const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE) { + if (mode == UnionMode::SPARSE) { + return sparse_union(children, field_names, type_codes); + } else { + return dense_union(children, field_names, type_codes); + } +} + +/// \brief Create a UnionType instance +ARROW_DEPRECATED("Deprecated in 1.0.0") +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + const std::vector& field_names, + UnionMode::type mode = UnionMode::SPARSE) { + return union_(children, field_names, {}, mode); +} + +/// \brief Create a UnionType instance +ARROW_DEPRECATED("Deprecated in 1.0.0") +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + UnionMode::type mode = UnionMode::SPARSE) { + return union_(children, {}, {}, mode); +} /// \brief Create a DictionaryType instance /// \param[in] index_type the type of the dictionary indices (must be /// a signed integer) diff --git a/ruby/red-arrow/test/test-dense-union-data-type.rb b/ruby/red-arrow/test/test-dense-union-data-type.rb index 96699e52e45..d8da6f77299 100644 --- a/ruby/red-arrow/test/test-dense-union-data-type.rb +++ b/ruby/red-arrow/test/test-dense-union-data-type.rb @@ -28,12 +28,12 @@ def setup end test("ordered arguments") do - assert_equal("union[dense]", + assert_equal("dense_union", Arrow::DenseUnionDataType.new(@fields, [2, 9]).to_s) end test("description") do - assert_equal("union[dense]", + assert_equal("dense_union", Arrow::DenseUnionDataType.new(fields: @fields, type_codes: [2, 9]).to_s) end diff --git a/ruby/red-arrow/test/test-sparse-union-data-type.rb b/ruby/red-arrow/test/test-sparse-union-data-type.rb index 4159b42268d..e672f82d46f 100644 --- a/ruby/red-arrow/test/test-sparse-union-data-type.rb +++ b/ruby/red-arrow/test/test-sparse-union-data-type.rb @@ -28,12 +28,12 @@ def setup end test("ordered arguments") do - assert_equal("union[sparse]", + assert_equal("sparse_union", Arrow::SparseUnionDataType.new(@fields, [2, 9]).to_s) end test("description") do - assert_equal("union[sparse]", + assert_equal("sparse_union", Arrow::SparseUnionDataType.new(fields: @fields, type_codes: [2, 9]).to_s) end From d4c5b1bed4967bfd6a95183e64ee1c1d799296da Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 Jun 2020 12:05:59 -0500 Subject: [PATCH 6/7] Add additional ARROW_DEPRECATED --- cpp/src/arrow/type_fwd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 05995e4c54a..511f6eee956 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -387,6 +387,7 @@ dense_union(const ArrayVector& children, std::vector field_names = std::vector type_codes = {}); /// \brief Create a UnionType instance +ARROW_DEPRECATED("Deprecated in 1.0.0") inline std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE) { From 8be24e05a0763908d327e0b1ce8b3faf13223160 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 12 Jun 2020 12:12:58 -0500 Subject: [PATCH 7/7] Fix deprecated usages --- cpp/src/arrow/type_fwd.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 511f6eee956..4c432cc634b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -429,7 +429,11 @@ inline std::shared_ptr ARROW_EXPORT union_(const std::vector>& children, const std::vector& field_names, UnionMode::type mode = UnionMode::SPARSE) { - return union_(children, field_names, {}, mode); + if (mode == UnionMode::SPARSE) { + return sparse_union(children, field_names); + } else { + return dense_union(children, field_names); + } } /// \brief Create a UnionType instance @@ -437,7 +441,11 @@ ARROW_DEPRECATED("Deprecated in 1.0.0") inline std::shared_ptr ARROW_EXPORT union_(const std::vector>& children, UnionMode::type mode = UnionMode::SPARSE) { - return union_(children, {}, {}, mode); + if (mode == UnionMode::SPARSE) { + return sparse_union(children); + } else { + return dense_union(children); + } } /// \brief Create a DictionaryType instance /// \param[in] index_type the type of the dictionary indices (must be