diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index b523876bf0e..9c91d619cc7 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -393,6 +393,62 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, SetData(internal_data); } +Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out) { + if (value_offsets.length() == 0) { + return Status::Invalid("UnionArray offsets must have non-zero length"); + } + + if (value_offsets.type_id() != Type::INT32) { + return Status::Invalid("UnionArray offsets must be signed int32"); + } + + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + + if (value_offsets.null_count() != 0) { + return Status::Invalid("MakeDense does not allow NAs in value_offsets"); + } + + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), + static_cast(value_offsets).values()}; + auto union_type = union_(children, UnionMode::DENSE); + auto internal_data = + std::make_shared(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + } + *out = std::make_shared(internal_data); + return Status::OK(); +} + +Status UnionArray::MakeSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out) { + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), nullptr}; + auto union_type = union_(children, UnionMode::SPARSE); + auto internal_data = + std::make_shared(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + if (child->length() != type_ids.length()) { + return Status::Invalid( + "Sparse UnionArray must have len(child) == len(type_ids) for all children"); + } + } + *out = std::make_shared(internal_data); + return Status::OK(); +} + std::shared_ptr UnionArray::child(int i) const { if (!boxed_fields_[i]) { boxed_fields_[i] = MakeArray(data_->child_data[i]); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index afbd780dd3a..f7762ce1043 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -612,16 +612,47 @@ class ARROW_EXPORT UnionArray : public Array { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out); + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out); + /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode mode() const { return static_cast(*type()).mode(); } + UnionMode::type mode() const { return static_cast(*type()).mode(); } std::shared_ptr child(int pos) const; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2ec86c3695a..a2d4de7b73a 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -152,7 +152,7 @@ class RangeEqualsVisitor { bool CompareUnions(const UnionArray& left) const { const auto& right = static_cast(right_); - const UnionMode union_mode = left.mode(); + const UnionMode::type union_mode = left.mode(); if (union_mode != right.mode()) { return false; } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index c1c0661d6ad..1b9baee7daf 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -774,7 +774,7 @@ static Status GetUnion(const RjObject& json_type, RETURN_NOT_STRING("mode", it_mode, json_type); std::string mode_str = it_mode->value.GetString(); - UnionMode mode; + UnionMode::type mode; if (mode_str == "SPARSE") { mode = UnionMode::SPARSE; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index f0f0f675853..63ef8a549f2 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -163,8 +163,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type, static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, const std::vector>& children, std::shared_ptr* out) { - UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE - : UnionMode::DENSE; + UnionMode::type mode = + (union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE + : UnionMode::DENSE); std::vector type_codes; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a9bf5919185..0d1985fb2d9 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -190,7 +190,7 @@ std::string TimestampType::ToString() const { // Union type UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode) + const std::vector& type_codes, UnionMode::type mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { children_ = fields; } @@ -440,10 +440,24 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode) { + const std::vector& type_codes, + UnionMode::type mode) { return std::make_shared(child_fields, type_codes, mode); } +std::shared_ptr union_(const std::vector>& children, + UnionMode::type mode) { + std::vector> types; + std::vector type_codes; + uint8_t counter = 0; + for (const auto& child : children) { + types.push_back(field(std::to_string(counter), child->type())); + type_codes.push_back(counter); + counter++; + } + return union_(types, type_codes, mode); +} + std::shared_ptr dictionary(const std::shared_ptr& index_type, const std::shared_ptr& dict_values, bool ordered) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 446f4d3a0b3..9e11a034420 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -517,14 +517,17 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { int32_t scale_; }; -enum class UnionMode : char { SPARSE, DENSE }; +struct UnionMode { + enum type { SPARSE, DENSE }; +}; class ARROW_EXPORT UnionType : public NestedType { public: static constexpr Type::type type_id = Type::UNION; UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE); std::string ToString() const override; std::string name() const override { return "union"; } @@ -534,10 +537,10 @@ class ARROW_EXPORT UnionType : public NestedType { const std::vector& type_codes() const { return type_codes_; } - UnionMode mode() const { return mode_; } + UnionMode::type mode() const { return mode_; } private: - UnionMode mode_; + UnionMode::type mode_; // The type id used in the data to indicate each data type in the union. For // example, the first type in the union might be denoted by the id 5 (instead @@ -842,7 +845,12 @@ struct_(const std::vector>& fields); /// \brief Create an instance of Union type std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create and instance of Union type +std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + UnionMode::type mode = UnionMode::SPARSE); /// \brief Create an instance of Dictionary type std::shared_ptr ARROW_EXPORT diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1215c822d2e..2d7d7288b38 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -36,7 +36,7 @@ time32, time64, timestamp, date32, date64, float16, float32, float64, binary, string, decimal, - list_, struct, dictionary, field, + list_, struct, union, dictionary, field, type_for_alias, DataType, NAType, Field, @@ -52,7 +52,7 @@ Int16Array, UInt16Array, Int32Array, UInt32Array, Int64Array, UInt64Array, - ListArray, + ListArray, UnionArray, BinaryArray, StringArray, FixedSizeBinaryArray, DictionaryArray, diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 04a5b1368ce..7e5e575096d 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -192,7 +192,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: int64_t num_values() const shared_ptr[ColumnPath] path_in_schema() const bint is_stats_set() const - shared_ptr[CRowGroupStatistics] statistics() const; + shared_ptr[CRowGroupStatistics] statistics() const ParquetCompression compression() const const vector[ParquetEncoding]& encodings() const diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7752d062a77..9991411e55d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -631,6 +631,58 @@ cdef class ListArray(Array): return pyarrow_wrap_array(out) +cdef class UnionArray(Array): + + @staticmethod + def from_dense(Array types, Array value_offsets, list children): + """ + Construct dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + + Returns + ------- + union_array : UnionArray + """ + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.MakeDense( + deref(types.ap), deref(value_offsets.ap), c, &out)) + return pyarrow_wrap_array(out) + + @staticmethod + def from_sparse(Array types, list children): + """ + Construct sparse UnionArray from arrays of int8 types and children + arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + + Returns + ------- + union_array : UnionArray + """ + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out)) + return pyarrow_wrap_array(out) + cdef class StringArray(Array): pass @@ -789,6 +841,7 @@ cdef dict _array_classes = { _Type_FLOAT: FloatArray, _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, + _Type_UNION: UnionArray, _Type_BINARY: BinaryArray, _Type_STRING: StringArray, _Type_DICTIONARY: DictionaryArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 731ef94971d..dfafd371b28 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_DICTIONARY" arrow::Type::DICTIONARY" _Type_MAP" arrow::Type::MAP" + enum UnionMode" arrow::UnionMode::type": + _UnionMode_SPARSE" arrow::UnionMode::SPARSE" + _UnionMode_DENSE" arrow::UnionMode::DENSE" + enum TimeUnit" arrow::TimeUnit::type": TimeUnit_SECOND" arrow::TimeUnit::SECOND" TimeUnit_MILLI" arrow::TimeUnit::MILLI" @@ -222,6 +226,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) + cdef cppclass CUnionType" arrow::UnionType"(CDataType): + CUnionType(const vector[shared_ptr[CField]]& fields, + const vector[uint8_t]& type_codes, UnionMode mode) + UnionMode mode() + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -317,6 +326,22 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CUnionArray" arrow::UnionArray"(CArray): + @staticmethod + CStatus MakeSparse(const CArray& type_ids, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) + + @staticmethod + CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) + uint8_t* raw_type_ids() + int32_t value_offset(int i) + shared_ptr[CArray] child(int pos) + const CArray* UnsafeChild(int pos) + UnionMode mode() + cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): const uint8_t* GetValue(int i, int32_t* length) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 8fdcf553c13..53148949075 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -56,6 +56,11 @@ cdef class DictionaryType(DataType): const CDictionaryType* dict_type +cdef class UnionType(DataType): + cdef: + list child_types + + cdef class TimestampType(DataType): cdef: const CTimestampType* ts_type @@ -139,6 +144,13 @@ cdef class ListValue(ArrayValue): cdef getitem(self, int64_t i) +cdef class UnionValue(ArrayValue): + cdef: + CUnionArray* ap + list value_types + + cdef getitem(self, int64_t i) + cdef class StringValue(ArrayValue): pass @@ -242,6 +254,10 @@ cdef class ListArray(Array): pass +cdef class UnionArray(Array): + pass + + cdef class StringArray(Array): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6f4451e3f5a..b4ca49cafe1 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -92,6 +92,8 @@ Type_UNION = _Type_UNION Type_DICTIONARY = _Type_DICTIONARY Type_MAP = _Type_MAP +UnionMode_SPARSE = _UnionMode_SPARSE +UnionMode_DENSE = _UnionMode_DENSE # Exception types include "error.pxi" diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 9f105122804..90aff9e936d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -72,7 +72,7 @@ cdef public api object pyarrow_wrap_data_type( elif type.get().id() == _Type_STRUCT: out = StructType() elif type.get().id() == _Type_UNION: - out = StructType() + out = UnionType() elif type.get().id() == _Type_TIMESTAMP: out = TimestampType() elif type.get().id() == _Type_FIXED_SIZE_BINARY: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index c37ed3b200e..a396fa763c8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -315,6 +315,24 @@ cdef class ListValue(ArrayValue): return result +cdef class UnionValue(ArrayValue): + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = sp_array.get() + + cdef getitem(self, int64_t i): + cdef int8_t type_id = self.ap.raw_type_ids()[i] + cdef shared_ptr[CArray] child = self.ap.child(type_id) + if self.ap.mode() == _UnionMode_SPARSE: + return box_scalar(self.type[type_id], child, i) + else: + return box_scalar(self.type[type_id], child, + self.ap.value_offset(i)) + + def as_py(self): + return self.getitem(self.index).as_py() + cdef class FixedSizeBinaryValue(ArrayValue): def as_py(self): @@ -364,6 +382,7 @@ cdef dict _scalar_classes = { _Type_FLOAT: FloatValue, _Type_DOUBLE: DoubleValue, _Type_LIST: ListValue, + _Type_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index e3a4c97567e..7dc93c28ea7 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -235,6 +235,28 @@ def test_list_from_arrays(): assert result.equals(expected) +def test_union_from_dense(): + binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') + int64 = pa.array([1, 2, 3], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') + + result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd'] + + +def test_union_from_sparse(): + binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], + type='binary') + int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + + result = pa.UnionArray.from_sparse(types, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd'] + + def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index d6b2655b7c6..116f3978333 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -319,6 +319,14 @@ def test_type_schema_pickling(): pa.field('a', 'int8'), pa.field('b', 'string') ]), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_SPARSE), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_DENSE), pa.time32('s'), pa.time64('us'), pa.date32(), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e6ff5b1560c..0e3ea1fd40b 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -85,16 +85,17 @@ def test_is_nested_or_struct(): assert not types.is_nested(pa.int32()) -# TODO(wesm): Union types not yet implemented in pyarrow +def test_is_union(): + assert types.is_union(pa.union([pa.field('a', pa.int32()), + pa.field('b', pa.int8()), + pa.field('c', pa.string())], + pa.lib.UnionMode_SPARSE)) + assert not types.is_union(pa.list_(pa.int32())) -# def test_is_union(): -# assert types.is_union(pa.union([pa.field('a', pa.int32()), -# pa.field('b', pa.int8()), -# pa.field('c', pa.string())])) -# assert not types.is_union(pa.list_(pa.int32())) # TODO(wesm): is_map, once implemented + def test_is_binary_string(): assert types.is_binary(pa.binary()) assert not types.is_binary(pa.string()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index c9a490960ec..d2e68ff79a5 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -186,7 +186,32 @@ cdef class UnionType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) + self.child_types = [ + pyarrow_wrap_data_type(type.get().child(i).get().type()) + for i in range(self.num_children)] + property num_children: + + def __get__(self): + return self.type.num_children() + + property mode: + + def __get__(self): + cdef CUnionType* type = self.sp_type.get() + return type.mode() + + def __getitem__(self, i): + return self.child_types[i] + + def __getstate__(self): + children = [pyarrow_wrap_field(self.type.child(i)) + for i in range(self.num_children)] + return children, self.mode + + def __setstate__(self, state): + cdef DataType reconstituted = union(*state) + self.init(reconstituted.sp_type) cdef class TimestampType(DataType): @@ -1056,6 +1081,30 @@ def struct(fields): return pyarrow_wrap_data_type(struct_type) +def union(children_fields, mode): + """ + Create UnionType from children fields. + """ + cdef: + Field child_field + vector[shared_ptr[CField]] c_fields + vector[uint8_t] type_codes + shared_ptr[CDataType] union_type + int i + + for i, child_field in enumerate(children_fields): + type_codes.push_back(i) + c_fields.push_back(child_field.sp_field) + + if mode == UnionMode_SPARSE: + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_SPARSE)) + else: + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_DENSE)) + + return pyarrow_wrap_data_type(union_type) + cdef dict _type_aliases = { 'null': null, 'i1': int8,