From b796ce645065f6565383963dbcb57aab70124b7b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 19 Oct 2017 15:51:19 -0700 Subject: [PATCH 01/10] Implement UnionArray in pyarrow --- cpp/src/arrow/array.cc | 30 ++++++++++++++++++++++++++++ cpp/src/arrow/array.h | 8 ++++++++ python/pyarrow/__init__.py | 2 +- python/pyarrow/array.pxi | 14 +++++++++++++ python/pyarrow/includes/libarrow.pxd | 11 ++++++++++ python/pyarrow/lib.pxd | 11 ++++++++++ python/pyarrow/scalar.pxi | 16 +++++++++++++++ python/pyarrow/tests/test_array.py | 17 ++++++++++++++++ 8 files changed, 108 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index b523876bf0e..b4545bfad03 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -393,6 +393,32 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, SetData(internal_data); } +Status UnionArray::FromArrays(const std::vector>& children, + const Array& type_ids, const Array& value_offsets, + std::shared_ptr* out) { + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), + static_cast(value_offsets).values()}; + auto types = std::vector>(); + std::vector type_codes; + uint8_t counter = 0; + for (const auto& child : children) { + types.push_back(field("", child->type())); + type_codes.push_back(counter); + counter++; + } + // TODO(pcm): Do not hardcode UnionMode::DENSE here + auto union_type = union_(types, type_codes, UnionMode::DENSE); + auto internal_data = + std::make_shared(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + internal_data->child_data.push_back(child->data()); + } + *out = std::make_shared(internal_data); + return Status::OK(); +} + std::shared_ptr UnionArray::child(int i) const { if (!boxed_fields_[i]) { boxed_fields_[i] = MakeArray(data_->child_data[i]); @@ -409,6 +435,10 @@ const Array* UnionArray::UnsafeChild(int i) const { return 
boxed_fields_[i].get(); } +std::shared_ptr UnionArray::value_type(int pos) const { + return child(pos)->type(); +} + // ---------------------------------------------------------------------- // DictionaryArray diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index afbd780dd3a..091e03dc454 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -612,12 +612,18 @@ class ARROW_EXPORT UnionArray : public Array { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); + static Status FromArrays(const std::vector>& children, + const Array& type_ids, const Array& value_offsets, + std::shared_ptr* out); + /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } @@ -628,6 +634,8 @@ class ARROW_EXPORT UnionArray : public Array { /// Only use this while the UnionArray is in scope const Array* UnsafeChild(int pos) const; + std::shared_ptr value_type(int pos) const; + protected: void SetData(const std::shared_ptr& data); diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1215c822d2e..b9feec07425 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -52,7 +52,7 @@ Int16Array, UInt16Array, Int32Array, UInt32Array, Int64Array, UInt64Array, - ListArray, + ListArray, UnionArray, BinaryArray, StringArray, FixedSizeBinaryArray, DictionaryArray, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7752d062a77..5683e153f27 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -631,6 +631,19 @@ cdef 
class ListArray(Array): return pyarrow_wrap_array(out) +cdef class UnionArray(Array): + + @staticmethod + def from_arrays(list children, Array types, Array value_offsets): + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.FromArrays(c, deref(types.ap), deref(value_offsets.ap), &out)) + return pyarrow_wrap_array(out) + cdef class StringArray(Array): pass @@ -789,6 +802,7 @@ cdef dict _array_classes = { _Type_FLOAT: FloatArray, _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, + _Type_UNION: UnionArray, _Type_BINARY: BinaryArray, _Type_STRING: StringArray, _Type_DICTIONARY: DictionaryArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 731ef94971d..537eb752bfd 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -317,6 +317,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CUnionArray" arrow::UnionArray"(CArray): + @staticmethod + CStatus FromArrays(const vector[shared_ptr[CArray]]& children, + const CArray& type_ids, const CArray& value_offsets, + shared_ptr[CArray]* out) + uint8_t* raw_type_ids() + int32_t value_offset(int i) + shared_ptr[CArray] child(int pos) + const CArray* UnsafeChild(int pos) + shared_ptr[CDataType] value_type(int pos) + cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): const uint8_t* GetValue(int i, int32_t* length) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 8fdcf553c13..19972af8408 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -139,6 +139,13 @@ cdef class ListValue(ArrayValue): cdef getitem(self, int64_t i) +cdef class UnionValue(ArrayValue): + cdef: + CUnionArray* ap + list value_types + + cdef getitem(self, int64_t i) + cdef class StringValue(ArrayValue): pass @@ -242,6 
+249,10 @@ cdef class ListArray(Array): pass +cdef class UnionArray(Array): + pass + + cdef class StringArray(Array): pass diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index c37ed3b200e..2eccbcdc2de 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -315,6 +315,21 @@ cdef class ListValue(ArrayValue): return result +cdef class UnionValue(ArrayValue): + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = sp_array.get() + self.value_types = [pyarrow_wrap_data_type(self.ap.value_type(i)) for i in range(self.ap.num_fields())] + + cdef getitem(self, int64_t i): + cdef int8_t type_id = self.ap.raw_type_ids()[i] + cdef shared_ptr[CArray] child = self.ap.child(type_id) + return box_scalar(self.value_types[type_id], child, self.ap.value_offset(i)) + + def as_py(self): + return self.getitem(self.index).as_py() + cdef class FixedSizeBinaryValue(ArrayValue): def as_py(self): @@ -364,6 +379,7 @@ cdef dict _scalar_classes = { _Type_FLOAT: FloatValue, _Type_DOUBLE: DoubleValue, _Type_LIST: ListValue, + _Type_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index e3a4c97567e..6eabe664e7c 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -235,6 +235,23 @@ def test_list_from_arrays(): assert result.equals(expected) +def test_union_from_arrays(): + binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') + int64 = pa.array([1, 2, 3], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') + + result = pa.UnionArray.from_arrays([binary, int64], types, value_offsets) + + assert result[0].as_py() == b'a' + assert result[1].as_py() == 1 + assert result[2].as_py() == b'c' + assert result[3].as_py() == b'b' + assert 
result[4].as_py() == 2 + assert result[5].as_py() == 3 + assert result[6].as_py() == b'd' + + def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case From cbdedc7a036c1da5404b838ba3d3f3b46512fb96 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 19 Oct 2017 18:12:24 -0700 Subject: [PATCH 02/10] make fields in UnionArray unique to be compatible with Java --- cpp/src/arrow/array.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index b4545bfad03..88bd37b0376 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -403,7 +403,7 @@ Status UnionArray::FromArrays(const std::vector>& childre std::vector type_codes; uint8_t counter = 0; for (const auto& child : children) { - types.push_back(field("", child->type())); + types.push_back(field(std::to_string(counter), child->type())); type_codes.push_back(counter); counter++; } From d8da01706a9eca9c0397dd13df74166a1657d9c0 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 4 Nov 2017 14:24:02 -0700 Subject: [PATCH 03/10] implement dense and sparse UnionArrays --- cpp/src/arrow/array.cc | 54 +++++++++++++++++++++++----- cpp/src/arrow/array.h | 29 +++++++++++++-- python/pyarrow/array.pxi | 15 ++++++-- python/pyarrow/includes/libarrow.pxd | 13 +++++-- python/pyarrow/scalar.pxi | 5 ++- python/pyarrow/tests/test_array.py | 22 +++++++----- 6 files changed, 114 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 88bd37b0376..5107ed95ec5 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -393,12 +393,7 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, SetData(internal_data); } -Status UnionArray::FromArrays(const std::vector>& children, - const Array& type_ids, const Array& value_offsets, - std::shared_ptr* out) { - BufferVector buffers = {type_ids.null_bitmap(), - static_cast(type_ids).values(), -
static_cast(value_offsets).values()}; +std::shared_ptr MakeUnionArrayType(UnionMode mode, const std::vector>& children) { auto types = std::vector>(); std::vector type_codes; uint8_t counter = 0; @@ -407,8 +402,28 @@ Status UnionArray::FromArrays(const std::vector>& childre type_codes.push_back(counter); counter++; } - // TODO(pcm): Do not hardcode UnionMode::DENSE here - auto union_type = union_(types, type_codes, UnionMode::DENSE); + return union_(types, type_codes, mode); +} + +Status UnionArray::FromDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out) { + if (value_offsets.length() == 0) { + return Status::Invalid("UnionArray offsets must have non-zero length"); + } + + if (value_offsets.type_id() != Type::INT32) { + return Status::Invalid("UnionArray offsets must be signed int32"); + } + + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), + static_cast(value_offsets).values()}; + auto union_type = MakeUnionArrayType(UnionMode::DENSE, children); auto internal_data = std::make_shared(union_type, type_ids.length(), std::move(buffers), type_ids.null_count(), type_ids.offset()); @@ -419,6 +434,29 @@ Status UnionArray::FromArrays(const std::vector>& childre return Status::OK(); } +Status UnionArray::FromSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out) { + if (type_ids.type_id() != Type::INT8) { + return Status::Invalid("UnionArray type_ids must be signed int8"); + } + BufferVector buffers = {type_ids.null_bitmap(), + static_cast(type_ids).values(), + nullptr}; + auto union_type = MakeUnionArrayType(UnionMode::SPARSE, children); + auto internal_data = + std::make_shared(union_type, type_ids.length(), std::move(buffers), + type_ids.null_count(), type_ids.offset()); + for (const auto& child : children) { + 
internal_data->child_data.push_back(child->data()); + if (child->length() != type_ids.length()) { + return Status::Invalid("Sparse UnionArray must have len(child) == len(type_ids) for all children"); + } + } + *out = std::make_shared(internal_data); + return Status::OK(); +} + std::shared_ptr UnionArray::child(int i) const { if (!boxed_fields_[i]) { boxed_fields_[i] = MakeArray(data_->child_data[i]); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 091e03dc454..c205e44242f 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -612,8 +612,33 @@ class ARROW_EXPORT UnionArray : public Array { const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = 0, int64_t offset = 0); - static Status FromArrays(const std::vector>& children, - const Array& type_ids, const Array& value_offsets, + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to value_offsets.length() + static Status FromDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out); + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. 
+ /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to type_ids.length() + static Status FromSparse(const Array& type_ids, + const std::vector>& children, std::shared_ptr* out); /// Note that this buffer does not account for any slice offset diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5683e153f27..b73205c9b5d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -634,14 +634,25 @@ cdef class ListArray(Array): cdef class UnionArray(Array): @staticmethod - def from_arrays(list children, Array types, Array value_offsets): + def from_dense(Array types, Array value_offsets, list children): cdef shared_ptr[CArray] out cdef vector[shared_ptr[CArray]] c cdef Array child for child in children: c.push_back(child.sp_array) with nogil: - check_status(CUnionArray.FromArrays(c, deref(types.ap), deref(value_offsets.ap), &out)) + check_status(CUnionArray.FromDense(deref(types.ap), deref(value_offsets.ap), c, &out)) + return pyarrow_wrap_array(out) + + @staticmethod + def from_sparse(Array types, list children): + cdef shared_ptr[CArray] out + cdef vector[shared_ptr[CArray]] c + cdef Array child + for child in children: + c.push_back(child.sp_array) + with nogil: + check_status(CUnionArray.FromSparse(deref(types.ap), c, &out)) return pyarrow_wrap_array(out) cdef class StringArray(Array): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 537eb752bfd..19c794eb9cd 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_DICTIONARY" arrow::Type::DICTIONARY" _Type_MAP" arrow::Type::MAP" + enum UnionMode" arrow::UnionMode": + UnionMode_SPARSE" arrow::UnionMode::SPARSE" + UnionMode_DENSE" 
arrow::UnionMode::DENSE" + enum TimeUnit" arrow::TimeUnit::type": TimeUnit_SECOND" arrow::TimeUnit::SECOND" TimeUnit_MILLI" arrow::TimeUnit::MILLI" @@ -319,14 +323,19 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CUnionArray" arrow::UnionArray"(CArray): @staticmethod - CStatus FromArrays(const vector[shared_ptr[CArray]]& children, - const CArray& type_ids, const CArray& value_offsets, + CStatus FromSparse(const CArray& type_ids, + const vector[shared_ptr[CArray]]& children, shared_ptr[CArray]* out) + @staticmethod + CStatus FromDense(const CArray& type_ids, const CArray& value_offsets, + const vector[shared_ptr[CArray]]& children, + shared_ptr[CArray]* out) uint8_t* raw_type_ids() int32_t value_offset(int i) shared_ptr[CArray] child(int pos) const CArray* UnsafeChild(int pos) shared_ptr[CDataType] value_type(int pos) + UnionMode mode() cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): const uint8_t* GetValue(int i, int32_t* length) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2eccbcdc2de..37ef86589f0 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -325,7 +325,10 @@ cdef class UnionValue(ArrayValue): cdef getitem(self, int64_t i): cdef int8_t type_id = self.ap.raw_type_ids()[i] cdef shared_ptr[CArray] child = self.ap.child(type_id) - return box_scalar(self.value_types[type_id], child, self.ap.value_offset(i)) + if self.ap.mode() == UnionMode_SPARSE: + return box_scalar(self.value_types[type_id], child, i) + else: + return box_scalar(self.value_types[type_id], child, self.ap.value_offset(i)) def as_py(self): return self.getitem(self.index).as_py() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 6eabe664e7c..0e37340823e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -235,21 +235,25 @@ def test_list_from_arrays(): assert result.equals(expected) -def test_union_from_arrays(): +def 
test_union_from_dense(): binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') int64 = pa.array([1, 2, 3], type='int64') types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') - result = pa.UnionArray.from_arrays([binary, int64], types, value_offsets) + result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64]) - assert result[0].as_py() == b'a' - assert result[1].as_py() == 1 - assert result[2].as_py() == b'c' - assert result[3].as_py() == b'b' - assert result[4].as_py() == 2 - assert result[5].as_py() == 3 - assert result[6].as_py() == b'd' + assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd'] + + +def test_union_from_sparse(): + binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], type='binary') + int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + + result = pa.UnionArray.from_sparse(types, [binary, int64]) + + assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd'] def _check_cast_case(case, safe=True): From 9068bbb53c8c6763a3e95099d237d81399f37fd0 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 4 Nov 2017 14:29:45 -0700 Subject: [PATCH 04/10] linting --- cpp/src/arrow/array.cc | 9 +++++---- python/pyarrow/tests/test_array.py | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 5107ed95ec5..68898ca35b1 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -393,7 +393,8 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, SetData(internal_data); } -std::shared_ptr MakeUnionArrayType(UnionMode mode, const std::vector>& children) { +std::shared_ptr MakeUnionArrayType( + UnionMode mode, const std::vector>& children) { auto types = std::vector>(); std::vector type_codes; uint8_t counter = 0; @@ -441,8 +442,7 @@ Status UnionArray::FromSparse(const Array& type_ids, return 
Status::Invalid("UnionArray type_ids must be signed int8"); } BufferVector buffers = {type_ids.null_bitmap(), - static_cast(type_ids).values(), - nullptr}; + static_cast(type_ids).values(), nullptr}; auto union_type = MakeUnionArrayType(UnionMode::SPARSE, children); auto internal_data = std::make_shared(union_type, type_ids.length(), std::move(buffers), @@ -450,7 +450,8 @@ Status UnionArray::FromSparse(const Array& type_ids, for (const auto& child : children) { internal_data->child_data.push_back(child->data()); if (child->length() != type_ids.length()) { - return Status::Invalid("Sparse UnionArray must have len(child) == len(type_ids) for all children"); + return Status::Invalid( + "Sparse UnionArray must have len(child) == len(type_ids) for all children"); } } *out = std::make_shared(internal_data); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 0e37340823e..7dc93c28ea7 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -247,7 +247,8 @@ def test_union_from_dense(): def test_union_from_sparse(): - binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], type='binary') + binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], + type='binary') int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') From c6c85491b52c53edb33e170359d23a2b30bb8d62 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sun, 5 Nov 2017 07:08:27 -0800 Subject: [PATCH 05/10] add doc strings --- python/pyarrow/array.pxi | 29 ++++++++++++++++++++++++++++- python/pyarrow/scalar.pxi | 6 ++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b73205c9b5d..0f7ae456f36 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -635,17 +635,44 @@ cdef class UnionArray(Array): @staticmethod def from_dense(Array types, Array value_offsets, list children): + """ + Construct 
dense UnionArray from arrays of int8 types, int32 offsets and + children arrays + + Parameters + ---------- + types : Array (int8 type) + value_offsets : Array (int32 type) + children : list + + Returns + ------- + union_array : UnionArray + """ cdef shared_ptr[CArray] out cdef vector[shared_ptr[CArray]] c cdef Array child for child in children: c.push_back(child.sp_array) with nogil: - check_status(CUnionArray.FromDense(deref(types.ap), deref(value_offsets.ap), c, &out)) + check_status(CUnionArray.FromDense( + deref(types.ap), deref(value_offsets.ap), c, &out)) return pyarrow_wrap_array(out) @staticmethod def from_sparse(Array types, list children): + """ + Construct sparse UnionArray from arrays of int8 types and children arrays + + Parameters + ---------- + types : Array (int8 type) + children : list + + Returns + ------- + union_array : UnionArray + """ cdef shared_ptr[CArray] out cdef vector[shared_ptr[CArray]] c cdef Array child diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 37ef86589f0..3cdc1dae84d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -320,7 +320,8 @@ cdef class UnionValue(ArrayValue): cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.value_types = [pyarrow_wrap_data_type(self.ap.value_type(i)) for i in range(self.ap.num_fields())] + self.value_types = [pyarrow_wrap_data_type(self.ap.value_type(i)) + for i in range(self.ap.num_fields())] cdef getitem(self, int64_t i): cdef int8_t type_id = self.ap.raw_type_ids()[i] @@ -328,7 +329,8 @@ cdef class UnionValue(ArrayValue): if self.ap.mode() == UnionMode_SPARSE: return box_scalar(self.value_types[type_id], child, i) else: - return box_scalar(self.value_types[type_id], child, self.ap.value_offset(i)) + return box_scalar(self.value_types[type_id], child, + self.ap.value_offset(i)) def as_py(self): return self.getitem(self.index).as_py() From 502c335ac49a731f16f7b8e0987247c241024969 
Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 8 Nov 2017 12:24:59 -0800 Subject: [PATCH 06/10] fixes --- cpp/src/arrow/array.cc | 29 ++++++++-------------------- cpp/src/arrow/array.h | 6 ++---- cpp/src/arrow/type.cc | 12 ++++++++++++ cpp/src/arrow/type.h | 4 ++++ python/pyarrow/array.pxi | 4 ++-- python/pyarrow/includes/libarrow.pxd | 5 ++--- python/pyarrow/lib.pxd | 5 +++++ python/pyarrow/public-api.pxi | 2 +- python/pyarrow/scalar.pxi | 6 ++---- python/pyarrow/types.pxi | 10 ++++++++++ 10 files changed, 48 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 68898ca35b1..9c91d619cc7 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -393,20 +393,7 @@ UnionArray::UnionArray(const std::shared_ptr& type, int64_t length, SetData(internal_data); } -std::shared_ptr MakeUnionArrayType( - UnionMode mode, const std::vector>& children) { - auto types = std::vector>(); - std::vector type_codes; - uint8_t counter = 0; - for (const auto& child : children) { - types.push_back(field(std::to_string(counter), child->type())); - type_codes.push_back(counter); - counter++; - } - return union_(types, type_codes, mode); -} - -Status UnionArray::FromDense(const Array& type_ids, const Array& value_offsets, +Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, std::shared_ptr* out) { if (value_offsets.length() == 0) { @@ -421,10 +408,14 @@ Status UnionArray::FromDense(const Array& type_ids, const Array& value_offsets, return Status::Invalid("UnionArray type_ids must be signed int8"); } + if (value_offsets.null_count() != 0) { + return Status::Invalid("MakeDense does not allow NAs in value_offsets"); + } + BufferVector buffers = {type_ids.null_bitmap(), static_cast(type_ids).values(), static_cast(value_offsets).values()}; - auto union_type = MakeUnionArrayType(UnionMode::DENSE, children); + auto union_type = union_(children, UnionMode::DENSE); auto 
internal_data = std::make_shared(union_type, type_ids.length(), std::move(buffers), type_ids.null_count(), type_ids.offset()); @@ -435,7 +426,7 @@ Status UnionArray::FromDense(const Array& type_ids, const Array& value_offsets, return Status::OK(); } -Status UnionArray::FromSparse(const Array& type_ids, +Status UnionArray::MakeSparse(const Array& type_ids, const std::vector>& children, std::shared_ptr* out) { if (type_ids.type_id() != Type::INT8) { @@ -443,7 +434,7 @@ Status UnionArray::FromSparse(const Array& type_ids, } BufferVector buffers = {type_ids.null_bitmap(), static_cast(type_ids).values(), nullptr}; - auto union_type = MakeUnionArrayType(UnionMode::SPARSE, children); + auto union_type = union_(children, UnionMode::SPARSE); auto internal_data = std::make_shared(union_type, type_ids.length(), std::move(buffers), type_ids.null_count(), type_ids.offset()); @@ -474,10 +465,6 @@ const Array* UnionArray::UnsafeChild(int i) const { return boxed_fields_[i].get(); } -std::shared_ptr UnionArray::value_type(int pos) const { - return child(pos)->type(); -} - // ---------------------------------------------------------------------- // DictionaryArray diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c205e44242f..c6a1d4b1e31 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -624,7 +624,7 @@ class ARROW_EXPORT UnionArray : public Array { /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[out] out Will have length equal to value_offsets.length() - static Status FromDense(const Array& type_ids, const Array& value_offsets, + static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, std::shared_ptr* out); @@ -637,7 +637,7 @@ class ARROW_EXPORT UnionArray : public Array { /// 0 corresponding to each type. 
/// \param[in] children Vector of children Arrays containing the data for each type. /// \param[out] out Will have length equal to type_ids.length() - static Status FromSparse(const Array& type_ids, + static Status MakeSparse(const Array& type_ids, const std::vector>& children, std::shared_ptr* out); @@ -659,8 +659,6 @@ class ARROW_EXPORT UnionArray : public Array { /// Only use this while the UnionArray is in scope const Array* UnsafeChild(int pos) const; - std::shared_ptr value_type(int pos) const; - protected: void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a9bf5919185..b51c82b20b7 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -444,6 +444,18 @@ std::shared_ptr union_(const std::vector>& chil return std::make_shared(child_fields, type_codes, mode); } +std::shared_ptr union_(const std::vector>& children, UnionMode mode) { + std::vector> types; + std::vector type_codes; + uint8_t counter = 0; + for (const auto& child : children) { + types.push_back(field(std::to_string(counter), child->type())); + type_codes.push_back(counter); + counter++; + } + return union_(types, type_codes, mode); +} + std::shared_ptr dictionary(const std::shared_ptr& index_type, const std::shared_ptr& dict_values, bool ordered) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 446f4d3a0b3..fd190493af5 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -844,6 +844,10 @@ std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); +/// \brief Create and instance of Union type +std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, UnionMode mode); + /// \brief Create an instance of Dictionary type std::shared_ptr ARROW_EXPORT dictionary(const std::shared_ptr& index_type, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 0f7ae456f36..44da9615fb2 100644 --- 
a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -655,7 +655,7 @@ cdef class UnionArray(Array): for child in children: c.push_back(child.sp_array) with nogil: - check_status(CUnionArray.FromDense( + check_status(CUnionArray.MakeDense( deref(types.ap), deref(value_offsets.ap), c, &out)) return pyarrow_wrap_array(out) @@ -679,7 +679,7 @@ cdef class UnionArray(Array): for child in children: c.push_back(child.sp_array) with nogil: - check_status(CUnionArray.FromSparse(deref(types.ap), c, &out)) + check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out)) return pyarrow_wrap_array(out) cdef class StringArray(Array): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 19c794eb9cd..5731abeafa0 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -323,18 +323,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CUnionArray" arrow::UnionArray"(CArray): @staticmethod - CStatus FromSparse(const CArray& type_ids, + CStatus MakeSparse(const CArray& type_ids, const vector[shared_ptr[CArray]]& children, shared_ptr[CArray]* out) @staticmethod - CStatus FromDense(const CArray& type_ids, const CArray& value_offsets, + CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets, const vector[shared_ptr[CArray]]& children, shared_ptr[CArray]* out) uint8_t* raw_type_ids() int32_t value_offset(int i) shared_ptr[CArray] child(int pos) const CArray* UnsafeChild(int pos) - shared_ptr[CDataType] value_type(int pos) UnionMode mode() cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 19972af8408..53148949075 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -56,6 +56,11 @@ cdef class DictionaryType(DataType): const CDictionaryType* dict_type +cdef class UnionType(DataType): + cdef: + list child_types + + cdef class TimestampType(DataType): cdef: const CTimestampType* 
ts_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 9f105122804..90aff9e936d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -72,7 +72,7 @@ cdef public api object pyarrow_wrap_data_type( elif type.get().id() == _Type_STRUCT: out = StructType() elif type.get().id() == _Type_UNION: - out = StructType() + out = UnionType() elif type.get().id() == _Type_TIMESTAMP: out = TimestampType() elif type.get().id() == _Type_FIXED_SIZE_BINARY: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 3cdc1dae84d..560b7c4c44c 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -320,16 +320,14 @@ cdef class UnionValue(ArrayValue): cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() - self.value_types = [pyarrow_wrap_data_type(self.ap.value_type(i)) - for i in range(self.ap.num_fields())] cdef getitem(self, int64_t i): cdef int8_t type_id = self.ap.raw_type_ids()[i] cdef shared_ptr[CArray] child = self.ap.child(type_id) if self.ap.mode() == UnionMode_SPARSE: - return box_scalar(self.value_types[type_id], child, i) + return box_scalar(self.type[type_id], child, i) else: - return box_scalar(self.value_types[type_id], child, + return box_scalar(self.type[type_id], child, self.ap.value_offset(i)) def as_py(self): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index c9a490960ec..95b3b1045d8 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -186,6 +186,16 @@ cdef class UnionType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) + self.child_types = [pyarrow_wrap_data_type( + type.get().child(i).get().type()) for i in range(self.num_children)] + + property num_children: + + def __get__(self): + return self.type.num_children() + + def __getitem__(self, i): + return self.child_types[i] cdef class TimestampType(DataType): From 
eeef7226fcdcdd98b602f4431d0a15b97767677f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 8 Nov 2017 13:47:22 -0800 Subject: [PATCH 07/10] linting --- cpp/src/arrow/type.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index b51c82b20b7..c8229344974 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -444,7 +444,8 @@ std::shared_ptr union_(const std::vector>& chil return std::make_shared(child_fields, type_codes, mode); } -std::shared_ptr union_(const std::vector>& children, UnionMode mode) { +std::shared_ptr union_(const std::vector>& children, + UnionMode mode) { std::vector> types; std::vector type_codes; uint8_t counter = 0; From 9e602a8d2f4fd83a6bffed2b7368cb381ad36afa Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 8 Nov 2017 14:04:10 -0800 Subject: [PATCH 08/10] wrap UnionType in pyarrow --- python/pyarrow/__init__.py | 2 +- python/pyarrow/includes/libarrow.pxd | 9 +++++-- python/pyarrow/lib.pyx | 2 ++ python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/test_schema.py | 8 +++++++ python/pyarrow/tests/test_types.py | 13 ++++++----- python/pyarrow/types.pxi | 35 ++++++++++++++++++++++++++++ 7 files changed, 61 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index b9feec07425..2d7d7288b38 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -36,7 +36,7 @@ time32, time64, timestamp, date32, date64, float16, float32, float64, binary, string, decimal, - list_, struct, dictionary, field, + list_, struct, union, dictionary, field, type_for_alias, DataType, NAType, Field, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 5731abeafa0..76ff30b49dc 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -68,8 +68,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_MAP" arrow::Type::MAP" enum 
UnionMode" arrow::UnionMode": - UnionMode_SPARSE" arrow::UnionMode::SPARSE" - UnionMode_DENSE" arrow::UnionMode::DENSE" + _UnionMode_SPARSE" arrow::UnionMode::SPARSE" + _UnionMode_DENSE" arrow::UnionMode::DENSE" enum TimeUnit" arrow::TimeUnit::type": TimeUnit_SECOND" arrow::TimeUnit::SECOND" @@ -226,6 +226,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) + cdef cppclass CUnionType" arrow::UnionType"(CDataType): + CUnionType(const vector[shared_ptr[CField]]& fields, + const vector[uint8_t]& type_codes, UnionMode mode) + UnionMode mode() + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6f4451e3f5a..b4ca49cafe1 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -92,6 +92,8 @@ Type_UNION = _Type_UNION Type_DICTIONARY = _Type_DICTIONARY Type_MAP = _Type_MAP +UnionMode_SPARSE = _UnionMode_SPARSE +UnionMode_DENSE = _UnionMode_DENSE # Exception types include "error.pxi" diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 560b7c4c44c..a396fa763c8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -324,7 +324,7 @@ cdef class UnionValue(ArrayValue): cdef getitem(self, int64_t i): cdef int8_t type_id = self.ap.raw_type_ids()[i] cdef shared_ptr[CArray] child = self.ap.child(type_id) - if self.ap.mode() == UnionMode_SPARSE: + if self.ap.mode() == _UnionMode_SPARSE: return box_scalar(self.type[type_id], child, i) else: return box_scalar(self.type[type_id], child, diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index d6b2655b7c6..116f3978333 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -319,6 +319,14 @@ def test_type_schema_pickling(): pa.field('a', 
'int8'), pa.field('b', 'string') ]), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_SPARSE), + pa.union([ + pa.field('a', pa.int8()), + pa.field('b', pa.int16()) + ], pa.lib.UnionMode_DENSE), pa.time32('s'), pa.time64('us'), pa.date32(), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e6ff5b1560c..0e3ea1fd40b 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -85,16 +85,17 @@ def test_is_nested_or_struct(): assert not types.is_nested(pa.int32()) -# TODO(wesm): Union types not yet implemented in pyarrow +def test_is_union(): + assert types.is_union(pa.union([pa.field('a', pa.int32()), + pa.field('b', pa.int8()), + pa.field('c', pa.string())], + pa.lib.UnionMode_SPARSE)) + assert not types.is_union(pa.list_(pa.int32())) -# def test_is_union(): -# assert types.is_union(pa.union([pa.field('a', pa.int32()), -# pa.field('b', pa.int8()), -# pa.field('c', pa.string())])) -# assert not types.is_union(pa.list_(pa.int32())) # TODO(wesm): is_map, once implemented + def test_is_binary_string(): assert types.is_binary(pa.binary()) assert not types.is_binary(pa.string()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 95b3b1045d8..ac60713d295 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -194,9 +194,22 @@ cdef class UnionType(DataType): def __get__(self): return self.type.num_children() + property mode: + + def __get__(self): + cdef CUnionType* type = self.sp_type.get() + return type.mode() + def __getitem__(self, i): return self.child_types[i] + def __getstate__(self): + children = [pyarrow_wrap_field(self.type.child(i)) for i in range(self.num_children)] + return children, self.mode + + def __setstate__(self, state): + cdef DataType reconstituted = union(*state) + self.init(reconstituted.sp_type) cdef class TimestampType(DataType): @@ -1066,6 +1079,28 @@ def struct(fields): return 
pyarrow_wrap_data_type(struct_type) +def union(children_fields, mode): + """ + Create UnionType from children fields. + """ + cdef: + Field child_field + vector[shared_ptr[CField]] c_fields + vector[uint8_t] type_codes + shared_ptr[CDataType] union_type + int i + + for i, child_field in enumerate(children_fields): + type_codes.push_back(i) + c_fields.push_back(child_field.sp_field) + + if mode == UnionMode_SPARSE: + union_type.reset(new CUnionType(c_fields, type_codes, _UnionMode_SPARSE)) + else: + union_type.reset(new CUnionType(c_fields, type_codes, _UnionMode_DENSE)) + + return pyarrow_wrap_data_type(union_type) + cdef dict _type_aliases = { 'null': null, 'i1': int8, From 9f33076b7d1af3da65b5d1290d1823b9ad620a2e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 9 Nov 2017 00:06:27 -0500 Subject: [PATCH 09/10] Change UnionMode to scoped enumeration Change-Id: Iae3f69070e595a7b65689dda1197c749935fe4b5 --- cpp/src/arrow/array.h | 4 +++- cpp/src/arrow/compare.cc | 2 +- cpp/src/arrow/ipc/json-internal.cc | 2 +- cpp/src/arrow/ipc/metadata-internal.cc | 5 +++-- cpp/src/arrow/type.h | 19 +++++++++++++------ 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c6a1d4b1e31..e15bb2e4683 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -652,7 +652,9 @@ class ARROW_EXPORT UnionArray : public Array { const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode mode() const { return static_cast(*type()).mode(); } + UnionMode::type mode() const { + return static_cast(*type()).mode(); + } std::shared_ptr child(int pos) const; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2ec86c3695a..a2d4de7b73a 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -152,7 +152,7 @@ class RangeEqualsVisitor { bool CompareUnions(const UnionArray& left) 
const { const auto& right = static_cast(right_); - const UnionMode union_mode = left.mode(); + const UnionMode::type union_mode = left.mode(); if (union_mode != right.mode()) { return false; } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index c1c0661d6ad..1b9baee7daf 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -774,7 +774,7 @@ static Status GetUnion(const RjObject& json_type, RETURN_NOT_STRING("mode", it_mode, json_type); std::string mode_str = it_mode->value.GetString(); - UnionMode mode; + UnionMode::type mode; if (mode_str == "SPARSE") { mode = UnionMode::SPARSE; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index f0f0f675853..9866bc30de9 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -163,8 +163,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type, static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, const std::vector>& children, std::shared_ptr* out) { - UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE - : UnionMode::DENSE; + UnionMode::type mode = + union_data->mode() == (flatbuf::UnionMode_Sparse ? 
+ UnionMode::SPARSE : UnionMode::DENSE); std::vector type_codes; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index fd190493af5..699b68181b4 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -517,14 +517,19 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { int32_t scale_; }; -enum class UnionMode : char { SPARSE, DENSE }; +struct UnionMode { + enum type { + SPARSE, DENSE + }; +}; class ARROW_EXPORT UnionType : public NestedType { public: static constexpr Type::type type_id = Type::UNION; UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE); std::string ToString() const override; std::string name() const override { return "union"; } @@ -534,10 +539,10 @@ class ARROW_EXPORT UnionType : public NestedType { const std::vector& type_codes() const { return type_codes_; } - UnionMode mode() const { return mode_; } + UnionMode::type mode() const { return mode_; } private: - UnionMode mode_; + UnionMode::type mode_; // The type id used in the data to indicate each data type in the union. 
For // example, the first type in the union might be denoted by the id 5 (instead @@ -842,11 +847,13 @@ struct_(const std::vector>& fields); /// \brief Create an instance of Union type std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, + UnionMode::type mode = UnionMode::SPARSE); /// \brief Create and instance of Union type std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, UnionMode mode); +union_(const std::vector>& children, + UnionMode::type mode = UnionMode::SPARSE); /// \brief Create an instance of Dictionary type std::shared_ptr ARROW_EXPORT From 7f3ca3131af65d8975bf6d7b120b3114e1636626 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 9 Nov 2017 00:17:23 -0500 Subject: [PATCH 10/10] Fix flakes Change-Id: Id4b6f445f3e041633eefa327eb1c4716d7d9b18a --- cpp/src/arrow/array.h | 4 +--- cpp/src/arrow/ipc/metadata-internal.cc | 4 ++-- cpp/src/arrow/type.cc | 7 ++++--- cpp/src/arrow/type.h | 7 ++----- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/array.pxi | 3 ++- python/pyarrow/includes/libarrow.pxd | 3 ++- python/pyarrow/types.pxi | 16 ++++++++++------ 8 files changed, 24 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index e15bb2e4683..f7762ce1043 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -652,9 +652,7 @@ class ARROW_EXPORT UnionArray : public Array { const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - UnionMode::type mode() const { - return static_cast(*type()).mode(); - } + UnionMode::type mode() const { return static_cast(*type()).mode(); } std::shared_ptr child(int pos) const; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 9866bc30de9..63ef8a549f2 100644 --- 
a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -164,8 +164,8 @@ static Status UnionFromFlatbuffer(const flatbuf::Union* union_data, const std::vector>& children, std::shared_ptr* out) { UnionMode::type mode = - union_data->mode() == (flatbuf::UnionMode_Sparse ? - UnionMode::SPARSE : UnionMode::DENSE); + (union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE + : UnionMode::DENSE); std::vector type_codes; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index c8229344974..0d1985fb2d9 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -190,7 +190,7 @@ std::string TimestampType::ToString() const { // Union type UnionType::UnionType(const std::vector>& fields, - const std::vector& type_codes, UnionMode mode) + const std::vector& type_codes, UnionMode::type mode) : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { children_ = fields; } @@ -440,12 +440,13 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_codes, UnionMode mode) { + const std::vector& type_codes, + UnionMode::type mode) { return std::make_shared(child_fields, type_codes, mode); } std::shared_ptr union_(const std::vector>& children, - UnionMode mode) { + UnionMode::type mode) { std::vector> types; std::vector type_codes; uint8_t counter = 0; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 699b68181b4..9e11a034420 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -518,9 +518,7 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { }; struct UnionMode { - enum type { - SPARSE, DENSE - }; + enum type { SPARSE, DENSE }; }; class ARROW_EXPORT UnionType : public NestedType { @@ -847,8 +845,7 @@ struct_(const std::vector>& fields); /// \brief Create an instance of Union type std::shared_ptr ARROW_EXPORT union_(const std::vector>& child_fields, - const std::vector& type_codes, - UnionMode::type mode 
= UnionMode::SPARSE); + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); /// \brief Create and instance of Union type std::shared_ptr ARROW_EXPORT diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 04a5b1368ce..7e5e575096d 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -192,7 +192,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: int64_t num_values() const shared_ptr[ColumnPath] path_in_schema() const bint is_stats_set() const - shared_ptr[CRowGroupStatistics] statistics() const; + shared_ptr[CRowGroupStatistics] statistics() const ParquetCompression compression() const const vector[ParquetEncoding]& encodings() const diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 44da9615fb2..9991411e55d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -662,7 +662,8 @@ cdef class UnionArray(Array): @staticmethod def from_sparse(Array types, list children): """ - Construct sparse UnionArray from arrays of int8 types and children arrays + Construct sparse UnionArray from arrays of int8 types and children + arrays Parameters ---------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 76ff30b49dc..dfafd371b28 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -67,7 +67,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_DICTIONARY" arrow::Type::DICTIONARY" _Type_MAP" arrow::Type::MAP" - enum UnionMode" arrow::UnionMode": + enum UnionMode" arrow::UnionMode::type": _UnionMode_SPARSE" arrow::UnionMode::SPARSE" _UnionMode_DENSE" arrow::UnionMode::DENSE" @@ -331,6 +331,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CStatus MakeSparse(const CArray& type_ids, const vector[shared_ptr[CArray]]& children, shared_ptr[CArray]* out) + @staticmethod CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets, const 
vector[shared_ptr[CArray]]& children, diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ac60713d295..d2e68ff79a5 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -186,13 +186,14 @@ cdef class UnionType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) - self.child_types = [pyarrow_wrap_data_type( - type.get().child(i).get().type()) for i in range(self.num_children)] + self.child_types = [ + pyarrow_wrap_data_type(type.get().child(i).get().type()) + for i in range(self.num_children)] property num_children: def __get__(self): - return self.type.num_children() + return self.type.num_children() property mode: @@ -204,7 +205,8 @@ cdef class UnionType(DataType): return self.child_types[i] def __getstate__(self): - children = [pyarrow_wrap_field(self.type.child(i)) for i in range(self.num_children)] + children = [pyarrow_wrap_field(self.type.child(i)) + for i in range(self.num_children)] return children, self.mode def __setstate__(self, state): @@ -1095,9 +1097,11 @@ def union(children_fields, mode): c_fields.push_back(child_field.sp_field) if mode == UnionMode_SPARSE: - union_type.reset(new CUnionType(c_fields, type_codes, _UnionMode_SPARSE)) + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_SPARSE)) else: - union_type.reset(new CUnionType(c_fields, type_codes, _UnionMode_DENSE)) + union_type.reset(new CUnionType(c_fields, type_codes, + _UnionMode_DENSE)) return pyarrow_wrap_data_type(union_type)