From a72533831761949d43768937cf3387fb0dee7309 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 12 Jun 2019 17:55:32 -0500 Subject: [PATCH 1/2] Implement Array.from_buffers for nested types, add DataType.num_buffers, more checks --- python/pyarrow/array.pxi | 51 +++++++------ python/pyarrow/includes/libarrow.pxd | 6 ++ python/pyarrow/tests/test_array.py | 107 +++++++++++++++++++-------- python/pyarrow/types.pxi | 29 ++++---- 4 files changed, 126 insertions(+), 67 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ae6104f0029..94a8ecd01e8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -561,9 +561,10 @@ cdef class Array(_PandasConvertible): (_reduce_array_data(self.sp_array.get().data().get()),) @staticmethod - def from_buffers(DataType type, length, buffers, null_count=-1, offset=0): + def from_buffers(DataType type, length, buffers, null_count=-1, offset=0, + children=None): """ - Construct an Array from a sequence of buffers. The concrete type + Construct an Array from a sequence of buffers. The concrete type returned depends on the datatype. Parameters @@ -585,19 +586,36 @@ cdef class Array(_PandasConvertible): """ cdef: Buffer buf + Array child vector[shared_ptr[CBuffer]] c_buffers - shared_ptr[CArrayData] ad + vector[shared_ptr[CArrayData]] c_child_data + shared_ptr[CArrayData] array_data - if not is_primitive(type.id): - raise NotImplementedError("from_buffers is only supported for " - "primitive arrays yet.") + children = children or [] + + if type.num_children != len(children): + raise ValueError("Type's expected number of children " + "({0}) did not match the passed number " + "({1}).".format(type.num_children, len(children))) + + if type.num_buffers != len(buffers): + raise ValueError("Type's expected number of buffers " + "({0}) did not match the passed number " + "({1}).".format(type.num_buffers, len(buffers))) for buf in buffers: # None will produce a null buffer pointer c_buffers.push_back(pyarrow_unwrap_buffer(buf)) - ad = CArrayData.Make(type.sp_type, length, c_buffers, - null_count, offset) - return pyarrow_wrap_array(MakeArray(ad)) + + for child in children: + c_child_data.push_back(child.ap.data()) + + array_data = CArrayData.MakeWithChildren(type.sp_type, length, + c_buffers, c_child_data, + null_count, offset) + cdef Array result = pyarrow_wrap_array(MakeArray(array_data)) + result.validate() + return result @property def null_count(self): @@ -1214,18 +1232,9 @@ cdef class StringArray(Array): ------- string_array : StringArray """ - cdef shared_ptr[CBuffer] c_null_bitmap - cdef shared_ptr[CArray] out - - if null_bitmap is not None: - c_null_bitmap = null_bitmap.buffer - else: - null_count = 0 - - out.reset(new CStringArray( - length, value_offsets.buffer, data.buffer, c_null_bitmap, - null_count, offset)) - return pyarrow_wrap_array(out) + return Array.from_buffers(utf8(), length, + [null_bitmap, value_offsets, data], + null_count, offset) cdef class BinaryArray(Array): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 1e32b87127a..f979cd6cb65 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -83,6 +83,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: TimeUnit_MICRO" arrow::TimeUnit::MICRO" TimeUnit_NANO" arrow::TimeUnit::NANO" + cdef cppclass CDataTypeLayout" arrow::DataTypeLayout": + vector[int64_t] bit_widths + c_bool has_dictionary + cdef cppclass CDataType" arrow::DataType": Type id() @@ -94,6 +98,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int num_children() + CDataTypeLayout layout() + c_string ToString() c_bool is_primitive(Type type) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 0b79017a694..f4fc23cdafc 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -310,8 +310,81 @@ def test_array_from_buffers(): with pytest.raises(TypeError): pa.Array.from_buffers(pa.int16(), 3, [u'', u''], offset=1) - with pytest.raises(NotImplementedError): - pa.Array.from_buffers(pa.list_(pa.int16()), 4, [None, values_buf]) + +def test_string_binary_from_buffers(): + array = pa.array(["a", None, "b", "c"]) + + buffers = array.buffers() + copied = pa.StringArray.from_buffers( + len(array), buffers[1], buffers[2], buffers[0], array.null_count, + array.offset) + assert copied.to_pylist() == ["a", None, "b", "c"] + + binary_copy = pa.Array.from_buffers(pa.binary(), len(array), + array.buffers(), array.null_count, + array.offset) + assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"] + + copied = pa.StringArray.from_buffers( + len(array), buffers[1], buffers[2], buffers[0]) + assert copied.to_pylist() == ["a", None, "b", "c"] + + sliced = array[1:] + buffers = sliced.buffers() + copied = pa.StringArray.from_buffers( + len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset) + assert copied.to_pylist() == [None, "b", "c"] + assert copied.null_count == 1 + + # Slice but exclude all null entries so that we don't need to pass + # the null bitmap. + sliced = array[2:] + buffers = sliced.buffers() + copied = pa.StringArray.from_buffers( + len(sliced), buffers[1], buffers[2], None, -1, sliced.offset) + assert copied.to_pylist() == ["b", "c"] + assert copied.null_count == 0 + + +def test_list_from_buffers(): + ty = pa.list_(pa.int16()) + array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty) + + buffers = array.buffers() + + with pytest.raises(ValueError): + # No children + pa.Array.from_buffers(ty, 4, [None, buffers[1]]) + + child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:]) + copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child]) + assert copied.equals(array) + + with pytest.raises(ValueError): + # too many children + pa.Array.from_buffers(ty, 4, [None, buffers[1]], + children=[child, child]) + + +def test_struct_from_buffers(): + ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())]) + array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}], + type=ty) + buffers = array.buffers() + + with pytest.raises(ValueError): + # No children + pa.Array.from_buffers(ty, 3, [None, buffers[1]]) + + children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]), + pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])] + copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children) + assert copied.equals(array) + + with pytest.raises(ValueError): + # not enough many children + pa.Array.from_buffers(ty, 3, [buffers[0]], + children=children[:1]) def test_dictionary_from_numpy(): @@ -499,36 +572,6 @@ def test_union_array_slice(): assert arr[i:j].to_pylist() == lst[i:j] -def test_string_from_buffers(): - array = pa.array(["a", None, "b", "c"]) - - buffers = array.buffers() - copied = pa.StringArray.from_buffers( - len(array), buffers[1], buffers[2], buffers[0], array.null_count, - array.offset) - assert copied.to_pylist() == ["a", None, "b", "c"] - - copied = pa.StringArray.from_buffers( - len(array), buffers[1], buffers[2], buffers[0]) - assert copied.to_pylist() == ["a", None, "b", "c"] - - sliced = array[1:] - buffers = sliced.buffers() - copied = pa.StringArray.from_buffers( - len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset) - assert copied.to_pylist() == [None, "b", "c"] - assert copied.null_count == 1 - - # Slice but exclude all null entries so that we don't need to pass - # the null bitmap. - sliced = array[2:] - buffers = sliced.buffers() - copied = pa.StringArray.from_buffers( - len(sliced), buffers[1], buffers[2], None, -1, sliced.offset) - assert copied.to_pylist() == ["b", "c"] - assert copied.null_count == 0 - - def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case if isinstance(out_data, pa.Array): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 24feec758e8..cb58e50fbfc 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -124,6 +124,21 @@ cdef class DataType: raise ValueError("Non-fixed width type") return ty.bit_width() + @property + def num_children(self): + """ + The number of struct fields. + """ + return self.type.num_children() + + @property + def num_buffers(self): + """ + Number of data buffers required to construct Array type + excluding children + """ + return self.type.layout().bit_widths.size() + def __str__(self): return frombytes(self.type.ToString()) @@ -297,13 +312,6 @@ cdef class StructType(DataType): def __reduce__(self): return struct, (list(self),) - @property - def num_children(self): - """ - The number of struct fields. - """ - return self.type.num_children() - cdef class UnionType(DataType): """ @@ -313,13 +321,6 @@ cdef class UnionType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): DataType.init(self, type) - @property - def num_children(self): - """ - The number of union members. - """ - return self.type.num_children() - @property def mode(self): """ From ec0695d86a5ccfd0969b93fdeff918c40a351bae Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Jun 2019 12:56:26 -0500 Subject: [PATCH 2/2] Address code review feedback --- python/pyarrow/array.pxi | 2 ++ python/pyarrow/types.pxi | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 94a8ecd01e8..cce967efa32 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -579,6 +579,8 @@ cdef class Array(_PandasConvertible): offset : int, default 0 The array's logical offset (in values, not in bytes) from the start of each buffer + children : List[Array], default None + Nested type children with length matching type.num_children Returns ------- diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index cb58e50fbfc..9a927612e82 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -127,7 +127,7 @@ cdef class DataType: @property def num_children(self): """ - The number of struct fields. + The number of child fields. """ return self.type.num_children()