Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 32 additions & 21 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -561,9 +561,10 @@ cdef class Array(_PandasConvertible):
(_reduce_array_data(self.sp_array.get().data().get()),)

@staticmethod
def from_buffers(DataType type, length, buffers, null_count=-1, offset=0):
def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
children=None):
"""
Construct an Array from a sequence of buffers. The concrete type
Construct an Array from a sequence of buffers. The concrete type
returned depends on the datatype.

Parameters
Expand All @@ -578,26 +579,45 @@ cdef class Array(_PandasConvertible):
offset : int, default 0
The array's logical offset (in values, not in bytes) from the
start of each buffer
children : List[Array], default None
Nested type children with length matching type.num_children

Returns
-------
array : Array
"""
cdef:
Buffer buf
Array child
vector[shared_ptr[CBuffer]] c_buffers
shared_ptr[CArrayData] ad
vector[shared_ptr[CArrayData]] c_child_data
shared_ptr[CArrayData] array_data

if not is_primitive(type.id):
raise NotImplementedError("from_buffers is only supported for "
"primitive arrays yet.")
children = children or []

if type.num_children != len(children):
raise ValueError("Type's expected number of children "
"({0}) did not match the passed number "
"({1}).".format(type.num_children, len(children)))

if type.num_buffers != len(buffers):
raise ValueError("Type's expected number of buffers "
"({0}) did not match the passed number "
"({1}).".format(type.num_buffers, len(buffers)))

for buf in buffers:
# None will produce a null buffer pointer
c_buffers.push_back(pyarrow_unwrap_buffer(buf))
ad = CArrayData.Make(type.sp_type, length, c_buffers,
null_count, offset)
return pyarrow_wrap_array(MakeArray(ad))

for child in children:
c_child_data.push_back(child.ap.data())

array_data = CArrayData.MakeWithChildren(type.sp_type, length,
c_buffers, c_child_data,
null_count, offset)
cdef Array result = pyarrow_wrap_array(MakeArray(array_data))
result.validate()
return result

@property
def null_count(self):
Expand Down Expand Up @@ -1214,18 +1234,9 @@ cdef class StringArray(Array):
-------
string_array : StringArray
"""
cdef shared_ptr[CBuffer] c_null_bitmap
cdef shared_ptr[CArray] out

if null_bitmap is not None:
c_null_bitmap = null_bitmap.buffer
else:
null_count = 0

out.reset(new CStringArray(
length, value_offsets.buffer, data.buffer, c_null_bitmap,
null_count, offset))
return pyarrow_wrap_array(out)
return Array.from_buffers(utf8(), length,
[null_bitmap, value_offsets, data],
null_count, offset)


cdef class BinaryArray(Array):
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
TimeUnit_MICRO" arrow::TimeUnit::MICRO"
TimeUnit_NANO" arrow::TimeUnit::NANO"

cdef cppclass CDataTypeLayout" arrow::DataTypeLayout":
vector[int64_t] bit_widths
c_bool has_dictionary

cdef cppclass CDataType" arrow::DataType":
Type id()

Expand All @@ -94,6 +98,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:

int num_children()

CDataTypeLayout layout()

c_string ToString()

c_bool is_primitive(Type type)
Expand Down
107 changes: 75 additions & 32 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,81 @@ def test_array_from_buffers():
with pytest.raises(TypeError):
pa.Array.from_buffers(pa.int16(), 3, [u'', u''], offset=1)

with pytest.raises(NotImplementedError):
pa.Array.from_buffers(pa.list_(pa.int16()), 4, [None, values_buf])

def test_string_binary_from_buffers():
array = pa.array(["a", None, "b", "c"])

buffers = array.buffers()
copied = pa.StringArray.from_buffers(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I hadn't noticed that StringArray had a different from_buffers method.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Frankly I question whether it is useful, I am inclined to remove it

@xhochy

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leaving it here for now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this, it is probably only used by fletcher and pyarrow.jvm.

len(array), buffers[1], buffers[2], buffers[0], array.null_count,
array.offset)
assert copied.to_pylist() == ["a", None, "b", "c"]

binary_copy = pa.Array.from_buffers(pa.binary(), len(array),
array.buffers(), array.null_count,
array.offset)
assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"]

copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0])
assert copied.to_pylist() == ["a", None, "b", "c"]

sliced = array[1:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
assert copied.to_pylist() == [None, "b", "c"]
assert copied.null_count == 1

# Slice but exclude all null entries so that we don't need to pass
# the null bitmap.
sliced = array[2:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
assert copied.to_pylist() == ["b", "c"]
assert copied.null_count == 0


def test_list_from_buffers():
ty = pa.list_(pa.int16())
array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)

buffers = array.buffers()

with pytest.raises(ValueError):
# No children
pa.Array.from_buffers(ty, 4, [None, buffers[1]])

child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child])
assert copied.equals(array)

with pytest.raises(ValueError):
# too many children
pa.Array.from_buffers(ty, 4, [None, buffers[1]],
children=[child, child])


def test_struct_from_buffers():
ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())])
array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}],
type=ty)
buffers = array.buffers()

with pytest.raises(ValueError):
# No children
pa.Array.from_buffers(ty, 3, [None, buffers[1]])

children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]),
pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])]
copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children)
assert copied.equals(array)

with pytest.raises(ValueError):
# not enough many children
pa.Array.from_buffers(ty, 3, [buffers[0]],
children=children[:1])


def test_dictionary_from_numpy():
Expand Down Expand Up @@ -499,36 +572,6 @@ def test_union_array_slice():
assert arr[i:j].to_pylist() == lst[i:j]


def test_string_from_buffers():
array = pa.array(["a", None, "b", "c"])

buffers = array.buffers()
copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0], array.null_count,
array.offset)
assert copied.to_pylist() == ["a", None, "b", "c"]

copied = pa.StringArray.from_buffers(
len(array), buffers[1], buffers[2], buffers[0])
assert copied.to_pylist() == ["a", None, "b", "c"]

sliced = array[1:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
assert copied.to_pylist() == [None, "b", "c"]
assert copied.null_count == 1

# Slice but exclude all null entries so that we don't need to pass
# the null bitmap.
sliced = array[2:]
buffers = sliced.buffers()
copied = pa.StringArray.from_buffers(
len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
assert copied.to_pylist() == ["b", "c"]
assert copied.null_count == 0


def _check_cast_case(case, safe=True):
in_data, in_type, out_data, out_type = case
if isinstance(out_data, pa.Array):
Expand Down
29 changes: 15 additions & 14 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,21 @@ cdef class DataType:
raise ValueError("Non-fixed width type")
return ty.bit_width()

@property
def num_children(self):
"""
The number of child fields.
"""
return self.type.num_children()

@property
def num_buffers(self):
"""
Number of data buffers required to construct Array type
excluding children
"""
return self.type.layout().bit_widths.size()

def __str__(self):
return frombytes(self.type.ToString())

Expand Down Expand Up @@ -297,13 +312,6 @@ cdef class StructType(DataType):
def __reduce__(self):
return struct, (list(self),)

@property
def num_children(self):
"""
The number of struct fields.
"""
return self.type.num_children()


cdef class UnionType(DataType):
"""
Expand All @@ -313,13 +321,6 @@ cdef class UnionType(DataType):
cdef void init(self, const shared_ptr[CDataType]& type):
DataType.init(self, type)

@property
def num_children(self):
"""
The number of union members.
"""
return self.type.num_children()

@property
def mode(self):
"""
Expand Down