Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 43 additions & 25 deletions python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -226,24 +226,35 @@ cdef class ChunkedArray(_PandasConvertible):
def _to_pandas(self, options, **kwargs):
return _array_like_to_pandas(self, options)

def __array__(self, dtype=None):
def to_numpy(self):
"""
Return a NumPy copy of this array (experimental).

Returns
-------
array : numpy.ndarray
"""
cdef:
PyObject* out
PandasOptions c_options
object values

if self.type.id == _Type_EXTENSION:
return (
chunked_array(
[self.chunk(i).storage for i in range(self.num_chunks)]
).__array__(dtype)
storage_array = chunked_array(
[chunk.storage for chunk in self.iterchunks()],
type=self.type.storage_type
)
return storage_array.to_numpy()

with nogil:
check_status(libarrow.ConvertChunkedArrayToPandas(
c_options,
self.sp_chunked_array,
self, &out))
check_status(
ConvertChunkedArrayToPandas(
c_options,
self.sp_chunked_array,
self,
&out
)
)

# wrap_array_output uses pandas to convert to Categorical, here
# always convert to numpy array
Expand All @@ -252,6 +263,10 @@ cdef class ChunkedArray(_PandasConvertible):
if isinstance(values, dict):
values = np.take(values['dictionary'], values['indices'])

return values

def __array__(self, dtype=None):
    """
    NumPy array protocol: materialize this chunked array as an ndarray.

    Parameters
    ----------
    dtype : numpy dtype, optional
        If given, the converted array is cast to this dtype before being
        returned; otherwise the conversion result is returned as-is.
    """
    result = self.to_numpy()
    if dtype is not None:
        result = result.astype(dtype)
    return result
Expand Down Expand Up @@ -416,7 +431,6 @@ def chunked_array(arrays, type=None):
Must all be the same data type. Can be empty only if type also passed.
type : DataType or string coercible to DataType


Returns
-------
ChunkedArray
Expand All @@ -425,31 +439,35 @@ def chunked_array(arrays, type=None):
Array arr
vector[shared_ptr[CArray]] c_arrays
shared_ptr[CChunkedArray] sp_chunked_array
shared_ptr[CDataType] sp_data_type

type = ensure_type(type, allow_none=True)

if isinstance(arrays, Array):
arrays = [arrays]

for x in arrays:
if isinstance(x, Array):
arr = x
if type is not None:
assert x.type == type
arr = x if isinstance(x, Array) else array(x, type=type)

if type is None:
# This allows more flexible chunked array construction by coercing
# subsequent arrays to the type inferred from the first chunk;
# it also spares the inference overhead after the first chunk
type = arr.type
else:
arr = array(x, type=type)
if arr.type != type:
raise TypeError(
"All array chunks must have type {}".format(type)
)

c_arrays.push_back(arr.sp_array)

if type:
type = ensure_type(type)
sp_data_type = pyarrow_unwrap_data_type(type)
sp_chunked_array.reset(new CChunkedArray(c_arrays, sp_data_type))
else:
if c_arrays.size() == 0:
raise ValueError("When passing an empty collection of arrays "
"you must also pass the data type")
sp_chunked_array.reset(new CChunkedArray(c_arrays))
if c_arrays.size() == 0 and type is None:
raise ValueError("When passing an empty collection of arrays "
"you must also pass the data type")

sp_chunked_array.reset(
new CChunkedArray(c_arrays, pyarrow_unwrap_data_type(type))
)
with nogil:
check_status(sp_chunked_array.get().Validate())

Expand Down
21 changes: 17 additions & 4 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,20 @@ def test_to_numpy():
np.testing.assert_array_equal(result, expected)

# chunked array
charr = pa.chunked_array([arr])

result = np.asarray(charr)
np.testing.assert_array_equal(result, expected)
a1 = pa.chunked_array([arr, arr])
a2 = pa.chunked_array([arr, arr], type=period_type)
expected = np.hstack([expected, expected])

for charr in [a1, a2]:
assert charr.type == period_type
for result in [np.asarray(charr), charr.to_numpy()]:
assert result.dtype == np.int64
np.testing.assert_array_equal(result, expected)

# zero chunks
charr = pa.chunked_array([], type=period_type)
assert charr.type == period_type

for result in [np.asarray(charr), charr.to_numpy()]:
assert result.dtype == np.int64
np.testing.assert_array_equal(result, np.array([], dtype='int64'))
74 changes: 72 additions & 2 deletions python/pyarrow/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,79 @@ def test_chunked_array_basics():
assert wr() is None


def test_chunked_array_construction():
    """pa.chunked_array(): type inference from chunks, coercion to an
    explicit type, and the empty-input corner cases."""
    cases = [
        # (chunks, explicit type, expected type, length, num_chunks)
        ([[1, 2, 3], [4, 5, 6], [7, 8, 9]], None, pa.int64(), 9, 3),
        ([[1, 2, 3], [4., 5., 6.], [7, 8, 9]], None, pa.int64(), 9, 3),
        ([[1, 2, 3], [4., 5., 6.], [7, 8, 9]], pa.int8(), pa.int8(), 9, 3),
        ([[1, 2, 3], []], None, pa.int64(), 3, 2),
    ]
    for chunks, explicit_type, expected_type, length, num_chunks in cases:
        ca = pa.chunked_array(chunks, type=explicit_type)
        assert ca.type == expected_type
        assert len(ca) == length
        assert len(ca.chunks) == num_chunks

    # An empty collection of arrays is only valid with an explicit type.
    msg = (
        "When passing an empty collection of arrays you must also pass the "
        "data type"
    )
    with pytest.raises(ValueError, match=msg):
        assert pa.chunked_array([])

    assert pa.chunked_array([], type=pa.string()).type == pa.string()
    # A single empty chunk still allows inference (null type by default).
    assert pa.chunked_array([[]]).type == pa.null()
    assert pa.chunked_array([[]], type=pa.string()).type == pa.string()


def test_chunked_array_to_numpy():
    """to_numpy() and the __array__ protocol agree for a multi-chunk array
    that includes an empty chunk."""
    chunked = pa.chunked_array([
        [1, 2, 3],
        [4, 5, 6],
        [],
    ])

    via_protocol = np.asarray(chunked)
    via_method = chunked.to_numpy()

    assert isinstance(via_method, np.ndarray)
    assert via_method.shape == (6,)
    assert np.array_equal(via_protocol, via_method)


def test_chunked_array_mismatch_types():
    """chunked_array() must reject chunks whose types disagree.

    NOTE(review): the previous ``pa.ArrowInvalid`` expectation for mixed
    int/string chunks was a leftover from the old behavior; with the
    explicit type check in ``chunked_array`` the mismatch now raises
    ``TypeError``, so that stale case is removed.
    """
    with pytest.raises(TypeError):
        # Given array types are different
        pa.chunked_array([
            pa.array([1, 2, 3]),
            pa.array([1., 2., 3.])
        ])

    with pytest.raises(TypeError):
        # Given array type is different from explicit type argument
        pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64())


def test_chunked_array_str():
Expand Down