diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 37064a51dc5..b8205a31c02 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -226,24 +226,35 @@ cdef class ChunkedArray(_PandasConvertible): def _to_pandas(self, options, **kwargs): return _array_like_to_pandas(self, options) - def __array__(self, dtype=None): + def to_numpy(self): + """ + Return a NumPy copy of this array (experimental). + + Returns + ------- + array : numpy.ndarray + """ cdef: PyObject* out PandasOptions c_options object values if self.type.id == _Type_EXTENSION: - return ( - chunked_array( - [self.chunk(i).storage for i in range(self.num_chunks)] - ).__array__(dtype) + storage_array = chunked_array( + [chunk.storage for chunk in self.iterchunks()], + type=self.type.storage_type ) + return storage_array.to_numpy() with nogil: - check_status(libarrow.ConvertChunkedArrayToPandas( - c_options, - self.sp_chunked_array, - self, &out)) + check_status( + ConvertChunkedArrayToPandas( + c_options, + self.sp_chunked_array, + self, + &out + ) + ) # wrap_array_output uses pandas to convert to Categorical, here # always convert to numpy array @@ -252,6 +263,10 @@ cdef class ChunkedArray(_PandasConvertible): if isinstance(values, dict): values = np.take(values['dictionary'], values['indices']) + return values + + def __array__(self, dtype=None): + values = self.to_numpy() if dtype is None: return values return values.astype(dtype) @@ -416,7 +431,6 @@ def chunked_array(arrays, type=None): Must all be the same data type. Can be empty only if type also passed. 
type : DataType or string coercible to DataType - Returns ------- ChunkedArray @@ -425,31 +439,35 @@ def chunked_array(arrays, type=None): Array arr vector[shared_ptr[CArray]] c_arrays shared_ptr[CChunkedArray] sp_chunked_array - shared_ptr[CDataType] sp_data_type + + type = ensure_type(type, allow_none=True) if isinstance(arrays, Array): arrays = [arrays] for x in arrays: - if isinstance(x, Array): - arr = x - if type is not None: - assert x.type == type + arr = x if isinstance(x, Array) else array(x, type=type) + + if type is None: + # adopt the type of the first chunk when none was given, so that + # subsequent non-Array chunks are coerced to that inferred type; + # this also spares the inference overhead after the first chunk + type = arr.type else: - arr = array(x, type=type) + if arr.type != type: + raise TypeError( + "All array chunks must have type {}".format(type) + ) c_arrays.push_back(arr.sp_array) - if type: - type = ensure_type(type) - sp_data_type = pyarrow_unwrap_data_type(type) - sp_chunked_array.reset(new CChunkedArray(c_arrays, sp_data_type)) - else: - if c_arrays.size() == 0: - raise ValueError("When passing an empty collection of arrays " - "you must also pass the data type") - sp_chunked_array.reset(new CChunkedArray(c_arrays)) + if c_arrays.size() == 0 and type is None: + raise ValueError("When passing an empty collection of arrays " + "you must also pass the data type") + sp_chunked_array.reset( + new CChunkedArray(c_arrays, pyarrow_unwrap_data_type(type)) + ) with nogil: check_status(sp_chunked_array.get().Validate()) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index dafa4f0d8f3..a3ef336a22f 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -482,7 +482,20 @@ def test_to_numpy(): np.testing.assert_array_equal(result, expected) # chunked array - charr = pa.chunked_array([arr]) - - result = np.asarray(charr) - 
np.testing.assert_array_equal(result, expected) + a1 = pa.chunked_array([arr, arr]) + a2 = pa.chunked_array([arr, arr], type=period_type) + expected = np.hstack([expected, expected]) + + for charr in [a1, a2]: + assert charr.type == period_type + for result in [np.asarray(charr), charr.to_numpy()]: + assert result.dtype == np.int64 + np.testing.assert_array_equal(result, expected) + + # zero chunks + charr = pa.chunked_array([], type=period_type) + assert charr.type == period_type + + for result in [np.asarray(charr), charr.to_numpy()]: + assert result.dtype == np.int64 + np.testing.assert_array_equal(result, np.array([], dtype='int64')) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index af32e180ec1..4012a919548 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -57,9 +57,79 @@ def test_chunked_array_basics(): assert wr() is None +def test_chunked_array_construction(): + arr = pa.chunked_array([ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], + ]) + assert arr.type == pa.int64() + assert len(arr) == 9 + assert len(arr.chunks) == 3 + + arr = pa.chunked_array([ + [1, 2, 3], + [4., 5., 6.], + [7, 8, 9], + ]) + assert arr.type == pa.int64() + assert len(arr) == 9 + assert len(arr.chunks) == 3 + + arr = pa.chunked_array([ + [1, 2, 3], + [4., 5., 6.], + [7, 8, 9], + ], type=pa.int8()) + assert arr.type == pa.int8() + assert len(arr) == 9 + assert len(arr.chunks) == 3 + + arr = pa.chunked_array([ + [1, 2, 3], + [] + ]) + assert arr.type == pa.int64() + assert len(arr) == 3 + assert len(arr.chunks) == 2 + + msg = ( + "When passing an empty collection of arrays you must also pass the " + "data type" + ) + with pytest.raises(ValueError, match=msg): + assert pa.chunked_array([]) + + assert pa.chunked_array([], type=pa.string()).type == pa.string() + assert pa.chunked_array([[]]).type == pa.null() + assert pa.chunked_array([[]], type=pa.string()).type == pa.string() + + +def test_chunked_array_to_numpy(): 
+ data = pa.chunked_array([ + [1, 2, 3], + [4, 5, 6], + [] + ]) + arr1 = np.asarray(data) + arr2 = data.to_numpy() + + assert isinstance(arr2, np.ndarray) + assert arr2.shape == (6,) + assert np.array_equal(arr1, arr2) + + def test_chunked_array_mismatch_types(): - with pytest.raises(pa.ArrowInvalid): - pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])]) + with pytest.raises(TypeError): + # Given array types are different + pa.chunked_array([ + pa.array([1, 2, 3]), + pa.array([1., 2., 3.]) + ]) + + with pytest.raises(TypeError): + # Given array type is different from explicit type argument + pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64()) def test_chunked_array_str():