From 7bcbbdde2245efea6534d2cbf8cd45395c521be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 30 Jul 2020 16:34:46 +0200 Subject: [PATCH 1/3] more flexible chunked array construction; ChunkedArray.to_numpy --- python/pyarrow/table.pxi | 69 +++++++++++++-------- python/pyarrow/tests/test_extension_type.py | 6 +- python/pyarrow/tests/test_table.py | 57 +++++++++++++++++ 3 files changed, 105 insertions(+), 27 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 37064a51dc5..9e0dca0a9a2 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -226,24 +226,34 @@ cdef class ChunkedArray(_PandasConvertible): def _to_pandas(self, options, **kwargs): return _array_like_to_pandas(self, options) - def __array__(self, dtype=None): + def to_numpy(self): + """ + Return a NumPy copy of this array (experimental). + + Returns + ------- + array : numpy.ndarray + """ cdef: PyObject* out PandasOptions c_options object values if self.type.id == _Type_EXTENSION: - return ( - chunked_array( - [self.chunk(i).storage for i in range(self.num_chunks)] - ).__array__(dtype) - ) + storage_array = chunked_array([ + chunk.storage for chunk in self.iterchunks() + ]) + return storage_array.to_numpy() with nogil: - check_status(libarrow.ConvertChunkedArrayToPandas( - c_options, - self.sp_chunked_array, - self, &out)) + check_status( + ConvertChunkedArrayToPandas( + c_options, + self.sp_chunked_array, + self, + &out + ) + ) # wrap_array_output uses pandas to convert to Categorical, here # always convert to numpy array @@ -252,6 +262,10 @@ cdef class ChunkedArray(_PandasConvertible): if isinstance(values, dict): values = np.take(values['dictionary'], values['indices']) + return values + + def __array__(self, dtype=None): + values = self.to_numpy() if dtype is None: return values return values.astype(dtype) @@ -416,7 +430,6 @@ def chunked_array(arrays, type=None): Must all be the same data type. 
Can be empty only if type also passed. type : DataType or string coercible to DataType - Returns ------- ChunkedArray @@ -425,31 +438,35 @@ def chunked_array(arrays, type=None): Array arr vector[shared_ptr[CArray]] c_arrays shared_ptr[CChunkedArray] sp_chunked_array - shared_ptr[CDataType] sp_data_type + + type = ensure_type(type, allow_none=True) if isinstance(arrays, Array): arrays = [arrays] for x in arrays: - if isinstance(x, Array): - arr = x - if type is not None: - assert x.type == type + arr = x if isinstance(x, Array) else array(x, type=type) + + if type is None: + # coercing later chunks to the type inferred from the first one + # allows more flexible chunked array construction; + # it also spares the inference overhead after the first chunk + type = arr.type else: - arr = array(x, type=type) + if arr.type != type: + raise ArrowInvalid( + "Each array chunks must have type {}".format(type) + ) c_arrays.push_back(arr.sp_array) - if type: - type = ensure_type(type) - sp_data_type = pyarrow_unwrap_data_type(type) - sp_chunked_array.reset(new CChunkedArray(c_arrays, sp_data_type)) - else: - if c_arrays.size() == 0: - raise ValueError("When passing an empty collection of arrays " - "you must also pass the data type") - sp_chunked_array.reset(new CChunkedArray(c_arrays)) + if c_arrays.size() == 0 and type is None: + raise ValueError("When passing an empty collection of arrays " + "you must also pass the data type") + sp_chunked_array.reset( + new CChunkedArray(c_arrays, pyarrow_unwrap_data_type(type)) + ) with nogil: check_status(sp_chunked_array.get().Validate()) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index dafa4f0d8f3..75c4ec2d8ac 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -482,7 +482,11 @@ def test_to_numpy(): np.testing.assert_array_equal(result, expected) # chunked array - charr = 
pa.chunked_array([arr, arr]) + expected = np.hstack([expected, expected]) result = np.asarray(charr) np.testing.assert_array_equal(result, expected) + + result = charr.to_numpy() + np.testing.assert_array_equal(result, expected) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index af32e180ec1..896ab9811d7 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -57,6 +57,63 @@ def test_chunked_array_basics(): assert wr() is None +def test_chunked_array_construction(): + arr = pa.chunked_array([ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], + ]) + assert arr.type == pa.int64() + assert len(arr) == 9 + assert len(arr.chunks) == 3 + + arr = pa.chunked_array([ + [1, 2, 3], + [4., 5., 6.], + [7, 8, 9], + ]) + assert arr.type == pa.int64() + assert len(arr) == 9 + assert len(arr.chunks) == 3 + + arr = pa.chunked_array([ + [1, 2, 3], + [4., 5., 6.], + [7, 8, 9], + ], type=pa.int8()) + assert arr.type == pa.int8() + assert len(arr) == 9 + assert len(arr.chunks) == 3 + + arr = pa.chunked_array([ + [1, 2, 3], + [] + ]) + assert arr.type == pa.int64() + assert len(arr) == 3 + assert len(arr.chunks) == 2 + + with pytest.raises(pa.ArrowInvalid): + pa.chunked_array([ + pa.array([1, 2, 3]), + pa.array([1., 2., 3.]) + ]) + + +def test_chunked_array_to_numpy(): + data = pa.chunked_array([ + [1, 2, 3], + [4, 5, 6], + [] + ]) + arr1 = np.asarray(data) + arr2 = data.to_numpy() + + assert isinstance(arr2, np.ndarray) + assert arr2.shape == (6,) + assert np.array_equal(arr1, arr2) + + def test_chunked_array_mismatch_types(): with pytest.raises(pa.ArrowInvalid): pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])]) From 94627eaaced55d937c8367d822fa5fdbf489b8e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 10 Aug 2020 14:06:14 +0200 Subject: [PATCH 2/3] address review comment --- python/pyarrow/table.pxi | 9 +++++---- python/pyarrow/tests/test_extension_type.py | 19 
++++++++++++++----- python/pyarrow/tests/test_table.py | 11 +++++++++++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9e0dca0a9a2..d0ae5f24029 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -240,9 +240,10 @@ cdef class ChunkedArray(_PandasConvertible): object values if self.type.id == _Type_EXTENSION: - storage_array = chunked_array([ - chunk.storage for chunk in self.iterchunks() - ]) + storage_array = chunked_array( + [chunk.storage for chunk in self.iterchunks()], + type=self.type.storage_type + ) return storage_array.to_numpy() with nogil: @@ -455,7 +456,7 @@ def chunked_array(arrays, type=None): else: if arr.type != type: raise ArrowInvalid( - "Each array chunks must have type {}".format(type) + "All array chunks must have type {}".format(type) ) c_arrays.push_back(arr.sp_array) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 75c4ec2d8ac..a3ef336a22f 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -482,11 +482,20 @@ def test_to_numpy(): np.testing.assert_array_equal(result, expected) # chunked array - charr = pa.chunked_array([arr, arr]) + a1 = pa.chunked_array([arr, arr]) + a2 = pa.chunked_array([arr, arr], type=period_type) expected = np.hstack([expected, expected]) - result = np.asarray(charr) - np.testing.assert_array_equal(result, expected) + for charr in [a1, a2]: + assert charr.type == period_type + for result in [np.asarray(charr), charr.to_numpy()]: + assert result.dtype == np.int64 + np.testing.assert_array_equal(result, expected) - result = charr.to_numpy() - np.testing.assert_array_equal(result, expected) + # zero chunks + charr = pa.chunked_array([], type=period_type) + assert charr.type == period_type + + for result in [np.asarray(charr), charr.to_numpy()]: + assert result.dtype == np.int64 + 
np.testing.assert_array_equal(result, np.array([], dtype='int64')) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 896ab9811d7..cd45d718b22 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -99,6 +99,17 @@ def test_chunked_array_construction(): pa.array([1., 2., 3.]) ]) + msg = ( + "When passing an empty collection of arrays you must also pass the " + "data type" + ) + with pytest.raises(ValueError, match=msg): + assert pa.chunked_array([]) + + assert pa.chunked_array([], type=pa.string()).type == pa.string() + assert pa.chunked_array([[]]).type == pa.null() + assert pa.chunked_array([[]], type=pa.string()).type == pa.string() + def test_chunked_array_to_numpy(): data = pa.chunked_array([ From 5bebad7bfdfaf58c77a81452241094bdcb1ad72f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 10 Aug 2020 14:42:27 +0200 Subject: [PATCH 3/3] Raise TypeError, not ValueError, on invalid type --- python/pyarrow/table.pxi | 2 +- python/pyarrow/tests/test_table.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d0ae5f24029..b8205a31c02 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -455,7 +455,7 @@ def chunked_array(arrays, type=None): type = arr.type else: if arr.type != type: - raise ArrowInvalid( + raise TypeError( "All array chunks must have type {}".format(type) ) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index cd45d718b22..4012a919548 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -93,12 +93,6 @@ def test_chunked_array_construction(): assert len(arr) == 3 assert len(arr.chunks) == 2 - with pytest.raises(pa.ArrowInvalid): - pa.chunked_array([ - pa.array([1, 2, 3]), - pa.array([1., 2., 3.]) - ]) - msg = ( "When passing an empty collection of arrays you must also pass the " "data 
type" @@ -126,8 +120,16 @@ def test_chunked_array_to_numpy(): def test_chunked_array_mismatch_types(): - with pytest.raises(pa.ArrowInvalid): - pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])]) + with pytest.raises(TypeError): + # Given array types are different + pa.chunked_array([ + pa.array([1, 2, 3]), + pa.array([1., 2., 3.]) + ]) + + with pytest.raises(TypeError): + # Given array type is different from explicit type argument + pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64()) def test_chunked_array_str():