From 72b1352e9e749c791d624af54b7f0341b9969044 Mon Sep 17 00:00:00 2001
From: Spencer Nelson
Date: Tue, 30 May 2023 23:16:24 -0700
Subject: [PATCH] Add FixedSizeListArray.to,from_numpy_ndarray

---
Note: this patch was edited after export (to_numpy_ndarray now honours the
slice offset); the blob hashes on the "index" lines are those of the export.

 python/pyarrow/array.pxi           | 60 +++++++++++++++++++++++++++++
 python/pyarrow/tests/test_array.py | 62 ++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 49ae9ceb36a..248e9368dc9 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2294,6 +2294,66 @@ cdef class FixedSizeListArray(BaseListArray):
         result.validate()
         return result
 
+    @staticmethod
+    def from_numpy_ndarray(obj):
+        """
+        Convert a 2D numpy ndarray to a fixed size list array.
+
+        The first dimension of the ndarray is the array length, and the second
+        dimension is the list size.
+
+        The numpy array needs to be C-contiguous.
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+            The numpy array to convert.
+
+        Returns
+        -------
+        FixedSizeListArray
+
+        Raises
+        ------
+        ValueError
+            If the array is not C-contiguous, or if the list size (the
+            second dimension) is zero.
+        NotImplementedError
+            If the array is not 2-dimensional.
+        """
+        if not obj.flags["C_CONTIGUOUS"]:
+            raise ValueError("The data in the numpy array needs to be in a single, "
+                             "C-style contiguous segment.")
+
+        if len(obj.shape) != 2:
+            raise NotImplementedError("Only 2D numpy arrays are supported")
+
+        list_size = obj.shape[1]
+        if list_size <= 0:
+            raise ValueError("The list size needs to be positive")
+        array = np.ravel(obj, order="C")
+
+        return FixedSizeListArray.from_arrays(array, list_size=list_size)
+
+    def to_numpy_ndarray(self):
+        """
+        Output self as a 2D numpy ndarray.
+
+        The first dimension of the result is the array length, and the second
+        dimension is the list size. Only the rows covered by this array are
+        returned, so a sliced array yields just the rows of the slice.
+
+        Returns
+        -------
+        numpy.ndarray
+        """
+        list_size = self.type.list_size
+        # ``values`` is the whole child array and ignores this array's
+        # offset, so select only the window that this array covers.
+        flat = self.values.slice(self.offset * list_size,
+                                 len(self) * list_size)
+        return np.asarray(flat).reshape((len(self), list_size))
+
     @property
     def values(self):
         cdef CFixedSizeListArray* arr = self.ap
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index fe272151e81..73639c3fde3 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2825,6 +2825,68 @@ def test_fixed_size_list_array_flatten_with_slice():
     assert array[2:].flatten() == pa.array([3], type=pa.float64())
 
 
+def test_fixed_size_list_array_from_numpy_ndarray():
+    values = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.int64)
+    result = pa.FixedSizeListArray.from_numpy_ndarray(values)
+    assert result.to_pylist() == [[1, 2], [3, 4], [5, 6]]
+    assert result.type == pa.list_(pa.int64(), 2)
+
+    # String-types should work
+    values = np.array([["a", "b"], ["c", "d"], ["e", "f"]], dtype=np.object_)
+    result = pa.FixedSizeListArray.from_numpy_ndarray(values)
+    assert result.to_pylist() == [["a", "b"], ["c", "d"], ["e", "f"]]
+    assert result.type == pa.list_(pa.string(), 2)
+
+    values = np.ones((0, 5), dtype=np.int64)
+    result = pa.FixedSizeListArray.from_numpy_ndarray(values)
+    assert result.to_pylist() == []
+    assert result.type == pa.list_(pa.int64(), 5)
+
+    # Zero-size lists are not permitted
+    with pytest.raises(ValueError):
+        values = np.ones((5, 0), dtype=np.int64)
+        result = pa.FixedSizeListArray.from_numpy_ndarray(values)
+
+    # 1D array is not supported
+    with pytest.raises(NotImplementedError):
+        values = np.ones(5, dtype=np.int64)
+        pa.FixedSizeListArray.from_numpy_ndarray(values)
+
+    # 3D array is not supported
+    with pytest.raises(NotImplementedError):
+        values = np.ones((3, 3, 3), dtype=np.int64)
+        pa.FixedSizeListArray.from_numpy_ndarray(values)
+
+    # Data must be C-contiguous
+    with pytest.raises(ValueError):
+        values = np.ones((3, 3), order='F', dtype=np.int64)
+        pa.FixedSizeListArray.from_numpy_ndarray(values)
+
+
+def test_fixed_size_list_array_to_numpy_ndarray():
+    array = pa.array([[1, 2], [3, 4], [5, 6]],
+                     type=pa.list_(pa.int64(), list_size=2))
+    result = array.to_numpy_ndarray()
+    expected = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.int64)
+    np.testing.assert_array_equal(result, expected)
+
+    # An array with a null value.
+    array = pa.array([[1, 2], None, [5, 6]],
+                     type=pa.list_(pa.float64(), list_size=2))
+    result = array.to_numpy_ndarray()
+
+    # The null entry should be expanded to multiple nulls.
+    expected = np.array([[1.0, 2.0], [np.nan, np.nan], [5, 6]], dtype=np.float64)
+    np.testing.assert_array_equal(result, expected)
+
+    # A sliced array should only expose the rows of the slice.
+    array = pa.array([[1, 2], [3, 4], [5, 6]],
+                     type=pa.list_(pa.int64(), list_size=2))
+    result = array[1:].to_numpy_ndarray()
+    expected = np.array([[3, 4], [5, 6]], dtype=np.int64)
+    np.testing.assert_array_equal(result, expected)
+
+
 def test_map_array_values_offsets():
     ty = pa.map_(pa.utf8(), pa.int32())
     ty_values = pa.struct([pa.field("key", pa.utf8(), nullable=False),