From d69685a5690ada934a4045cfd0ccf5bd22da8466 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 4 Apr 2023 13:10:26 +0200 Subject: [PATCH 01/24] Add bindings and tests for FixedShapeTensorType and Array --- python/pyarrow/__init__.py | 4 +- python/pyarrow/array.pxi | 64 ++++++++++ python/pyarrow/includes/libarrow.pxd | 25 ++++ python/pyarrow/lib.pxd | 5 + python/pyarrow/public-api.pxi | 3 + python/pyarrow/tests/test_extension_type.py | 83 +++++++++++++ python/pyarrow/types.pxi | 129 ++++++++++++++++++++ 7 files changed, 311 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 5c2c7dcc490..d98e490fe53 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -178,7 +178,7 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, + RunEndEncodedType, FixedShapeTensorType, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -209,7 +209,7 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, + RunEndEncodedArray, FixedShapeTensorArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index d5ba45db20e..e39a8cba02e 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3090,6 +3090,70 @@ cdef class ExtensionArray(Array): return self.storage.to_numpy(**kwargs) +class FixedShapeTensorArray(ExtensionArray): + """ + Concrete class for fixed shape tensor extension arrays. 
+ + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = FixedShapeTensorType(pa.int32(), [2, 2]) + + Create an extension array + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> pa.ExtensionArray.from_storage(tensor_type, storage) + + [ + [ + 1, + 2, + 3, + 4 + ], + [ + 10, + 20, + 30, + 40 + ], + [ + 100, + 200, + 300, + 400 + ] + ] + """ + def to_numpy_ndarray(self): + """ + Convert fixed shape tensor extension array to a numpy array (with dim+1). + """ + np_flat = np.asarray(self.storage.values) + numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape), + order='C') + + return numpy_tensor + + def from_numpy_ndarray(obj): + """ + Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + """ + numpy_type = obj.flatten().dtype + arrow_type = from_numpy_dtype(numpy_type) + shape = obj.shape[1:] + size = obj.size / obj.shape[0] + + return ExtensionArray.from_storage( + FixedShapeTensorType(arrow_type, shape), + array([t.flatten() for t in obj], + list_(arrow_type, size)) + ) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 798f0e8395d..09b8add5427 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2619,6 +2619,31 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() +cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": + cdef cppclass CFixedShapeTensorType \ + " arrow::extension::FixedShapeTensorType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type, + const vector[int64_t]& shape, + const vector[int64_t]& permutation, + const vector[c_string]& dim_names) + + CResult[shared_ptr[CDataType]] 
Deserialize(const shared_ptr[CDataType] storage_type, + const c_string& serialized_data) const + + c_string Serialize() const + + const shared_ptr[CDataType] value_type() + const vector[int64_t] shape() + const vector[int64_t] permutation() + const vector[c_string] dim_names() + + CFixedShapeTensorType(shared_ptr[CDataType]& value_type, int32_t& size, + vector[int64_t]& shape, vector[int64_t]& permutation, + vector[c_string]& dim_names) + + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index d984475171f..54e14005f6d 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -199,6 +199,11 @@ cdef class ExtensionType(BaseExtensionType): const CPyExtensionType* cpy_ext_type +cdef class FixedShapeTensorType(BaseExtensionType): + cdef: + const CFixedShapeTensorType* tensor_ext_type + + cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index fadc659d455..91d5832754f 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -76,6 +76,7 @@ cdef api object pyarrow_wrap_data_type( cdef: const CExtensionType* ext_type const CPyExtensionType* cpy_ext_type + c_string tensor_name = tobytes("arrow.fixed_shape_tensor") DataType out if type.get() == NULL: @@ -118,6 +119,8 @@ cdef api object pyarrow_wrap_data_type( cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() + elif ext_type.extension_name() == tensor_name: + out = FixedShapeTensorType.__new__(FixedShapeTensorType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index fa7ece5bc24..ac27690f0a7 100644 --- 
a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1127,3 +1127,86 @@ def test_cpp_extension_in_python(tmpdir): reconstructed_array = batch.column(0) assert reconstructed_array.type == uuid_type assert reconstructed_array == array + + +def test_tensor_type(): + tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 3)) + assert tensor_type.extension_name == "arrow.fixed_shape_tensor" + assert tensor_type.storage_type == pa.list_(pa.int8(), 6) + + +def test_tensor_class_methods(): + tensor_type = pa.FixedShapeTensorType(pa.float32(), (2, 3)) + storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], + pa.list_(pa.float32(), 6)) + arr = pa.ExtensionArray.from_storage(tensor_type, storage) + expected = np.array( + [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) + + result = arr.to_numpy_ndarray() + np.testing.assert_array_equal(result, expected) + + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(expected) + assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) + assert tensor_array_from_numpy.type.value_type == pa.float32() + assert tensor_array_from_numpy.type.shape == [2, 3] + + +@pytest.mark.parametrize("tensor_type", ( + pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)), + pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]), + pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W']) +)) +def test_tensor_type_ipc(tensor_type): + storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) + arr = pa.ExtensionArray.from_storage(tensor_type, storage) + batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + + # check the built array has exactly the expected clss + tensor_class = tensor_type.__arrow_ext_class__() + assert type(arr) == tensor_class + + buf = ipc_write_batch(batch) + del batch + batch = ipc_read_batch(buf) + + result = batch.column(0) + # check the deserialized array class is the expected one + 
assert type(result) == tensor_class + assert result.type.extension_name == "arrow.fixed_shape_tensor" + assert arr.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]] + + # we get back an actual TensorType + assert isinstance(result.type, pa.FixedShapeTensorType) + assert result.type.value_type == pa.int8() + assert result.type.shape == [2, 2, 3] + + # using different parametrization as how it was registered + tensor_type_uint = tensor_type.__class__(pa.uint8(), (2, 3)) + assert tensor_type_uint.extension_name == "arrow.fixed_shape_tensor" + assert tensor_type_uint.value_type == pa.uint8() + assert tensor_type_uint.shape == [2, 3] + + storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], + pa.list_(pa.uint8(), 6)) + arr = pa.ExtensionArray.from_storage(tensor_type_uint, storage) + batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + + buf = ipc_write_batch(batch) + del batch + batch = ipc_read_batch(buf) + result = batch.column(0) + assert isinstance(result.type, pa.FixedShapeTensorType) + assert result.type.value_type == pa.uint8() + assert result.type.shape == [2, 3] + assert type(result) == tensor_class + + +def test_tensor_type_equality(): + tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)) + assert tensor_type.extension_name == "arrow.fixed_shape_tensor" + + tensor_type2 = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)) + tensor_type3 = pa.FixedShapeTensorType(pa.uint8(), (2, 2, 3)) + assert tensor_type == tensor_type2 + assert not tensor_type == tensor_type3 diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0e817066601..92b377490b8 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1494,6 +1494,135 @@ cdef class ExtensionType(BaseExtensionType): """ return ExtensionScalar + +cdef class FixedShapeTensorType(BaseExtensionType): + """ + Concrete class for fixed shape tensor extension type. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. 
+ shape : tuple + The physical shape of the contained tensors. + dim_names : tuple + Explicit names to tensor dimensions. + permutation : tuple + Indices of the desired ordering of the original dimensions. + + Examples + -------- + >>> import pyarrow as pa + + Create fixed shape tensor extension type: + + >>> tensor_type = pa.FixedShapeTensorType(pa.int32(), [2, 2]) + >>> tensor_type + FixedShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.shape + [2, 2] + + Create a fixed shape tensor extension type with names of tensor dimensions: + + >>> tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W']) + >>> tensor_type.dim_names + [b'C', b'H', b'W'] + + Create a fixed shape tensor extension type with permutation: + >>> tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ + + def __init__(self, DataType value_type, shape, dim_names=None, permutation=None): + """ + Initialize an fixed shape tensor extension type instance. + + This should be called at the end of the subclass' + ``__init__`` method. + """ + cdef: + vector[int64_t] c_shape + vector[int64_t] c_permutation + vector[c_string] c_dim_names + shared_ptr[CDataType] tensor_ext_type + + assert value_type is not None + assert shape is not None + + for i in shape: + c_shape.push_back(i) + + if permutation is not None: + for i in permutation: + c_permutation.push_back(i) + + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( + value_type.sp_type, c_shape, c_permutation, c_dim_names)) + + self.init(tensor_ext_type) + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.tensor_ext_type = type.get() + + @property + def value_type(self): + """ + Data type of an individual tensor. 
+ """ + return pyarrow_wrap_data_type(self.tensor_ext_type.value_type()) + + @property + def shape(self): + """ + Shape of the tensors. + """ + return self.tensor_ext_type.shape() + + @property + def dim_names(self): + """ + Explicit names of the dimensions. + """ + return self.tensor_ext_type.dim_names() + + @property + def permutation(self): + """ + Indices of the dimensions ordering. + """ + return self.tensor_ext_type.permutation() + + def __arrow_ext_serialize__(self): + """ + Serialized representation of metadata to reconstruct the type object. + """ + metadata = self.tensor_ext_type.Serialize() + return metadata + + @classmethod + def __arrow_ext_deserialize__(self, storage_type, serialized): + """ + Return an FixedShapeTensor type instance from the storage type and serialized + metadata. + """ + tensor_ext_type = self.tensor_ext_type.Deserialize(storage_type, serialized) + return tensor_ext_type + + def __arrow_ext_class__(self): + return FixedShapeTensorArray + + cdef class PyExtensionType(ExtensionType): """ Concrete base class for Python-defined extension types based on pickle From a6292f80c49ac4fd2441ce36063d99c59c146e26 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 07:49:43 +0200 Subject: [PATCH 02/24] Fix linter error --- python/pyarrow/array.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e39a8cba02e..a09fdecb7c7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3128,6 +3128,7 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ + def to_numpy_ndarray(self): """ Convert fixed shape tensor extension array to a numpy array (with dim+1). 
From 1bdba1d6e13912536719d81eed51582b3aa9405c Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 08:23:34 +0200 Subject: [PATCH 03/24] Add pa.fixedshapetensor factory function and update docstring examples --- python/pyarrow/__init__.py | 1 + python/pyarrow/types.pxi | 169 ++++++++++++++++++++++--------------- 2 files changed, 104 insertions(+), 66 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index d98e490fe53..04007716c98 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -170,6 +170,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, + fixedshapetensor, field, type_for_alias, DataType, DictionaryType, StructType, diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 92b377490b8..52bff6179fe 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1499,78 +1499,23 @@ cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. - Parameters - ---------- - value_type : DataType - Data type of individual tensor elements. - shape : tuple - The physical shape of the contained tensors. - dim_names : tuple - Explicit names to tensor dimensions. - permutation : tuple - Indices of the desired ordering of the original dimensions. 
- Examples -------- - >>> import pyarrow as pa - - Create fixed shape tensor extension type: + Create an instance of fixed shape tensor extension type: - >>> tensor_type = pa.FixedShapeTensorType(pa.int32(), [2, 2]) - >>> tensor_type + >>> import pyarrow as pa + >>> pa.fixedshapetensor(pa.int32(), [2, 2]) FixedShapeTensorType(extension) - Inspect the data type: - - >>> tensor_type.value_type - DataType(int32) - >>> tensor_type.shape - [2, 2] - - Create a fixed shape tensor extension type with names of tensor dimensions: - - >>> tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W']) - >>> tensor_type.dim_names - [b'C', b'H', b'W'] + Create an instance of fixed shape tensor extension type with + permutation: - Create a fixed shape tensor extension type with permutation: - >>> tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3), + ... permutation=[0, 2, 1]) >>> tensor_type.permutation [0, 2, 1] """ - def __init__(self, DataType value_type, shape, dim_names=None, permutation=None): - """ - Initialize an fixed shape tensor extension type instance. - - This should be called at the end of the subclass' - ``__init__`` method. 
- """ - cdef: - vector[int64_t] c_shape - vector[int64_t] c_permutation - vector[c_string] c_dim_names - shared_ptr[CDataType] tensor_ext_type - - assert value_type is not None - assert shape is not None - - for i in shape: - c_shape.push_back(i) - - if permutation is not None: - for i in permutation: - c_permutation.push_back(i) - - if dim_names is not None: - for x in dim_names: - c_dim_names.push_back(tobytes(x)) - - tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( - value_type.sp_type, c_shape, c_permutation, c_dim_names)) - - self.init(tensor_ext_type) - cdef void init(self, const shared_ptr[CDataType]& type) except *: BaseExtensionType.init(self, type) self.tensor_ext_type = type.get() @@ -1607,8 +1552,7 @@ cdef class FixedShapeTensorType(BaseExtensionType): """ Serialized representation of metadata to reconstruct the type object. """ - metadata = self.tensor_ext_type.Serialize() - return metadata + return self.tensor_ext_type.Serialize() @classmethod def __arrow_ext_deserialize__(self, storage_type, serialized): @@ -1616,8 +1560,7 @@ cdef class FixedShapeTensorType(BaseExtensionType): Return an FixedShapeTensor type instance from the storage type and serialized metadata. """ - tensor_ext_type = self.tensor_ext_type.Deserialize(storage_type, serialized) - return tensor_ext_type + return self.tensor_ext_type.Deserialize(storage_type, serialized) def __arrow_ext_class__(self): return FixedShapeTensorArray @@ -4672,6 +4615,100 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def fixedshapetensor(DataType value_type, shape, dim_names=None, permutation=None): + """ + Create instance of fixed shape tensor extension type with shape and optional + names of tensor dimensions and indices of the desired ordering. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + shape : tuple + The physical shape of the contained tensors. 
+ dim_names : tuple, default None + Explicit names to tensor dimensions. + permutation : tuple, default None + Indices of the desired ordering of the original dimensions. + + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.fixedshapetensor(pa.int32(), [2, 2]) + >>> tensor_type + FixedShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.shape + [2, 2] + + Create a table with fixed shape tensor extension array: + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] + + Create an instance of fixed shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3), + ... dim_names=['C', 'H', 'W']) + >>> tensor_type.dim_names + [b'C', b'H', b'W'] + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3), + ... 
permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : FixedShapeTensorType + """ + + cdef: + vector[int64_t] c_shape + vector[int64_t] c_permutation + vector[c_string] c_dim_names + shared_ptr[CDataType] c_tensor_ext_type + + assert value_type is not None + assert shape is not None + + for i in shape: + c_shape.push_back(i) + + if permutation is not None: + for i in permutation: + c_permutation.push_back(i) + + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + cdef FixedShapeTensorType out = FixedShapeTensorType.__new__(FixedShapeTensorType) + + c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make( + value_type.sp_type, c_shape, c_permutation, c_dim_names)) + + out.init(c_tensor_ext_type) + + return out + + cdef dict _type_aliases = { 'null': null, 'bool': bool_, From 7c395b0bc6f21efe17db08e1d3ea091a2c02e903 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 09:09:34 +0200 Subject: [PATCH 04/24] Apply suggestions from code review - Joris Co-authored-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a09fdecb7c7..b25235009af 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3143,8 +3143,7 @@ class FixedShapeTensorArray(ExtensionArray): """ Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. """ - numpy_type = obj.flatten().dtype - arrow_type = from_numpy_dtype(numpy_type) + arrow_type = from_numpy_dtype(obj.dtype) shape = obj.shape[1:] size = obj.size / obj.shape[0] From d27d48fcbf843ce048a003cc963ecc67956c6ed0 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 09:52:38 +0200 Subject: [PATCH 05/24] Use pa.FixedSizeListArray.from_arrays(..) 
in from_numpy_ndarray() --- python/pyarrow/array.pxi | 5 ++- python/pyarrow/tests/test_extension_type.py | 36 +++++---------------- 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b25235009af..fb692fa3b03 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3148,9 +3148,8 @@ class FixedShapeTensorArray(ExtensionArray): size = obj.size / obj.shape[0] return ExtensionArray.from_storage( - FixedShapeTensorType(arrow_type, shape), - array([t.flatten() for t in obj], - list_(arrow_type, size)) + fixedshapetensor(arrow_type, shape), + FixedSizeListArray.from_arrays(obj.flatten(), size) ) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ac27690f0a7..0e745e038b0 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1130,13 +1130,13 @@ def test_cpp_extension_in_python(tmpdir): def test_tensor_type(): - tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 3)) + tensor_type = pa.fixedshapetensor(pa.int8(), (2, 3)) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.int8(), 6) def test_tensor_class_methods(): - tensor_type = pa.FixedShapeTensorType(pa.float32(), (2, 3)) + tensor_type = pa.fixedshapetensor(pa.float32(), (2, 3)) storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], pa.list_(pa.float32(), 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) @@ -1153,9 +1153,9 @@ def test_tensor_class_methods(): @pytest.mark.parametrize("tensor_type", ( - pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)), - pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]), - pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W']) + pa.fixedshapetensor(pa.int8(), (2, 2, 3)), + pa.fixedshapetensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]), + pa.fixedshapetensor(pa.int8(), (2, 2, 3), 
dim_names=['C', 'H', 'W']) )) def test_tensor_type_ipc(tensor_type): storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) @@ -1181,32 +1181,12 @@ def test_tensor_type_ipc(tensor_type): assert result.type.value_type == pa.int8() assert result.type.shape == [2, 2, 3] - # using different parametrization as how it was registered - tensor_type_uint = tensor_type.__class__(pa.uint8(), (2, 3)) - assert tensor_type_uint.extension_name == "arrow.fixed_shape_tensor" - assert tensor_type_uint.value_type == pa.uint8() - assert tensor_type_uint.shape == [2, 3] - - storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], - pa.list_(pa.uint8(), 6)) - arr = pa.ExtensionArray.from_storage(tensor_type_uint, storage) - batch = pa.RecordBatch.from_arrays([arr], ["ext"]) - - buf = ipc_write_batch(batch) - del batch - batch = ipc_read_batch(buf) - result = batch.column(0) - assert isinstance(result.type, pa.FixedShapeTensorType) - assert result.type.value_type == pa.uint8() - assert result.type.shape == [2, 3] - assert type(result) == tensor_class - def test_tensor_type_equality(): - tensor_type = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)) + tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3)) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" - tensor_type2 = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)) - tensor_type3 = pa.FixedShapeTensorType(pa.uint8(), (2, 2, 3)) + tensor_type2 = pa.fixedshapetensor(pa.int8(), (2, 2, 3)) + tensor_type3 = pa.fixedshapetensor(pa.uint8(), (2, 2, 3)) assert tensor_type == tensor_type2 assert not tensor_type == tensor_type3 From 8e790b468972fdabe8937f1a0115f407bc49ad03 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 09:57:28 +0200 Subject: [PATCH 06/24] Change fixedshapetensor to fixed_shape_tensor --- python/pyarrow/__init__.py | 2 +- python/pyarrow/array.pxi | 2 +- python/pyarrow/tests/test_extension_type.py | 16 ++++++++-------- python/pyarrow/types.pxi | 18 +++++++++--------- 4 
files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 04007716c98..3ddfd36a8e9 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -170,7 +170,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - fixedshapetensor, + fixed_shape_tensor, field, type_for_alias, DataType, DictionaryType, StructType, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index fb692fa3b03..69d73f55e3a 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3148,7 +3148,7 @@ class FixedShapeTensorArray(ExtensionArray): size = obj.size / obj.shape[0] return ExtensionArray.from_storage( - fixedshapetensor(arrow_type, shape), + fixed_shape_tensor(arrow_type, shape), FixedSizeListArray.from_arrays(obj.flatten(), size) ) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 0e745e038b0..543d3000fe8 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1130,13 +1130,13 @@ def test_cpp_extension_in_python(tmpdir): def test_tensor_type(): - tensor_type = pa.fixedshapetensor(pa.int8(), (2, 3)) + tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 3)) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.int8(), 6) def test_tensor_class_methods(): - tensor_type = pa.fixedshapetensor(pa.float32(), (2, 3)) + tensor_type = pa.fixed_shape_tensor(pa.float32(), (2, 3)) storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], pa.list_(pa.float32(), 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) @@ -1153,9 +1153,9 @@ def test_tensor_class_methods(): @pytest.mark.parametrize("tensor_type", ( - pa.fixedshapetensor(pa.int8(), (2, 2, 3)), - pa.fixedshapetensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]), - pa.fixedshapetensor(pa.int8(), (2, 2, 3), 
dim_names=['C', 'H', 'W']) + pa.fixed_shape_tensor(pa.int8(), (2, 2, 3)), + pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]), + pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W']) )) def test_tensor_type_ipc(tensor_type): storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) @@ -1183,10 +1183,10 @@ def test_tensor_type_ipc(tensor_type): def test_tensor_type_equality(): - tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3)) + tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3)) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" - tensor_type2 = pa.fixedshapetensor(pa.int8(), (2, 2, 3)) - tensor_type3 = pa.fixedshapetensor(pa.uint8(), (2, 2, 3)) + tensor_type2 = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3)) + tensor_type3 = pa.fixed_shape_tensor(pa.uint8(), (2, 2, 3)) assert tensor_type == tensor_type2 assert not tensor_type == tensor_type3 diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 52bff6179fe..cb1b9687650 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1504,14 +1504,14 @@ cdef class FixedShapeTensorType(BaseExtensionType): Create an instance of fixed shape tensor extension type: >>> import pyarrow as pa - >>> pa.fixedshapetensor(pa.int32(), [2, 2]) + >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) FixedShapeTensorType(extension) Create an instance of fixed shape tensor extension type with permutation: - >>> tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3), - ... permutation=[0, 2, 1]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), + ... 
permutation=[0, 2, 1]) >>> tensor_type.permutation [0, 2, 1] """ @@ -4615,7 +4615,7 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) -def fixedshapetensor(DataType value_type, shape, dim_names=None, permutation=None): +def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional names of tensor dimensions and indices of the desired ordering. @@ -4636,7 +4636,7 @@ def fixedshapetensor(DataType value_type, shape, dim_names=None, permutation=Non Create an instance of fixed shape tensor extension type: >>> import pyarrow as pa - >>> tensor_type = pa.fixedshapetensor(pa.int32(), [2, 2]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) >>> tensor_type FixedShapeTensorType(extension) @@ -4661,16 +4661,16 @@ def fixedshapetensor(DataType value_type, shape, dim_names=None, permutation=Non Create an instance of fixed shape tensor extension type with names of tensor dimensions: - >>> tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3), - ... dim_names=['C', 'H', 'W']) + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), + ... dim_names=['C', 'H', 'W']) >>> tensor_type.dim_names [b'C', b'H', b'W'] Create an instance of fixed shape tensor extension type with permutation: - >>> tensor_type = pa.fixedshapetensor(pa.int8(), (2, 2, 3), - ... permutation=[0, 2, 1]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), + ... 
permutation=[0, 2, 1]) >>> tensor_type.permutation [0, 2, 1] From 64e0cd084b72d06bfbabe440e2fdab155fc2420c Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 10:12:36 +0200 Subject: [PATCH 07/24] Add tests for all the custom attributes --- python/pyarrow/tests/test_extension_type.py | 36 ++++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 543d3000fe8..99ea4d2b570 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -25,6 +25,7 @@ import numpy as np import pyarrow as pa +from pyarrow.lib import tobytes import pytest @@ -1130,13 +1131,32 @@ def test_cpp_extension_in_python(tmpdir): def test_tensor_type(): - tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 3)) + tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.int8(), 6) + assert tensor_type.shape == [2, 3] + assert not tensor_type.dim_names + assert not tensor_type.permutation + + tensor_type = pa.fixed_shape_tensor(pa.float64(), [2, 2, 3], + permutation=[0, 2, 1]) + assert tensor_type.extension_name == "arrow.fixed_shape_tensor" + assert tensor_type.storage_type == pa.list_(pa.float64(), 12) + assert tensor_type.shape == [2, 2, 3] + assert not tensor_type.dim_names + assert tensor_type.permutation == [0, 2, 1] + + tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], + dim_names=['C', 'H', 'W']) + assert tensor_type.extension_name == "arrow.fixed_shape_tensor" + assert tensor_type.storage_type == pa.list_(pa.bool_(), 12) + assert tensor_type.shape == [2, 2, 3] + assert tensor_type.dim_names == [tobytes(x) for x in ['C', 'H', 'W']] + assert not tensor_type.permutation def test_tensor_class_methods(): - tensor_type = pa.fixed_shape_tensor(pa.float32(), (2, 3)) + tensor_type = 
pa.fixed_shape_tensor(pa.float32(), [2, 3]) storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], pa.list_(pa.float32(), 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) @@ -1153,9 +1173,9 @@ def test_tensor_class_methods(): @pytest.mark.parametrize("tensor_type", ( - pa.fixed_shape_tensor(pa.int8(), (2, 2, 3)), - pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]), - pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W']) + pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), + pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]), + pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], dim_names=['C', 'H', 'W']) )) def test_tensor_type_ipc(tensor_type): storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) @@ -1183,10 +1203,10 @@ def test_tensor_type_ipc(tensor_type): def test_tensor_type_equality(): - tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3)) + tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" - tensor_type2 = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3)) - tensor_type3 = pa.fixed_shape_tensor(pa.uint8(), (2, 2, 3)) + tensor_type2 = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]) + tensor_type3 = pa.fixed_shape_tensor(pa.uint8(), [2, 2, 3]) assert tensor_type == tensor_type2 assert not tensor_type == tensor_type3 From 48cbeb3c1d1b6a7f72d3747f744d2a4c39f29ea6 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 10:26:16 +0200 Subject: [PATCH 08/24] Add test for numpy F-contiguous --- python/pyarrow/tests/test_extension_type.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 99ea4d2b570..1af0b9e9914 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1155,7 +1155,8 @@ def test_tensor_type(): assert not 
tensor_type.permutation -def test_tensor_class_methods(): +@pytest.mark.parametrize("numpy_order", ('C', 'F')) +def test_tensor_class_methods(numpy_order): tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], pa.list_(pa.float32(), 6)) @@ -1166,7 +1167,10 @@ def test_tensor_class_methods(): result = arr.to_numpy_ndarray() np.testing.assert_array_equal(result, expected) - tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(expected) + arr = np.array( + [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], + dtype=np.float32, order=numpy_order) + tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) assert tensor_array_from_numpy.type.value_type == pa.float32() assert tensor_array_from_numpy.type.shape == [2, 3] From d9ca165982ba31f43f5e0b91f32fa6af17f8fedd Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 10:42:42 +0200 Subject: [PATCH 09/24] Correct dim_names() to return list of strings, not bytes --- python/pyarrow/tests/test_extension_type.py | 3 +-- python/pyarrow/types.pxi | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 1af0b9e9914..4c76faba60e 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -25,7 +25,6 @@ import numpy as np import pyarrow as pa -from pyarrow.lib import tobytes import pytest @@ -1151,7 +1150,7 @@ def test_tensor_type(): assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.bool_(), 12) assert tensor_type.shape == [2, 2, 3] - assert tensor_type.dim_names == [tobytes(x) for x in ['C', 'H', 'W']] + assert tensor_type.dim_names == ['C', 'H', 'W'] assert not tensor_type.permutation diff --git a/python/pyarrow/types.pxi 
b/python/pyarrow/types.pxi index cb1b9687650..15b6f3d1232 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1539,7 +1539,8 @@ cdef class FixedShapeTensorType(BaseExtensionType): """ Explicit names of the dimensions. """ - return self.tensor_ext_type.dim_names() + result_list = self.tensor_ext_type.dim_names() + return [frombytes(x) for x in result_list] @property def permutation(self): @@ -4624,11 +4625,11 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N ---------- value_type : DataType Data type of individual tensor elements. - shape : tuple + shape : tuple or list of integers The physical shape of the contained tensors. - dim_names : tuple, default None + dim_names : tuple or list of strings, default None Explicit names to tensor dimensions. - permutation : tuple, default None + permutation : tuple or list integers, default None Indices of the desired ordering of the original dimensions. Examples From d3530af33365f62d4436b0c29bedefa5fbc5ef08 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 10:57:10 +0200 Subject: [PATCH 10/24] Correct dim_names and permutation methods to return None and not empty list --- python/pyarrow/tests/test_extension_type.py | 8 ++++---- python/pyarrow/types.pxi | 13 ++++++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 4c76faba60e..a9104bf6bf6 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1134,15 +1134,15 @@ def test_tensor_type(): assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.int8(), 6) assert tensor_type.shape == [2, 3] - assert not tensor_type.dim_names - assert not tensor_type.permutation + assert tensor_type.dim_names is None + assert tensor_type.permutation is None tensor_type = pa.fixed_shape_tensor(pa.float64(), 
[2, 2, 3], permutation=[0, 2, 1]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" assert tensor_type.storage_type == pa.list_(pa.float64(), 12) assert tensor_type.shape == [2, 2, 3] - assert not tensor_type.dim_names + assert tensor_type.dim_names is None assert tensor_type.permutation == [0, 2, 1] tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], @@ -1151,7 +1151,7 @@ def test_tensor_type(): assert tensor_type.storage_type == pa.list_(pa.bool_(), 12) assert tensor_type.shape == [2, 2, 3] assert tensor_type.dim_names == ['C', 'H', 'W'] - assert not tensor_type.permutation + assert tensor_type.permutation is None @pytest.mark.parametrize("numpy_order", ('C', 'F')) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 15b6f3d1232..adbc69f05fa 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1539,15 +1539,22 @@ cdef class FixedShapeTensorType(BaseExtensionType): """ Explicit names of the dimensions. """ - result_list = self.tensor_ext_type.dim_names() - return [frombytes(x) for x in result_list] + list_of_bytes = self.tensor_ext_type.dim_names() + if len(list_of_bytes) != 0: + return [frombytes(x) for x in list_of_bytes] + else: + return None @property def permutation(self): """ Indices of the dimensions ordering. 
""" - return self.tensor_ext_type.permutation() + indices = self.tensor_ext_type.permutation() + if len(indices) != 0: + return indices + else: + return None def __arrow_ext_serialize__(self): """ From e2ce8ba5abd88d67d78e53cfb8b97805a414a4ee Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 15:21:13 +0200 Subject: [PATCH 11/24] Replace FixedShapeTensorType with fixed_shape_tensor in FixedShapeTensorArray --- python/pyarrow/array.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 69d73f55e3a..f8081c243cb 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3099,7 +3099,7 @@ class FixedShapeTensorArray(ExtensionArray): Define the extension type for tensor array >>> import pyarrow as pa - >>> tensor_type = FixedShapeTensorType(pa.int32(), [2, 2]) + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) Create an extension array From ee5d25c34a63c96056c6d20d0d7bc4f9b1b0eba1 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 15:46:06 +0200 Subject: [PATCH 12/24] Update from_numpy_ndarray docstrings --- python/pyarrow/array.pxi | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f8081c243cb..7eab7b9b3fa 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3142,6 +3142,43 @@ class FixedShapeTensorArray(ExtensionArray): def from_numpy_ndarray(obj): """ Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + The first dimension of ndarray will become the length of the fixed + shape tensor array. + + For correct results, Numpy array needs to be C-contiguous in memory + (``obj.flags["C_CONTIGUOUS"]==True``). + + Parameters + ---------- + obj : ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array( + ... [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], + ... 
dtype=np.float32) + >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 1, + 2, + 3, + 4, + 5, + 6 + ] + ] """ arrow_type = from_numpy_dtype(obj.dtype) shape = obj.shape[1:] From f5a5c0c6e743f4b87e051746693ace61222162cf Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 15:49:25 +0200 Subject: [PATCH 13/24] Update public-api.pxi --- python/pyarrow/public-api.pxi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 91d5832754f..72e16f2cec3 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -76,7 +76,6 @@ cdef api object pyarrow_wrap_data_type( cdef: const CExtensionType* ext_type const CPyExtensionType* cpy_ext_type - c_string tensor_name = tobytes("arrow.fixed_shape_tensor") DataType out if type.get() == NULL: @@ -119,7 +118,7 @@ cdef api object pyarrow_wrap_data_type( cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == tensor_name: + elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) else: out = BaseExtensionType.__new__(BaseExtensionType) From 52f9e7e8fc653a4c3565d36e39d23aeebbf1f324 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 15:50:44 +0200 Subject: [PATCH 14/24] Update python/pyarrow/types.pxi Co-authored-by: Joris Van den Bossche --- python/pyarrow/types.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index adbc69f05fa..e6a379eafdf 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4672,7 +4672,7 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), ... 
dim_names=['C', 'H', 'W']) >>> tensor_type.dim_names - [b'C', b'H', b'W'] + ['C', 'H', 'W'] Create an instance of fixed shape tensor extension type with permutation: From b171d008a8da69ef03642aa230c59f0a54fda1aa Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 17:27:22 +0200 Subject: [PATCH 15/24] Use ravel instead of flatten and raise ValueError if numpy array is not C-contiguous in memory --- python/pyarrow/array.pxi | 16 ++++++++++------ python/pyarrow/tests/test_extension_type.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 935ecba5ab6..07b831cd942 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3131,7 +3131,7 @@ class FixedShapeTensorArray(ExtensionArray): The first dimension of ndarray will become the length of the fixed shape tensor array. - For correct results, Numpy array needs to be C-contiguous in memory + Numpy array needs to be C-contiguous in memory (``obj.flags["C_CONTIGUOUS"]==True``). 
Parameters @@ -3148,31 +3148,35 @@ class FixedShapeTensorArray(ExtensionArray): >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) [ - [ + [ 1, 2, 3, 4, 5, 6 - ], - [ + ], + [ 1, 2, 3, 4, 5, 6 - ] + ] ] """ + if obj.flags["F_CONTIGUOUS"]: + raise ValueError('The data in the numpy array need to be in a single, ' + 'C-style contiguous segment.') + arrow_type = from_numpy_dtype(obj.dtype) shape = obj.shape[1:] size = obj.size / obj.shape[0] return ExtensionArray.from_storage( fixed_shape_tensor(arrow_type, shape), - FixedSizeListArray.from_arrays(obj.flatten(), size) + FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size) ) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index f5ca0afb170..3a0b7ee6e73 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1169,26 +1169,30 @@ def test_tensor_type(): assert tensor_type.permutation is None -@pytest.mark.parametrize("numpy_order", ('C', 'F')) -def test_tensor_class_methods(numpy_order): +def test_tensor_class_methods(): tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]], pa.list_(pa.float32(), 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) - result = arr.to_numpy_ndarray() np.testing.assert_array_equal(result, expected) arr = np.array( [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], - dtype=np.float32, order=numpy_order) + dtype=np.float32, order="C") tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) assert tensor_array_from_numpy.type.value_type == pa.float32() assert tensor_array_from_numpy.type.shape == [2, 3] + arr = np.array( + [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], + dtype=np.float32, order="F") + with 
pytest.raises(ValueError, match="C-style contiguous segment"): + pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), From c0ec94cf34adbc8ff277dd1b5424c3a619249852 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 5 Apr 2023 17:31:55 +0200 Subject: [PATCH 16/24] Remove CFixedShapeTensorType binding in libarrow --- python/pyarrow/includes/libarrow.pxd | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 09b8add5427..9df90ec60aa 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2639,10 +2639,6 @@ cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extens const vector[int64_t] permutation() const vector[c_string] dim_names() - CFixedShapeTensorType(shared_ptr[CDataType]& value_type, int32_t& size, - vector[int64_t]& shape, vector[int64_t]& permutation, - vector[c_string]& dim_names) - cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": From f2d9fe77ec09e7577873173beceb66ca99be0a01 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 6 Apr 2023 10:07:52 +0200 Subject: [PATCH 17/24] Fix doctest failure --- python/pyarrow/array.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 07b831cd942..109e67aebe7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3146,7 +3146,7 @@ class FixedShapeTensorArray(ExtensionArray): ... [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], ... 
dtype=np.float32) >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - + [ [ 1, From 8b5dc93eb1f9cb9e57d9326deee28d733ee1e0cf Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 6 Apr 2023 10:15:59 +0200 Subject: [PATCH 18/24] Add explanation of permutation from the spec to the docstring of fixed_shape_tensor --- docs/source/format/CanonicalExtensions.rst | 2 ++ python/pyarrow/types.pxi | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 92dc1b2db98..5dd269ee5c6 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -72,6 +72,8 @@ same rules as laid out above, and provide backwards compatibility guarantees. Official List ============= +.. _fixed_shape_tensor_extension: + Fixed shape tensor ================== diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e6a379eafdf..8050c70f9dd 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4638,6 +4638,11 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N Explicit names to tensor dimensions. permutation : tuple or list integers, default None Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`fixed_shape_tensor_extension`. 
Examples -------- From 570f086f54ce1259973727622d9422c0f346fec1 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 6 Apr 2023 10:26:24 +0200 Subject: [PATCH 19/24] from_numpy_ndarray should be a static method --- python/pyarrow/array.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 109e67aebe7..0b71aff213c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3125,6 +3125,7 @@ class FixedShapeTensorArray(ExtensionArray): return numpy_tensor + @staticmethod def from_numpy_ndarray(obj): """ Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. From 3dbbe20ca4d92b8329e7e5fb04753ceb74579c85 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 6 Apr 2023 12:26:47 +0200 Subject: [PATCH 20/24] Apply suggestions from code review Co-authored-by: Joris Van den Bossche Co-authored-by: Rok Mihevc --- python/pyarrow/array.pxi | 3 +-- python/pyarrow/types.pxi | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 0b71aff213c..d60999cdb08 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3120,8 +3120,7 @@ class FixedShapeTensorArray(ExtensionArray): Convert fixed shape tensor extension array to a numpy array (with dim+1). 
""" np_flat = np.asarray(self.storage.values) - numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape), - order='C') + numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) return numpy_tensor diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 8050c70f9dd..4ad2536368e 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4626,7 +4626,8 @@ def run_end_encoded(run_end_type, value_type): def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional - names of tensor dimensions and indices of the desired ordering. + names of tensor dimensions and indices of the desired logical + ordering of dimensions. Parameters ---------- From 223968a9ec871a1702eb23465a9d3bf34d53e24e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 6 Apr 2023 12:42:19 +0200 Subject: [PATCH 21/24] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index d60999cdb08..46e4f4d3660 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3136,7 +3136,7 @@ class FixedShapeTensorArray(ExtensionArray): Parameters ---------- - obj : ndarray + obj : numpy.ndarray Examples -------- @@ -3166,7 +3166,7 @@ class FixedShapeTensorArray(ExtensionArray): ] ] """ - if obj.flags["F_CONTIGUOUS"]: + if not obj.flags["C_CONTIGUOUS"]: raise ValueError('The data in the numpy array need to be in a single, ' 'C-style contiguous segment.') From dd8fd31f4b15ca2e0a39673ab2a338b991cd2032 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 7 Apr 2023 11:54:24 +0200 Subject: [PATCH 22/24] Update to_numpy_ndarraydocstring --- python/pyarrow/array.pxi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 
46e4f4d3660..840150edc53 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3118,6 +3118,9 @@ class FixedShapeTensorArray(ExtensionArray): def to_numpy_ndarray(self): """ Convert fixed shape tensor extension array to a numpy array (with dim+1). + + Note: The method doesn't take into account ``permutation`` and ``dim_names`` + parameter therefore this information will be lost. """ np_flat = np.asarray(self.storage.values) numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) From 1ebb8297773494813f5515215cb869eb0ea959f6 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 11 Apr 2023 13:58:34 +0200 Subject: [PATCH 23/24] Add a check for non-trivial permutation in to_numpy_ndarray --- python/pyarrow/array.pxi | 14 ++++++++------ python/pyarrow/tests/test_extension_type.py | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 840150edc53..a6481bc7c03 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3119,13 +3119,15 @@ class FixedShapeTensorArray(ExtensionArray): """ Convert fixed shape tensor extension array to a numpy array (with dim+1). - Note: The method doesn't take into account ``permutation`` and ``dim_names`` - parameter therefore this information will be lost. + Note: ``permutation`` should be non-trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). 
""" - np_flat = np.asarray(self.storage.values) - numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) - - return numpy_tensor + if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): + np_flat = np.asarray(self.storage.values) + numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape)) + return numpy_tensor + else: + raise ValueError( + 'Only non-permuted tensors can be converted to numpy tensors.') @staticmethod def from_numpy_ndarray(obj): diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 3a0b7ee6e73..e6268823aad 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1193,6 +1193,12 @@ def test_tensor_class_methods(): with pytest.raises(ValueError, match="C-style contiguous segment"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]) + storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12)) + arr = pa.ExtensionArray.from_storage(tensor_type, storage) + with pytest.raises(ValueError, match="non-permuted tensors"): + arr.to_numpy_ndarray() + @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), From b2d0453aaec5bfddaf2103b8287eab9240d53478 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 11 Apr 2023 14:32:55 +0200 Subject: [PATCH 24/24] Update python/pyarrow/array.pxi Co-authored-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a6481bc7c03..b62339212e8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3119,7 +3119,7 @@ class FixedShapeTensorArray(ExtensionArray): """ Convert fixed shape tensor extension array to a numpy array (with dim+1). 
- Note: ``permutation`` should be non-trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). + Note: ``permutation`` should be trivial (``None`` or ``[0, 1, ..., len(shape)-1]``). """ if self.type.permutation is None or self.type.permutation == list(range(len(self.type.shape))): np_flat = np.asarray(self.storage.values)