class FixedShapeTensorArray(ExtensionArray):
    """
    Concrete class for fixed shape tensor extension arrays.

    Examples
    --------
    Define the extension type for tensor array

    >>> import pyarrow as pa
    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])

    Create an extension array

    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
    >>> pa.ExtensionArray.from_storage(tensor_type, storage)
    <pyarrow.lib.FixedShapeTensorArray object at ...>
    [
      [
        1,
        2,
        3,
        4
      ],
      [
        10,
        20,
        30,
        40
      ],
      [
        100,
        200,
        300,
        400
      ]
    ]
    """

    def to_numpy_ndarray(self):
        """
        Convert fixed shape tensor extension array to a numpy array (with dim+1).

        Note: ``permutation`` should be trivial (``None`` or
        ``[0, 1, ..., len(shape)-1]``).

        Returns
        -------
        numpy.ndarray
            The flat storage values reshaped to
            ``(len(self),) + tuple(self.type.shape)``.

        Raises
        ------
        ValueError
            If the extension type carries a non-trivial permutation.
        """
        tensor_shape = self.type.shape
        permutation = self.type.permutation
        trivial = list(range(len(tensor_shape)))
        if permutation is not None and permutation != trivial:
            raise ValueError(
                'Only non-permuted tensors can be converted to numpy tensors.')

        np_flat = np.asarray(self.storage.values)
        return np_flat.reshape((len(self),) + tuple(tensor_shape))

    @staticmethod
    def from_numpy_ndarray(obj):
        """
        Convert numpy tensors (ndarrays) to a fixed shape tensor extension array.
        The first dimension of ndarray will become the length of the fixed
        shape tensor array.

        Numpy array needs to be C-contiguous in memory
        (``obj.flags["C_CONTIGUOUS"]==True``).

        Parameters
        ----------
        obj : numpy.ndarray
            Array with at least one dimension. ``obj.shape[0]`` becomes the
            length of the result and ``obj.shape[1:]`` the tensor shape.

        Returns
        -------
        FixedShapeTensorArray

        Raises
        ------
        ValueError
            If ``obj`` has no dimensions or is not C-contiguous.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> arr = np.array(
        ...     [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
        ...     dtype=np.float32)
        >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
        <pyarrow.lib.FixedShapeTensorArray object at ...>
        [
          [
            1,
            2,
            3,
            4,
            5,
            6
          ],
          [
            1,
            2,
            3,
            4,
            5,
            6
          ]
        ]
        """
        # A 0-d array has no leading axis to turn into the array length.
        if obj.ndim < 1:
            raise ValueError(
                'Cannot convert a 0-dimensional numpy array to a fixed shape '
                'tensor array.')

        if not obj.flags["C_CONTIGUOUS"]:
            raise ValueError('The data in the numpy array need to be in a single, '
                             'C-style contiguous segment.')

        arrow_type = from_numpy_dtype(obj.dtype)
        shape = obj.shape[1:]

        # Elements per tensor, as an exact Python int. Derived from the
        # trailing dimensions rather than ``obj.size / obj.shape[0]`` so the
        # list size is never a float and a zero-length leading axis does not
        # divide by zero.
        size = 1
        for dim in shape:
            size *= dim

        return ExtensionArray.from_storage(
            fixed_shape_tensor(arrow_type, shape),
            FixedSizeListArray.from_arrays(np.ravel(obj, order='C'), size)
        )
def test_tensor_type():
    # Default construction: no dimension names, no permutation.
    tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
    assert tensor_type.storage_type == pa.list_(pa.int8(), 6)
    assert tensor_type.shape == [2, 3]
    assert tensor_type.dim_names is None
    assert tensor_type.permutation is None

    # Explicit permutation.
    tensor_type = pa.fixed_shape_tensor(pa.float64(), [2, 2, 3],
                                        permutation=[0, 2, 1])
    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
    assert tensor_type.storage_type == pa.list_(pa.float64(), 12)
    assert tensor_type.shape == [2, 2, 3]
    assert tensor_type.dim_names is None
    assert tensor_type.permutation == [0, 2, 1]

    # Explicit dimension names.
    tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3],
                                        dim_names=['C', 'H', 'W'])
    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
    assert tensor_type.storage_type == pa.list_(pa.bool_(), 12)
    assert tensor_type.shape == [2, 2, 3]
    assert tensor_type.dim_names == ['C', 'H', 'W']
    assert tensor_type.permutation is None


def test_tensor_class_methods():
    # Arrow -> numpy
    tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3])
    storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
                       pa.list_(pa.float32(), 6))
    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
    expected = np.array(
        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
    result = arr.to_numpy_ndarray()
    np.testing.assert_array_equal(result, expected)

    # numpy (C order) -> Arrow
    arr = np.array(
        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
        dtype=np.float32, order="C")
    tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)
    assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType)
    assert tensor_array_from_numpy.type.value_type == pa.float32()
    assert tensor_array_from_numpy.type.shape == [2, 3]

    # Fortran-ordered input is rejected.
    arr = np.array(
        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
        dtype=np.float32, order="F")
    with pytest.raises(ValueError, match="C-style contiguous segment"):
        pa.FixedShapeTensorArray.from_numpy_ndarray(arr)

    # Permuted tensors cannot be converted to numpy.
    tensor_type = pa.fixed_shape_tensor(
        pa.int8(), [2, 2, 3], permutation=[0, 2, 1])
    storage = pa.array(
        [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12))
    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
    with pytest.raises(ValueError, match="non-permuted tensors"):
        arr.to_numpy_ndarray()


@pytest.mark.parametrize("tensor_type", (
    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]),
    pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], dim_names=['C', 'H', 'W'])
))
def test_tensor_type_ipc(tensor_type):
    storage = pa.array(
        [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12))
    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
    batch = pa.RecordBatch.from_arrays([arr], ["ext"])

    # check the built array has exactly the expected class
    tensor_class = tensor_type.__arrow_ext_class__()
    assert type(arr) is tensor_class

    buf = ipc_write_batch(batch)
    del batch
    batch = ipc_read_batch(buf)

    result = batch.column(0)
    # check the deserialized array class is the expected one
    assert type(result) is tensor_class
    assert result.type.extension_name == "arrow.fixed_shape_tensor"
    # Compare the data that came back from IPC, not the array we wrote.
    assert result.storage.to_pylist() == [
        [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]]

    # we get back an actual TensorType
    assert isinstance(result.type, pa.FixedShapeTensorType)
    assert result.type.value_type == pa.int8()
    assert result.type.shape == [2, 2, 3]


def test_tensor_type_equality():
    tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
    assert tensor_type.extension_name == "arrow.fixed_shape_tensor"

    # Same value type and shape compare equal; a different value type does not.
    tensor_type2 = pa.fixed_shape_tensor(pa.int8(), [2, 2, 3])
    tensor_type3 = pa.fixed_shape_tensor(pa.uint8(), [2, 2, 3])
    assert tensor_type == tensor_type2
    assert not tensor_type == tensor_type3
def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None):
    """
    Create instance of fixed shape tensor extension type with shape and optional
    names of tensor dimensions and indices of the desired logical
    ordering of dimensions.

    Parameters
    ----------
    value_type : DataType
        Data type of individual tensor elements.
    shape : tuple or list of integers
        The physical shape of the contained tensors.
    dim_names : tuple or list of strings, default None
        Explicit names to tensor dimensions.
    permutation : tuple or list of integers, default None
        Indices of the desired ordering of the original dimensions.
        The indices contain a permutation of the values ``[0, 1, .., N-1]`` where
        N is the number of dimensions. The permutation indicates which dimension
        of the logical layout corresponds to which dimension of the physical tensor.
        For more information on this parameter see
        :ref:`fixed_shape_tensor_extension`.

    Returns
    -------
    type : FixedShapeTensorType

    Raises
    ------
    TypeError
        If ``value_type`` or ``shape`` is None.

    Examples
    --------
    Create an instance of fixed shape tensor extension type:

    >>> import pyarrow as pa
    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2])
    >>> tensor_type
    FixedShapeTensorType(extension<arrow.fixed_shape_tensor>)

    Inspect the data type:

    >>> tensor_type.value_type
    DataType(int32)
    >>> tensor_type.shape
    [2, 2]

    Create a table with fixed shape tensor extension array:

    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
    >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage)
    >>> pa.table([tensor], names=["tensor_array"])
    pyarrow.Table
    tensor_array: extension<arrow.fixed_shape_tensor>
    ----
    tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]]

    Create an instance of fixed shape tensor extension type with names
    of tensor dimensions:

    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
    ...                                     dim_names=['C', 'H', 'W'])
    >>> tensor_type.dim_names
    ['C', 'H', 'W']

    Create an instance of fixed shape tensor extension type with
    permutation:

    >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3),
    ...                                     permutation=[0, 2, 1])
    >>> tensor_type.permutation
    [0, 2, 1]
    """

    cdef:
        vector[int64_t] c_shape
        vector[int64_t] c_permutation
        vector[c_string] c_dim_names
        shared_ptr[CDataType] c_tensor_ext_type

    # Validate with explicit raises rather than ``assert``: assertions can be
    # disabled, and ``value_type.sp_type`` below must never be read from None.
    if value_type is None:
        raise TypeError("value_type must be a DataType, not None")
    if shape is None:
        raise TypeError("shape must be a sequence of integers, not None")

    for i in shape:
        c_shape.push_back(i)

    # Empty vectors stand for "not specified" for both optional arguments.
    if permutation is not None:
        for i in permutation:
            c_permutation.push_back(i)

    if dim_names is not None:
        for x in dim_names:
            c_dim_names.push_back(tobytes(x))

    cdef FixedShapeTensorType out = FixedShapeTensorType.__new__(FixedShapeTensorType)

    # Make() validates the arguments on the C++ side; GetResultValue raises
    # the corresponding Python exception on failure.
    c_tensor_ext_type = GetResultValue(CFixedShapeTensorType.Make(
        value_type.sp_type, c_shape, c_permutation, c_dim_names))

    out.init(c_tensor_ext_type)

    return out