diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 12991b94aeb..4ae5d897917 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -1026,9 +1026,8 @@ struct SparseTensorEqualsImpl {
     const uint8_t* left_data = left.data()->data();
     const uint8_t* right_data = right.data()->data();
 
-    return memcmp(left_data, right_data,
-                  static_cast<size_t>(byte_width * left.non_zero_length()));
+    return memcmp(left_data, right_data,
+                  static_cast<size_t>(byte_width * left.non_zero_length())) == 0;
   }
 };
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index f7068b353be..515864ae287 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -25,8 +25,10 @@
 #include <vector>
 
 #include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/util/logging.h"
 
 #include "arrow/python/common.h"
 #include "arrow/python/pyarrow.h"
@@ -186,7 +188,9 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
 
 #undef TO_ARROW_TYPE_CASE
 
-Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>* out) {
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
   if (!PyArray_Check(ao)) {
     return Status::TypeError("Did not pass ndarray object");
   }
@@ -197,35 +201,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>*
 
   int ndim = PyArray_NDIM(ndarray);
 
-  // This is also holding the GIL, so don't already draw it.
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
   std::vector<int64_t> shape(ndim);
   std::vector<int64_t> strides(ndim);
-  {
-    PyAcquireGIL lock;
-    npy_intp* array_strides = PyArray_STRIDES(ndarray);
-    npy_intp* array_shape = PyArray_SHAPE(ndarray);
-    for (int i = 0; i < ndim; ++i) {
-      if (array_strides[i] < 0) {
-        return Status::Invalid("Negative ndarray strides not supported");
-      }
-      shape[i] = array_shape[i];
-      strides[i] = array_strides[i];
+  npy_intp* array_strides = PyArray_STRIDES(ndarray);
+  npy_intp* array_shape = PyArray_SHAPE(ndarray);
+  for (int i = 0; i < ndim; ++i) {
+    if (array_strides[i] < 0) {
+      return Status::Invalid("Negative ndarray strides not supported");
     }
-
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(
-        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
-    *out = std::make_shared<Tensor>(type, data, shape, strides);
-    return Status::OK();
+    shape[i] = array_shape[i];
+    strides[i] = array_strides[i];
   }
+
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  return Status::OK();
 }
 
 Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
                        PyObject** out) {
-  PyAcquireGIL lock;
-
   int type_num;
   RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
   PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
@@ -274,5 +272,140 @@ Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
   return Status::OK();
 }
 
+// Wrap the dense data of a sparse tensor in an ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
+                                        std::vector<npy_intp> data_shape,
+                                        PyObject* base, PyObject** out_data) {
+  int type_num_data;
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
+  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+  RETURN_IF_PYERROR();
+
+  const void* immutable_data = sparse_tensor.data()->data();
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+  int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
+  if (sparse_tensor.is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+                                   static_cast<int>(data_shape.size()),
+                                   data_shape.data(), nullptr, mutable_data,
+                                   array_flags, nullptr);
+  RETURN_IF_PYERROR();
+  Py_XINCREF(base);
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+  return Status::OK();
+}
+
+Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_coords) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+  // Wrap indices
+  PyObject* result_coords;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
+
+  *out_data = result_data.detach();
+  *out_coords = result_coords;
+  return Status::OK();
+}
+
+Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
+
+  // Wrap indices
+  OwnedRef result_indptr;
+  OwnedRef result_indices;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+
+  *out_data = result_data.detach();
+  *out_indptr = result_indptr.detach();
+  *out_indices = result_indices.detach();
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+                                 const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCOO>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<DataType> type_data;
+  RETURN_NOT_OK(GetTensorType(
+      reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), &type_data));
+
+  std::shared_ptr<Tensor> coords;
+  RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
+  ARROW_CHECK_EQ(coords->type_id(), Type::INT64);  // Should be ensured by caller
+
+  std::shared_ptr<SparseCOOIndex> sparse_index = std::make_shared<SparseCOOIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(coords));
+  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data,
+                                                            data, shape, dim_names);
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCSR>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+      !PyArray_Check(indices_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<DataType> type_data;
+  RETURN_NOT_OK(GetTensorType(
+      reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), &type_data));
+
+  std::shared_ptr<Tensor> indptr, indices;
+  RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
+  RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
+  ARROW_CHECK_EQ(indptr->type_id(), Type::INT64);   // Should be ensured by caller
+  ARROW_CHECK_EQ(indices->type_id(), Type::INT64);  // Should be ensured by caller
+
+  auto sparse_index = std::make_shared<SparseCSRIndex>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
+  *out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type_data,
+                                                            data, shape, dim_names);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCOO>* out) {
+  *out = std::make_shared<SparseTensorCOO>(*tensor);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCSR>* out) {
+  *out = std::make_shared<SparseTensorCSR>(*tensor);
+  return Status::OK();
+}
+
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index dce5fe522d6..5fa1326f52b 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -25,9 +25,11 @@
 #include <memory>
 #include <string>
+#include <vector>
 
 #include "arrow/buffer.h"
 #include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
 
 namespace arrow {
 
@@ -63,11 +65,38 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
 Status GetNumPyType(const DataType& type, int* type_num);
 
 ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           const std::vector<std::string>& dim_names,
                                            std::shared_ptr<Tensor>* out);
 
 ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
                                            PyObject* base, PyObject** out);
 
+ARROW_PYTHON_EXPORT Status
+SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO(
+    MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCOO>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCSR>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCOO>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCSR>* csparse_tensor);
+
 }  // namespace py
 }  // namespace arrow
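A note on the API above: the new `dim_names` argument threads through to Python via `Tensor.from_numpy` (added in `tensor.pxi` further down). A minimal sketch of the resulting Python-side behavior, assuming a pyarrow build with this patch applied:

```python
import numpy as np
import pyarrow as pa

# Dimension names ride along as tensor metadata.
arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
tensor = pa.Tensor.from_numpy(arr, dim_names=['row', 'col'])
assert tensor.dim_names == ['row', 'col']
assert tensor.dim_name(0) == 'row'
```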
diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc
index 1cedc549cfa..e037318bce2 100644
--- a/cpp/src/arrow/python/pyarrow.cc
+++ b/cpp/src/arrow/python/pyarrow.cc
@@ -123,6 +123,44 @@ PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor) {
   return ::pyarrow_wrap_tensor(tensor);
 }
 
+bool is_sparse_tensor_csr(PyObject* sparse_tensor) {
+  return ::pyarrow_is_sparse_tensor_csr(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_csr(PyObject* sparse_tensor,
+                                std::shared_ptr<SparseTensorCSR>* out) {
+  *out = ::pyarrow_unwrap_sparse_tensor_csr(sparse_tensor);
+  if (*out) {
+    return Status::OK();
+  } else {
+    return Status::Invalid(
+        "Could not unwrap SparseTensorCSR from the passed Python object.");
+  }
+}
+
+PyObject* wrap_sparse_tensor_csr(const std::shared_ptr<SparseTensorCSR>& sparse_tensor) {
+  return ::pyarrow_wrap_sparse_tensor_csr(sparse_tensor);
+}
+
+bool is_sparse_tensor_coo(PyObject* sparse_tensor) {
+  return ::pyarrow_is_sparse_tensor_coo(sparse_tensor) != 0;
+}
+
+Status unwrap_sparse_tensor_coo(PyObject* sparse_tensor,
+                                std::shared_ptr<SparseTensorCOO>* out) {
+  *out = ::pyarrow_unwrap_sparse_tensor_coo(sparse_tensor);
+  if (*out) {
+    return Status::OK();
+  } else {
+    return Status::Invalid(
+        "Could not unwrap SparseTensorCOO from the passed Python object.");
+  }
+}
+
+PyObject* wrap_sparse_tensor_coo(const std::shared_ptr<SparseTensorCOO>& sparse_tensor) {
+  return ::pyarrow_wrap_sparse_tensor_coo(sparse_tensor);
+}
+
 bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; }
 
 Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out) {
diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h
index ff5bf8f01dd..b4834f79f78 100644
--- a/cpp/src/arrow/python/pyarrow.h
+++ b/cpp/src/arrow/python/pyarrow.h
@@ -24,6 +24,8 @@
 
 #include "arrow/python/visibility.h"
 
+#include "arrow/sparse_tensor.h"
+
 namespace arrow {
 
 class Array;
@@ -67,6 +69,18 @@ ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor);
 ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
 
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_coo(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_coo(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCOO>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_coo(
+    const std::shared_ptr<SparseTensorCOO>& sparse_tensor);
+
+ARROW_PYTHON_EXPORT bool is_sparse_tensor_csr(PyObject* sparse_tensor);
+ARROW_PYTHON_EXPORT Status
+unwrap_sparse_tensor_csr(PyObject* sparse_tensor, std::shared_ptr<SparseTensorCSR>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_csr(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor);
+
 ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
 ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h
index b76e9614a8a..2d8f71c8c5a 100644
--- a/cpp/src/arrow/python/pyarrow_api.h
+++ b/cpp/src/arrow/python/pyarrow_api.h
@@ -50,6 +50,10 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr
 #define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table
 static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0;
 #define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr)(std::shared_ptr< arrow::SparseTensorCSR> const &) = 0;
+#define pyarrow_wrap_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo)(std::shared_ptr< arrow::SparseTensorCOO> const &) = 0;
+#define pyarrow_wrap_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo
 static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0;
 #define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array
 static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0;
@@ -68,6 +72,10 @@ static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0;
 #define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table
 static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0;
 #define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor
+static std::shared_ptr< arrow::SparseTensorCSR> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr
+static std::shared_ptr< arrow::SparseTensorCOO> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0;
 #define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0;
@@ -84,6 +92,10 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_ptr< arrow::Scalar> const &) = 0;
 #define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0;
 #define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr)(PyObject *) = 0;
+#define pyarrow_is_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo)(PyObject *) = 0;
+#define pyarrow_is_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0;
 #define pyarrow_is_column __pyx_api_f_7pyarrow_3lib_pyarrow_is_column
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0;
@@ -167,6 +179,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr, "PyObject *(std::shared_ptr< arrow::SparseTensorCSR> const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo, "PyObject *(std::shared_ptr< arrow::SparseTensorCOO> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad;
@@ -176,6 +190,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr, "std::shared_ptr< arrow::SparseTensorCSR> (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo, "std::shared_ptr< arrow::SparseTensorCOO> (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad;
@@ -184,6 +200,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad;
diff --git a/cpp/src/arrow/python/pyarrow_lib.h b/cpp/src/arrow/python/pyarrow_lib.h
index 5f5fc4c6b6f..a4bc1039ee8 100644
--- a/cpp/src/arrow/python/pyarrow_lib.h
+++ b/cpp/src/arrow/python/pyarrow_lib.h
@@ -48,6 +48,8 @@ __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std
 __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &);
 __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &);
 __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo(std::shared_ptr< arrow::SparseTensorCOO> const &);
+__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr(std::shared_ptr< arrow::SparseTensorCSR> const &);
 __PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *);
@@ -57,6 +59,8 @@ __PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_field(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *);
 __PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCOO> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCSR> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr(PyObject *);
 
 #endif /* !__PYX_HAVE_API__pyarrow__lib */
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 8ff0e01480f..d93e3954e41 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -515,7 +515,7 @@ Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* bui
           builder->AppendNdarray(static_cast<int32_t>(blobs_out->ndarrays.size())));
       std::shared_ptr<Tensor> tensor;
       RETURN_NOT_OK(NdarrayToTensor(default_memory_pool(),
-                                    reinterpret_cast<PyObject*>(array), &tensor));
+                                    reinterpret_cast<PyObject*>(array), {}, &tensor));
       blobs_out->ndarrays.push_back(tensor);
     } break;
     default: {
diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc
index daff0194fe5..69ec4ca5c60 100644
--- a/cpp/src/arrow/sparse_tensor-test.cc
+++ b/cpp/src/arrow/sparse_tensor-test.cc
@@ -182,6 +182,25 @@ TEST(TestSparseCOOTensor, CreationFromNonContiguousTensor) {
   AssertCOOIndex(sidx, 11, {1, 2, 3});
 }
 
+TEST(TestSparseCOOTensor, TensorEquality) {
+  std::vector<int64_t> shape = {2, 3, 4};
+  std::vector<int64_t> values1 = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                  0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+  std::vector<int64_t> values2 = {0, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                  0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+  std::shared_ptr<Buffer> buffer1 = Buffer::Wrap(values1);
+  std::shared_ptr<Buffer> buffer2 = Buffer::Wrap(values2);
+  NumericTensor<Int64Type> tensor1(buffer1, shape);
+  NumericTensor<Int64Type> tensor2(buffer1, shape);
+  NumericTensor<Int64Type> tensor3(buffer2, shape);
+  SparseTensorImpl<SparseCOOIndex> st1(tensor1);
+  SparseTensorImpl<SparseCOOIndex> st2(tensor2);
+  SparseTensorImpl<SparseCOOIndex> st3(tensor3);
+
+  ASSERT_TRUE(st1.Equals(st2));
+  ASSERT_TRUE(!st1.Equals(st3));
+}
+
 TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) {
   std::vector<int64_t> shape = {6, 4};
   std::vector<int64_t> values = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
@@ -269,4 +288,24 @@ TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) {
   ASSERT_EQ(std::vector<int64_t>({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values);
 }
 
+TEST(TestSparseCSRMatrix, TensorEquality) {
+  std::vector<int64_t> shape = {6, 4};
+  std::vector<int64_t> values1 = {1, 0,  2, 0,  0,  3, 0,  4, 5, 0,  6, 0,
+                                  0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+  std::vector<int64_t> values2 = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+  std::shared_ptr<Buffer> buffer1 = Buffer::Wrap(values1);
+  std::shared_ptr<Buffer> buffer2 = Buffer::Wrap(values2);
+  NumericTensor<Int64Type> tensor1(buffer1, shape);
+  NumericTensor<Int64Type> tensor2(buffer1, shape);
+  NumericTensor<Int64Type> tensor3(buffer2, shape);
+  SparseTensorImpl<SparseCSRIndex> st1(tensor1);
+  SparseTensorImpl<SparseCSRIndex> st2(tensor2);
+  SparseTensorImpl<SparseCSRIndex> st3(tensor3);
+
+  ASSERT_TRUE(st1.Equals(st2));
+  ASSERT_TRUE(!st1.Equals(st3));
+}
+
 }  // namespace arrow
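These tests exercise the `compare.cc` fix at the top of this patch: `SparseTensorEqualsImpl` previously returned the raw `memcmp` result, so equal buffers (where `memcmp` yields 0) compared as unequal. The same behavior is observable from Python once the bindings below land; a rough sketch, assuming a pyarrow build with this patch applied:

```python
import numpy as np
import pyarrow as pa

# Two sparse tensors built from identical dense data must compare equal.
dense = np.array([[4, 0, 9, 0],
                  [0, 7, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 5]])
st1 = pa.SparseTensorCOO.from_dense_numpy(dense)
st2 = pa.SparseTensorCOO.from_dense_numpy(dense.copy())
assert st1.equals(st2)
assert st1 == st2
```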
diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst
index 6b5c9ce1902..f15b1bedbac 100644
--- a/docs/source/python/extending.rst
+++ b/docs/source/python/extending.rst
@@ -116,6 +116,16 @@ C++ objects.
    Return whether *obj* wraps an Arrow C++ :class:`Tensor` pointer;
    in other words, whether *obj* is a :py:class:`pyarrow.Tensor` instance.
 
+.. function:: bool is_sparse_tensor_coo(PyObject* obj)
+
+   Return whether *obj* wraps an Arrow C++ :class:`SparseTensorCOO` pointer;
+   in other words, whether *obj* is a :py:class:`pyarrow.SparseTensorCOO` instance.
+
+.. function:: bool is_sparse_tensor_csr(PyObject* obj)
+
+   Return whether *obj* wraps an Arrow C++ :class:`SparseTensorCSR` pointer;
+   in other words, whether *obj* is a :py:class:`pyarrow.SparseTensorCSR` instance.
+
 The following functions expect a pyarrow object, unwrap the underlying
 Arrow C++ API pointer, and put it in the *out* parameter.  The returned
 :class:`Status` object must be inspected first to know whether any error
@@ -157,6 +167,14 @@ occurred.  If successful, *out* is guaranteed to be non-NULL.
 
    Unwrap the Arrow C++ :class:`Tensor` pointer from *obj* and put it in *out*.
 
+.. function:: Status unwrap_sparse_tensor_coo(PyObject* obj, std::shared_ptr<SparseTensorCOO>* out)
+
+   Unwrap the Arrow C++ :class:`SparseTensorCOO` pointer from *obj* and put it in *out*.
+
+.. function:: Status unwrap_sparse_tensor_csr(PyObject* obj, std::shared_ptr<SparseTensorCSR>* out)
+
+   Unwrap the Arrow C++ :class:`SparseTensorCSR` pointer from *obj* and put it in *out*.
+
 The following functions take an Arrow C++ API pointer and wrap it in a
 pyarray object of the corresponding type.  A new reference is returned.
 On error, NULL is returned and a Python exception is set.
@@ -197,6 +215,14 @@ On error, NULL is returned and a Python exception is set.
 
    Wrap the Arrow C++ *tensor* in a :py:class:`pyarrow.Tensor` instance.
 
+.. function:: PyObject* wrap_sparse_tensor_coo(const std::shared_ptr<SparseTensorCOO>& sparse_tensor)
+
+   Wrap the Arrow C++ *COO sparse tensor* in a :py:class:`pyarrow.SparseTensorCOO` instance.
+
+.. function:: PyObject* wrap_sparse_tensor_csr(const std::shared_ptr<SparseTensorCSR>& sparse_tensor)
+
+   Wrap the Arrow C++ *CSR sparse tensor* in a :py:class:`pyarrow.SparseTensorCSR` instance.
+
 Cython API
 ----------
 
@@ -257,6 +283,14 @@ an exception) if the input is not of the right type.
 
    Unwrap the Arrow C++ :cpp:class:`Tensor` pointer from *obj*.
 
+.. function:: pyarrow_unwrap_sparse_tensor_coo(obj) -> shared_ptr[CSparseTensorCOO]
+
+   Unwrap the Arrow C++ :cpp:class:`SparseTensorCOO` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_sparse_tensor_csr(obj) -> shared_ptr[CSparseTensorCSR]
+
+   Unwrap the Arrow C++ :cpp:class:`SparseTensorCSR` pointer from *obj*.
+
 The following functions take a Arrow C++ API pointer and wrap it in a
 pyarray object of the corresponding type.  An exception is raised on error.
 
@@ -300,6 +334,14 @@ pyarray object of the corresponding type.  An exception is raised on error.
 
    Wrap the Arrow C++ *tensor* in a Python :class:`pyarrow.Tensor` instance.
 
+.. function:: pyarrow_wrap_sparse_tensor_coo(sp_array: const shared_ptr[CSparseTensorCOO]& sparse_tensor) -> object
+
+   Wrap the Arrow C++ *COO sparse tensor* in a Python :class:`pyarrow.SparseTensorCOO` instance.
+
+.. function:: pyarrow_wrap_sparse_tensor_csr(sp_array: const shared_ptr[CSparseTensorCSR]& sparse_tensor) -> object
+
+   Wrap the Arrow C++ *CSR sparse tensor* in a Python :class:`pyarrow.SparseTensorCSR` instance.
+
 Example
 ~~~~~~~
diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd
index 95cea5ca4fc..432880556cc 100644
--- a/python/pyarrow/__init__.pxd
+++ b/python/pyarrow/__init__.pxd
@@ -20,8 +20,9 @@ from __future__ import absolute_import
 from libcpp.memory cimport shared_ptr
 from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType,
                                         CField, CRecordBatch, CSchema,
-                                        CTable, CTensor)
-
+                                        CTable, CTensor,
+                                        CSparseTensorCSR, CSparseTensorCOO)
+from pyarrow.compat import frombytes
 
 cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
     cdef int import_pyarrow() except -1
@@ -31,6 +32,10 @@ cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
     cdef object wrap_schema(const shared_ptr[CSchema]& schema)
     cdef object wrap_array(const shared_ptr[CArray]& sp_array)
     cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
+    cdef object wrap_sparse_tensor_coo(
+        const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor)
+    cdef object wrap_sparse_tensor_csr(
+        const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor)
     cdef object wrap_column(const shared_ptr[CColumn]& ccolumn)
     cdef object wrap_table(const shared_ptr[CTable]& ctable)
    cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch)
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 487065c2892..bbbd91a9508 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -66,6 +66,7 @@ def parse_git(root, **kwargs):
                      schema,
                      Array, Tensor, array, chunked_array, column, table,
+                     SparseTensorCSR, SparseTensorCOO,
                      infer_type, from_numpy_dtype,
                      NullArray,
                      NumericArray, IntegerArray, FloatingPointArray,
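With the export above, both sparse tensor classes become available at the package top level; a quick smoke test, assuming a build with this patch:

```python
import pyarrow as pa

# The new classes ship alongside pa.Tensor.
assert hasattr(pa, 'SparseTensorCOO')
assert hasattr(pa, 'SparseTensorCSR')
```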
- """ - - def __init__(self): - raise TypeError("Do not call Tensor's constructor directly, use one " - "of the `pyarrow.Tensor.from_*` functions instead.") - - cdef void init(self, const shared_ptr[CTensor]& sp_tensor): - self.sp_tensor = sp_tensor - self.tp = sp_tensor.get() - self.type = pyarrow_wrap_data_type(self.tp.type()) - - def __repr__(self): - return """ -type: {0.type} -shape: {0.shape} -strides: {0.strides}""".format(self) - - @staticmethod - def from_numpy(obj): - cdef shared_ptr[CTensor] ctensor - with nogil: - check_status(NdarrayToTensor(c_default_memory_pool(), obj, - &ctensor)) - return pyarrow_wrap_tensor(ctensor) - - def to_numpy(self): - """ - Convert arrow::Tensor to numpy.ndarray with zero copy - """ - cdef PyObject* out - - with nogil: - check_status(TensorToNdarray(self.sp_tensor, self, &out)) - return PyObject_to_object(out) - - def equals(self, Tensor other): - """ - Return true if the tensors contains exactly equal data - """ - return self.tp.Equals(deref(other.tp)) - - def __eq__(self, other): - if isinstance(other, Tensor): - return self.equals(other) - else: - return NotImplemented - - @property - def is_mutable(self): - return self.tp.is_mutable() - - @property - def is_contiguous(self): - return self.tp.is_contiguous() - - @property - def ndim(self): - return self.tp.ndim() - - @property - def size(self): - return self.tp.size() - - @property - def shape(self): - # Cython knows how to convert a vector[T] to a Python list - return tuple(self.tp.shape()) - - @property - def strides(self): - return tuple(self.tp.strides()) - - def __getbuffer__(self, cp.Py_buffer* buffer, int flags): - buffer.buf = self.tp.data().get().data() - pep3118_format = self.type.pep3118_format - if pep3118_format is None: - raise NotImplementedError("type %s not supported for buffer " - "protocol" % (self.type,)) - buffer.format = pep3118_format - buffer.itemsize = self.type.bit_width // 8 - buffer.internal = NULL - buffer.len = self.tp.size() * buffer.itemsize - buffer.ndim = self.tp.ndim() - buffer.obj = self - if self.tp.is_mutable(): - buffer.readonly = 0 - else: - buffer.readonly = 1 - # NOTE: This assumes Py_ssize_t == int64_t, and that the shape - # and strides arrays lifetime is tied to the tensor's - buffer.shape = &self.tp.shape()[0] - buffer.strides = &self.tp.strides()[0] - buffer.suboffsets = NULL - - cdef wrap_array_output(PyObject* output): cdef object obj = PyObject_to_object(output) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8798834b5fd..93a75945ce3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -593,6 +593,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int64_t size() int ndim() + const vector[c_string]& dim_names() const c_string& dim_name(int i) c_bool is_mutable() @@ -600,6 +601,38 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type type_id() c_bool Equals(const CTensor& other) + cdef cppclass CSparseTensorCOO" arrow::SparseTensorCOO": + shared_ptr[CDataType] type() + shared_ptr[CBuffer] data() + + const vector[int64_t]& shape() + int64_t size() + int64_t non_zero_length() + + int ndim() + const vector[c_string]& dim_names() + const c_string& dim_name(int i) + + c_bool is_mutable() + Type type_id() + c_bool Equals(const CSparseTensorCOO& other) + + cdef cppclass CSparseTensorCSR" arrow::SparseTensorCSR": + shared_ptr[CDataType] type() + shared_ptr[CBuffer] data() + + const vector[int64_t]& shape() + int64_t size() + int64_t 
+    cdef cppclass CSparseTensorCSR" arrow::SparseTensorCSR":
+        shared_ptr[CDataType] type()
+        shared_ptr[CBuffer] data()
+
+        const vector[int64_t]& shape()
+        int64_t size()
+        int64_t non_zero_length()
+
+        int ndim()
+        const vector[c_string]& dim_names()
+        const c_string& dim_name(int i)
+
+        c_bool is_mutable()
+        Type type_id()
+        c_bool Equals(const CSparseTensorCSR& other)
+
     cdef cppclass CScalar" arrow::Scalar":
         shared_ptr[CDataType] type
 
@@ -1202,11 +1235,38 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
                                      shared_ptr[CChunkedArray]* out)
 
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
+                            const vector[c_string]& dim_names,
                             shared_ptr[CTensor]* out)
 
     CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base,
                             PyObject** out)
 
+    CStatus SparseTensorCOOToNdarray(
+        const shared_ptr[CSparseTensorCOO]& sparse_tensor, object base,
+        PyObject** out_data, PyObject** out_coords)
+
+    CStatus SparseTensorCSRToNdarray(
+        const shared_ptr[CSparseTensorCSR]& sparse_tensor, object base,
+        PyObject** out_data, PyObject** out_indptr, PyObject** out_indices)
+
+    CStatus NdarraysToSparseTensorCOO(CMemoryPool* pool, object data_ao,
+                                      object coords_ao,
+                                      const vector[int64_t]& shape,
+                                      const vector[c_string]& dim_names,
+                                      shared_ptr[CSparseTensorCOO]* out)
+
+    CStatus NdarraysToSparseTensorCSR(CMemoryPool* pool, object data_ao,
+                                      object indptr_ao, object indices_ao,
+                                      const vector[int64_t]& shape,
+                                      const vector[c_string]& dim_names,
+                                      shared_ptr[CSparseTensorCSR]* out)
+
+    CStatus TensorToSparseTensorCOO(shared_ptr[CTensor],
+                                    shared_ptr[CSparseTensorCOO]* out)
+
+    CStatus TensorToSparseTensorCSR(shared_ptr[CTensor],
+                                    shared_ptr[CSparseTensorCSR]* out)
+
     CStatus ConvertArrayToPandas(const PandasOptions& options,
                                  const shared_ptr[CArray]& arr,
                                  object py_ref, PyObject** out)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 79ab9478b16..898c70a4bf7 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -231,6 +231,28 @@ cdef class Tensor:
     cdef void init(self, const shared_ptr[CTensor]& sp_tensor)
 
 
+cdef class SparseTensorCSR:
+    cdef:
+        shared_ptr[CSparseTensorCSR] sp_sparse_tensor
+        CSparseTensorCSR* stp
+
+    cdef readonly:
+        DataType type
+
+    cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor)
+
+
+cdef class SparseTensorCOO:
+    cdef:
+        shared_ptr[CSparseTensorCOO] sp_sparse_tensor
+        CSparseTensorCOO* stp
+
+    cdef readonly:
+        DataType type
+
+    cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor)
+
+
 cdef class NullArray(Array):
     pass
 
@@ -452,6 +474,10 @@ cdef public object pyarrow_wrap_resizable_buffer(
 cdef public object pyarrow_wrap_schema(const shared_ptr[CSchema]& type)
 cdef public object pyarrow_wrap_table(const shared_ptr[CTable]& ctable)
 cdef public object pyarrow_wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
+cdef public object pyarrow_wrap_sparse_tensor_coo(
+    const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor)
+cdef public object pyarrow_wrap_sparse_tensor_csr(
+    const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor)
 
 cdef public shared_ptr[CArray] pyarrow_unwrap_array(object array)
 cdef public shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch)
@@ -462,3 +488,7 @@ cdef public shared_ptr[CField] pyarrow_unwrap_field(object field)
 cdef public shared_ptr[CSchema] pyarrow_unwrap_schema(object schema)
 cdef public shared_ptr[CTable] pyarrow_unwrap_table(object table)
 cdef public shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor)
+cdef public shared_ptr[CSparseTensorCOO] pyarrow_unwrap_sparse_tensor_coo(
+    object sparse_tensor)
+cdef public shared_ptr[CSparseTensorCSR] pyarrow_unwrap_sparse_tensor_csr(
+    object sparse_tensor)
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 783e2b2731a..2da5a8301bc 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -121,6 +121,9 @@ include "builder.pxi"
 # Column, Table, Record Batch
 include "table.pxi"
 
+# Tensors
+include "tensor.pxi"
+
 # File IO
 include "io.pxi"
 include "io-hdfs.pxi"
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 33bc8031804..05c07748f17 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -18,7 +18,8 @@
 from libcpp.memory cimport shared_ptr
 from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField,
                                         CRecordBatch, CSchema,
-                                        CTable, CTensor)
+                                        CTable, CTensor,
+                                        CSparseTensorCSR, CSparseTensorCOO)
 
 # You cannot assign something to a dereferenced pointer in Cython thus these
 # methods don't use Status to indicate a successful operation.
@@ -225,6 +226,7 @@ cdef api object pyarrow_wrap_scalar(const shared_ptr[CScalar]& sp_scalar):
     scalar.init(sp_scalar)
     return scalar
 
+
 cdef api bint pyarrow_is_tensor(object tensor):
     return isinstance(tensor, Tensor)
 
@@ -248,6 +250,52 @@ cdef api object pyarrow_wrap_tensor(
     return tensor
 
 
+cdef api bint pyarrow_is_sparse_tensor_coo(object sparse_tensor):
+    return isinstance(sparse_tensor, SparseTensorCOO)
+
+
+cdef api shared_ptr[CSparseTensorCOO] pyarrow_unwrap_sparse_tensor_coo(
+        object sparse_tensor):
+    cdef SparseTensorCOO sten
+    if pyarrow_is_sparse_tensor_coo(sparse_tensor):
+        sten = <SparseTensorCOO>(sparse_tensor)
+        return sten.sp_sparse_tensor
+
+    return shared_ptr[CSparseTensorCOO]()
+
+
+cdef api object pyarrow_wrap_sparse_tensor_coo(
+        const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor):
+    if sp_sparse_tensor.get() == NULL:
+        raise ValueError('SparseTensorCOO was NULL')
+
+    cdef SparseTensorCOO sparse_tensor = SparseTensorCOO.__new__(
+        SparseTensorCOO)
+    sparse_tensor.init(sp_sparse_tensor)
+    return sparse_tensor
+
+
+cdef api bint pyarrow_is_sparse_tensor_csr(object sparse_tensor):
+    return isinstance(sparse_tensor, SparseTensorCSR)
+
+
+cdef api shared_ptr[CSparseTensorCSR] pyarrow_unwrap_sparse_tensor_csr(
+        object sparse_tensor):
+    cdef SparseTensorCSR sten
+    if pyarrow_is_sparse_tensor_csr(sparse_tensor):
+        sten = <SparseTensorCSR>(sparse_tensor)
+        return sten.sp_sparse_tensor
+
+    return shared_ptr[CSparseTensorCSR]()
+
+
+cdef api object pyarrow_wrap_sparse_tensor_csr(
+        const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor):
+    if sp_sparse_tensor.get() == NULL:
+        raise ValueError('SparseTensorCSR was NULL')
+
+    cdef SparseTensorCSR sparse_tensor = SparseTensorCSR.__new__(
+        SparseTensorCSR)
+    sparse_tensor.init(sp_sparse_tensor)
+    return sparse_tensor
+
+
 cdef api bint pyarrow_is_column(object column):
     return isinstance(column, Column)
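The `pyarrow_is_*` helpers above are plain `isinstance` checks, and the unwrap helpers return a null `shared_ptr` for foreign objects instead of raising. From pure Python the equivalent check is just `isinstance`; a small sketch, assuming a build with this patch:

```python
import numpy as np
import pyarrow as pa

st = pa.SparseTensorCOO.from_dense_numpy(np.eye(3))
assert isinstance(st, pa.SparseTensorCOO)
assert not isinstance(st, pa.SparseTensorCSR)
```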
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
new file mode 100644
index 00000000000..17554e61740
--- /dev/null
+++ b/python/pyarrow/tensor.pxi
@@ -0,0 +1,367 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+cdef class Tensor:
+    """
+    An n-dimensional array a.k.a. Tensor.
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call Tensor's constructor directly, use one "
+                        "of the `pyarrow.Tensor.from_*` functions instead.")
+
+    cdef void init(self, const shared_ptr[CTensor]& sp_tensor):
+        self.sp_tensor = sp_tensor
+        self.tp = sp_tensor.get()
+        self.type = pyarrow_wrap_data_type(self.tp.type())
+
+    def __repr__(self):
+        return """
+type: {0.type}
+shape: {0.shape}
+strides: {0.strides}""".format(self)
+
+    @staticmethod
+    def from_numpy(obj, dim_names=None):
+        cdef:
+            vector[c_string] c_dim_names
+            shared_ptr[CTensor] ctensor
+
+        if dim_names is not None:
+            for x in dim_names:
+                c_dim_names.push_back(tobytes(x))
+
+        check_status(NdarrayToTensor(c_default_memory_pool(), obj,
+                                     c_dim_names, &ctensor))
+        return pyarrow_wrap_tensor(ctensor)
+
+    def to_numpy(self):
+        """
+        Convert arrow::Tensor to numpy.ndarray with zero copy
+        """
+        cdef PyObject* out
+
+        check_status(TensorToNdarray(self.sp_tensor, self, &out))
+        return PyObject_to_object(out)
+
+    def equals(self, Tensor other):
+        """
+        Return true if the tensors contain exactly equal data
+        """
+        return self.tp.Equals(deref(other.tp))
+
+    def __eq__(self, other):
+        if isinstance(other, Tensor):
+            return self.equals(other)
+        else:
+            return NotImplemented
+
+    def dim_name(self, i):
+        return frombytes(self.tp.dim_name(i))
+
+    @property
+    def dim_names(self):
+        return [frombytes(x) for x in tuple(self.tp.dim_names())]
+
+    @property
+    def is_mutable(self):
+        return self.tp.is_mutable()
+
+    @property
+    def is_contiguous(self):
+        return self.tp.is_contiguous()
+
+    @property
+    def ndim(self):
+        return self.tp.ndim()
+
+    @property
+    def size(self):
+        return self.tp.size()
+
+    @property
+    def shape(self):
+        # Cython knows how to convert a vector[T] to a Python list
+        return tuple(self.tp.shape())
+
+    @property
+    def strides(self):
+        return tuple(self.tp.strides())
+
+    def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
+        buffer.buf = <char *> self.tp.data().get().data()
+        pep3118_format = self.type.pep3118_format
+        if pep3118_format is None:
+            raise NotImplementedError("type %s not supported for buffer "
+                                      "protocol" % (self.type,))
+        buffer.format = pep3118_format
+        buffer.itemsize = self.type.bit_width // 8
+        buffer.internal = NULL
+        buffer.len = self.tp.size() * buffer.itemsize
+        buffer.ndim = self.tp.ndim()
+        buffer.obj = self
+        if self.tp.is_mutable():
+            buffer.readonly = 0
+        else:
+            buffer.readonly = 1
+        # NOTE: This assumes Py_ssize_t == int64_t, and that the shape
+        # and strides arrays lifetime is tied to the tensor's
+        buffer.shape = <Py_ssize_t *> &self.tp.shape()[0]
+        buffer.strides = <Py_ssize_t *> &self.tp.strides()[0]
+        buffer.suboffsets = NULL
+
+
+cdef class SparseTensorCOO:
+    """
+    A sparse COO tensor.
+ """ + + def __init__(self): + raise TypeError("Do not call SparseTensorCOO's constructor directly, " + "use one of the `pyarrow.SparseTensorCOO.from_*` " + "functions instead.") + + cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor): + self.sp_sparse_tensor = sp_sparse_tensor + self.stp = sp_sparse_tensor.get() + self.type = pyarrow_wrap_data_type(self.stp.type()) + + def __repr__(self): + return """ +type: {0.type} +shape: {0.shape}""".format(self) + + @classmethod + def from_dense_numpy(cls, obj, dim_names=None): + """ + Convert numpy.ndarray to arrow::SparseTensorCOO + """ + return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names)) + + @staticmethod + def from_numpy(data, coords, shape, dim_names=None): + """ + Create arrow::SparseTensorCOO from numpy.ndarrays + """ + cdef shared_ptr[CSparseTensorCOO] csparse_tensor + cdef vector[int64_t] c_shape + cdef vector[c_string] c_dim_names + + for x in shape: + c_shape.push_back(x) + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + # Enforce precondition for SparseTensorCOO indices + coords = np.require(coords, dtype='i8', requirements='F') + if coords.ndim != 2: + raise ValueError("Expected 2-dimensional array for " + "SparseTensorCOO indices") + + check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(), + data, coords, c_shape, c_dim_names, &csparse_tensor)) + return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) + + @staticmethod + def from_tensor(obj): + """ + Convert arrow::Tensor to arrow::SparseTensorCOO + """ + cdef shared_ptr[CSparseTensorCOO] csparse_tensor + cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) + + with nogil: + check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor)) + + return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) + + def to_numpy(self): + """ + Convert arrow::SparseTensorCOO to numpy.ndarrays with zero copy + """ + cdef PyObject* out_data + cdef PyObject* out_coords + + check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_coords)) + return PyObject_to_object(out_data), PyObject_to_object(out_coords) + + def equals(self, SparseTensorCOO other): + """ + Return true if sparse tensors contains exactly equal data + """ + return self.stp.Equals(deref(other.stp)) + + def __eq__(self, other): + if isinstance(other, SparseTensorCOO): + return self.equals(other) + else: + return NotImplemented + + @property + def is_mutable(self): + return self.stp.is_mutable() + + @property + def ndim(self): + return self.stp.ndim() + + @property + def shape(self): + # Cython knows how to convert a vector[T] to a Python list + return tuple(self.stp.shape()) + + @property + def size(self): + return self.stp.size() + + def dim_name(self, i): + return frombytes(self.stp.dim_name(i)) + + @property + def dim_names(self): + return [frombytes(x) for x in tuple(self.stp.dim_names())] + + @property + def non_zero_length(self): + return self.stp.non_zero_length() + + +cdef class SparseTensorCSR: + """ + A sparse CSR tensor. 
+ """ + + def __init__(self): + raise TypeError("Do not call SparseTensorCSR's constructor directly, " + "use one of the `pyarrow.SparseTensorCSR.from_*` " + "functions instead.") + + cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor): + self.sp_sparse_tensor = sp_sparse_tensor + self.stp = sp_sparse_tensor.get() + self.type = pyarrow_wrap_data_type(self.stp.type()) + + def __repr__(self): + return """ +type: {0.type} +shape: {0.shape}""".format(self) + + @classmethod + def from_dense_numpy(cls, obj, dim_names=None): + """ + Convert numpy.ndarray to arrow::SparseTensorCSR + """ + return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names)) + + @staticmethod + def from_numpy(data, indptr, indices, shape, dim_names=None): + """ + Create arrow::SparseTensorCSR from numpy.ndarrays + """ + cdef shared_ptr[CSparseTensorCSR] csparse_tensor + cdef vector[int64_t] c_shape + cdef vector[c_string] c_dim_names + + for x in shape: + c_shape.push_back(x) + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + # Enforce precondition for SparseTensorCSR indices + indptr = np.require(indptr, dtype='i8') + indices = np.require(indices, dtype='i8') + if indptr.ndim != 1: + raise ValueError("Expected 1-dimensional array for " + "SparseTensorCSR indptr") + if indices.ndim != 1: + raise ValueError("Expected 1-dimensional array for " + "SparseTensorCSR indices") + + check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(), + data, indptr, indices, c_shape, c_dim_names, + &csparse_tensor)) + return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) + + @staticmethod + def from_tensor(obj): + """ + Convert arrow::Tensor to arrow::SparseTensorCSR + """ + cdef shared_ptr[CSparseTensorCSR] csparse_tensor + cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) + + with nogil: + check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor)) + + return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) + + def to_numpy(self): + """ + Convert arrow::SparseTensorCSR to numpy.ndarrays with zero copy + """ + cdef PyObject* out_data + cdef PyObject* out_indptr + cdef PyObject* out_indices + + check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_indptr, &out_indices)) + return (PyObject_to_object(out_data), PyObject_to_object(out_indptr), + PyObject_to_object(out_indices)) + + def equals(self, SparseTensorCSR other): + """ + Return true if sparse tensors contains exactly equal data + """ + return self.stp.Equals(deref(other.stp)) + + def __eq__(self, other): + if isinstance(other, SparseTensorCSR): + return self.equals(other) + else: + return NotImplemented + + @property + def is_mutable(self): + return self.stp.is_mutable() + + @property + def ndim(self): + return self.stp.ndim() + + @property + def shape(self): + # Cython knows how to convert a vector[T] to a Python list + return tuple(self.stp.shape()) + + @property + def size(self): + return self.stp.size() + + def dim_name(self, i): + return frombytes(self.stp.dim_name(i)) + + @property + def dim_names(self): + return [frombytes(x) for x in tuple(self.stp.dim_names())] + + @property + def non_zero_length(self): + return self.stp.non_zero_length() diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py new file mode 100644 index 00000000000..68564dacf4b --- /dev/null +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -0,0 +1,221 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
new file mode 100644
index 00000000000..68564dacf4b
--- /dev/null
+++ b/python/pyarrow/tests/test_sparse_tensor.py
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import sys
+
+import numpy as np
+import pyarrow as pa
+
+
+tensor_type_pairs = [
+    ('i1', pa.int8()),
+    ('i2', pa.int16()),
+    ('i4', pa.int32()),
+    ('i8', pa.int64()),
+    ('u1', pa.uint8()),
+    ('u2', pa.uint16()),
+    ('u4', pa.uint32()),
+    ('u8', pa.uint64()),
+    ('f2', pa.float16()),
+    ('f4', pa.float32()),
+    ('f8', pa.float64())
+]
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+    pa.SparseTensorCSR,
+    pa.SparseTensorCOO,
+])
+def test_sparse_tensor_attrs(sparse_tensor_type):
+    data = np.array([
+        [0, 1, 0, 0, 1],
+        [0, 0, 0, 0, 0],
+        [0, 0, 0, 1, 0],
+        [0, 0, 0, 0, 0],
+        [0, 3, 0, 0, 0],
+    ])
+    dim_names = ['x', 'y']
+    sparse_tensor = sparse_tensor_type.from_dense_numpy(data, dim_names)
+
+    assert sparse_tensor.ndim == 2
+    assert sparse_tensor.size == 25
+    assert sparse_tensor.shape == data.shape
+    assert sparse_tensor.is_mutable
+    assert sparse_tensor.dim_name(0) == dim_names[0]
+    assert sparse_tensor.dim_names == dim_names
+    assert sparse_tensor.non_zero_length == 4
+
+
+def test_sparse_tensor_coo_base_object():
+    data = np.array([[4], [9], [7], [5]])
+    coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]])
+    array = np.array([[4, 0, 9, 0],
+                      [0, 7, 0, 0],
+                      [0, 0, 0, 0],
+                      [0, 0, 0, 5]])
+    sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(array)
+    n = sys.getrefcount(sparse_tensor)
+    result_data, result_coords = sparse_tensor.to_numpy()
+    assert sys.getrefcount(sparse_tensor) == n + 2
+
+    sparse_tensor = None
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(coords, result_coords)
+    assert result_coords.flags.f_contiguous  # column-major
+
+
+def test_sparse_tensor_csr_base_object():
+    data = np.array([[1], [2], [3], [4], [5], [6]])
+    indptr = np.array([0, 2, 3, 6])
+    indices = np.array([0, 2, 2, 0, 1, 2])
+    array = np.array([[1, 0, 2],
+                      [0, 0, 3],
+                      [4, 5, 6]])
+
+    sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(array)
+    n = sys.getrefcount(sparse_tensor)
+    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+    assert sys.getrefcount(sparse_tensor) == n + 3
+
+    sparse_tensor = None
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(indptr, result_indptr)
+    assert np.array_equal(indices, result_indices)
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+    pa.SparseTensorCSR,
+    pa.SparseTensorCOO,
+])
+def test_sparse_tensor_equals(sparse_tensor_type):
+    def eq(a, b):
+        assert a.equals(b)
+        assert a == b
+        assert not (a != b)
+
+    def ne(a, b):
+        assert not a.equals(b)
+        assert not (a == b)
+        assert a != b
+
+    data = np.random.randn(10, 6)[::, ::2]
+    sparse_tensor1 = sparse_tensor_type.from_dense_numpy(data)
+    sparse_tensor2 = sparse_tensor_type.from_dense_numpy(
+        np.ascontiguousarray(data))
+    eq(sparse_tensor1, sparse_tensor2)
+    data = data.copy()
+    data[9, 0] = 1.0
+    sparse_tensor2 = sparse_tensor_type.from_dense_numpy(
+        np.ascontiguousarray(data))
+    ne(sparse_tensor1, sparse_tensor2)
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_coo_from_dense(dtype_str, arrow_type):
+    dtype = np.dtype(dtype_str)
+    data = np.array([[4], [9], [7], [5]]).astype(dtype)
+    coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]])
+    array = np.array([[4, 0, 9, 0],
+                      [0, 7, 0, 0],
+                      [0, 0, 0, 0],
+                      [0, 0, 0, 5]]).astype(dtype)
+    tensor = pa.Tensor.from_numpy(array)
+
+    # Test from numpy array
+    sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(array)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_coords = sparse_tensor.to_numpy()
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(coords, result_coords)
+
+    # Test from Tensor
+    sparse_tensor = pa.SparseTensorCOO.from_tensor(tensor)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_coords = sparse_tensor.to_numpy()
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(coords, result_coords)
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type):
+    dtype = np.dtype(dtype_str)
+    dense_data = np.array([[1, 0, 2],
+                           [0, 0, 3],
+                           [4, 5, 6]]).astype(dtype)
+
+    data = np.array([[1], [2], [3], [4], [5], [6]])
+    indptr = np.array([0, 2, 3, 6])
+    indices = np.array([0, 2, 2, 0, 1, 2])
+    tensor = pa.Tensor.from_numpy(dense_data)
+
+    # Test from numpy array
+    sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(dense_data)
+    repr(sparse_tensor)
+    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(indptr, result_indptr)
+    assert np.array_equal(indices, result_indices)
+
+    # Test from Tensor
+    sparse_tensor = pa.SparseTensorCSR.from_tensor(tensor)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(indptr, result_indptr)
+    assert np.array_equal(indices, result_indices)
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type):
+    dtype = np.dtype(dtype_str)
+    data = np.array([[4], [9], [7], [5]]).astype(dtype)
+    coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]])
+    shape = (4, 4)
+    dim_names = ["x", "y"]
+
+    sparse_tensor = pa.SparseTensorCOO.from_numpy(data, coords, shape,
+                                                  dim_names)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_coords = sparse_tensor.to_numpy()
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(coords, result_coords)
+    assert sparse_tensor.dim_names == dim_names
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
+def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type):
+    dtype = np.dtype(dtype_str)
+    data = np.array([[1], [2], [3], [4], [5], [6]]).astype(dtype)
+    indptr = np.array([0, 2, 3, 6])
+    indices = np.array([0, 2, 2, 0, 1, 2])
+    shape = (3, 3)
+    dim_names = ["x", "y"]
+
+    sparse_tensor = pa.SparseTensorCSR.from_numpy(data, indptr, indices,
+                                                  shape, dim_names)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+    assert np.array_equal(data, result_data)
+    assert np.array_equal(indptr, result_indptr)
+    assert np.array_equal(indices, result_indices)
+    assert sparse_tensor.dim_names == dim_names
diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py
index 188a4a5e1a5..13f05d27489 100644
--- a/python/pyarrow/tests/test_tensor.py
+++ b/python/pyarrow/tests/test_tensor.py
@@ -23,12 +23,28 @@
 import pyarrow as pa
 
 
+tensor_type_pairs = [
+    ('i1', pa.int8()),
+    ('i2', pa.int16()),
+    ('i4', pa.int32()),
+    ('i8', pa.int64()),
+    ('u1', pa.uint8()),
+    ('u2', pa.uint16()),
+    ('u4', pa.uint32()),
+    ('u8', pa.uint64()),
+    ('f2', pa.float16()),
+    ('f4', pa.float32()),
+    ('f8', pa.float64())
+]
+
+
 def test_tensor_attrs():
     data = np.random.randn(10, 4)
 
     tensor = pa.Tensor.from_numpy(data)
     assert tensor.ndim == 2
+    assert tensor.dim_names == []
     assert tensor.size == 40
     assert tensor.shape == data.shape
     assert tensor.strides == data.strides
@@ -42,6 +58,13 @@ def test_tensor_attrs():
     tensor = pa.Tensor.from_numpy(data2)
     assert not tensor.is_mutable
 
+    # With dim_names
+    tensor = pa.Tensor.from_numpy(data, dim_names=('x', 'y'))
+    assert tensor.ndim == 2
+    assert tensor.dim_names == ['x', 'y']
+    assert tensor.dim_name(0) == 'x'
+    assert tensor.dim_name(1) == 'y'
+
 
 def test_tensor_base_object():
     tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
@@ -50,19 +73,7 @@ def test_tensor_base_object():
     assert sys.getrefcount(tensor) == n + 1
 
 
-@pytest.mark.parametrize('dtype_str,arrow_type', [
-    ('i1', pa.int8()),
-    ('i2', pa.int16()),
-    ('i4', pa.int32()),
-    ('i8', pa.int64()),
-    ('u1', pa.uint8()),
-    ('u2', pa.uint16()),
-    ('u4', pa.uint32()),
-    ('u8', pa.uint64()),
-    ('f2', pa.float16()),
-    ('f4', pa.float32()),
-    ('f8', pa.float64())
-])
+@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
 def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
     dtype = np.dtype(dtype_str)
     data = (100 * np.random.randn(10, 4)).astype(dtype)
@@ -76,15 +87,6 @@ def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
     assert (data == result).all()
 
 
-def _try_delete(path):
-    import gc
-    gc.collect()
-    try:
-        os.remove(path)
-    except os.error:
-        pass
-
-
 def test_tensor_ipc_roundtrip(tmpdir):
     data = np.random.randn(10, 4)
     tensor = pa.Tensor.from_numpy(data)