From 4eeae02d8a4e8a43133ce0733acad8df08fb5dd1 Mon Sep 17 00:00:00 2001
From: Rok
Date: Sun, 2 Jun 2019 04:37:45 +0200
Subject: [PATCH 1/8] Cython wrapper for SparseTensor.

---
 cpp/src/arrow/python/numpy_convert.cc      | 309 +++++++++++++++++++++
 cpp/src/arrow/python/numpy_convert.h       |  34 +++
 cpp/src/arrow/python/pyarrow.cc            |  38 +++
 cpp/src/arrow/python/pyarrow.h             |  14 +
 cpp/src/arrow/python/pyarrow_api.h         |  18 ++
 cpp/src/arrow/python/pyarrow_lib.h         |  16 ++
 docs/source/python/extending.rst           |  42 +++
 python/pyarrow/__init__.pxd                |   7 +-
 python/pyarrow/__init__.py                 |   1 +
 python/pyarrow/array.pxi                   | 193 +++++++++++++
 python/pyarrow/includes/libarrow.pxd       |  64 +++++
 python/pyarrow/lib.pxd                     |  30 ++
 python/pyarrow/public-api.pxi              |  50 +++-
 python/pyarrow/tests/test_sparse_tensor.py | 187 +++++++++++++
 14 files changed, 1001 insertions(+), 2 deletions(-)
 create mode 100644 python/pyarrow/tests/test_sparse_tensor.py

diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index f7068b353be..422ff0e644c 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -25,6 +25,7 @@
 #include <vector>
 
 #include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
 
@@ -274,5 +275,313 @@ Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
   return Status::OK();
 }
 
+Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_coords) {
+  PyAcquireGIL lock;
+
+  int type_num_data;
+  int type_num_coords;
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+  const std::shared_ptr<NumericTensor<Int64Type>> sparse_index_coords =
+      sparse_index.indices();
+
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data));
+  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+  RETURN_NOT_OK(GetNumPyType(*sparse_index_coords->type(), &type_num_coords));
+  PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords);
+  RETURN_IF_PYERROR();
+
+  std::vector<npy_intp> npy_shape_data({sparse_index.non_zero_length(), 1});
+  std::vector<npy_intp> npy_shape_coords({sparse_index_coords->shape()[0], 2});
+
+  const void* immutable_data = nullptr;
+  if (sparse_tensor->data()) {
+    immutable_data = sparse_tensor->data()->data();
+  }
+
+  const void* immutable_coords = nullptr;
+  if (sparse_index_coords->data()) {
+    immutable_coords = sparse_index_coords->data()->data();
+  }
+
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+  void* mutable_coords = const_cast<void*>(immutable_coords);
+
+  int array_flags = 0;
+  if (sparse_tensor->is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  PyObject* data =
+      PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr,
+                           mutable_data, array_flags, nullptr);
+  PyObject* coords =
+      PyArray_NewFromDescr(&PyArray_Type, dtype_coords, 2, npy_shape_coords.data(),
+                           nullptr, mutable_coords, array_flags, nullptr);
+
+  RETURN_IF_PYERROR()
+
+  *out_data = data;
+  *out_coords = coords;
+  return Status::OK();
+}
+
+Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  PyAcquireGIL lock;
+
+  int type_num_data;
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data));
+  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+  RETURN_IF_PYERROR();
+
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+      *sparse_tensor->sparse_index());
+  const std::shared_ptr<NumericTensor<Int64Type>> sparse_index_indptr =
+      sparse_index.indptr();
+  const std::shared_ptr<NumericTensor<Int64Type>> sparse_index_indices =
+      sparse_index.indices();
+
+  std::vector<npy_intp> npy_shape_data({sparse_index.non_zero_length(), 1});
+  std::vector<npy_intp> npy_shape_indptr({sparse_index_indptr->shape()[0], 1});
+  std::vector<npy_intp> npy_shape_indices({sparse_index_indices->shape()[0], 1});
+
+  const void* immutable_data = nullptr;
+  if (sparse_tensor->data()) {
+    immutable_data = sparse_tensor->data()->data();
+  }
+
+  const void* immutable_indptr = nullptr;
+  if (sparse_index_indptr->data()) {
+    immutable_indptr = sparse_index_indptr->data()->data();
+  }
+
+  const void* immutable_indices = nullptr;
+  if (sparse_index_indices->data()) {
+    immutable_indices = sparse_index_indices->data()->data();
+  }
+
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+  void* mutable_indptr = const_cast<void*>(immutable_indptr);
+  void* mutable_indices = const_cast<void*>(immutable_indices);
+
+  int array_flags = 0;
+  if (sparse_tensor->is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  int type_num_indptr;
+  RETURN_NOT_OK(GetNumPyType(*sparse_index_indptr->type(), &type_num_indptr));
+  PyArray_Descr* dtype_indptr = PyArray_DescrNewFromType(type_num_indptr);
+
+  int type_num_indices;
+  RETURN_NOT_OK(GetNumPyType(*sparse_index_indices->type(), &type_num_indices));
+  PyArray_Descr* dtype_indices = PyArray_DescrNewFromType(type_num_indices);
+
+  PyObject* result_data =
+      PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr,
+                           mutable_data, array_flags, nullptr);
+  PyObject* result_indptr =
+      PyArray_NewFromDescr(&PyArray_Type, dtype_indptr, 1, npy_shape_indptr.data(),
+                           nullptr, mutable_indptr, array_flags, nullptr);
+  PyObject* result_indices =
+      PyArray_NewFromDescr(&PyArray_Type, dtype_indices, 1, npy_shape_indices.data(),
+                           nullptr, mutable_indices, array_flags, nullptr);
+  RETURN_IF_PYERROR()
+
+  *out_data = result_data;
+  *out_indptr = result_indptr;
+  *out_indices = result_indices;
+  return Status::OK();
+}
+
+Status NdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao,
+                                std::shared_ptr<SparseTensorCOO>* out) {
+  if (!PyArray_Check(ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
+
+  // TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
+
+  int ndim = PyArray_NDIM(ndarray);
+
+  // This is also holding the GIL, so don't already draw it.
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
+  std::vector<int64_t> shape(ndim);
+  std::vector<int64_t> strides(ndim);
+
+  {
+    PyAcquireGIL lock;
+    npy_intp* array_strides = PyArray_STRIDES(ndarray);
+    npy_intp* array_shape = PyArray_SHAPE(ndarray);
+    for (int i = 0; i < ndim; ++i) {
+      if (array_strides[i] < 0) {
+        return Status::Invalid("Negative ndarray strides not supported");
+      }
+      shape[i] = array_shape[i];
+      strides[i] = array_strides[i];
+    }
+
+    std::shared_ptr<DataType> type;
+    RETURN_NOT_OK(
+        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+    Tensor tensor(type, data, shape, strides);
+    *out = std::make_shared<SparseTensorCOO>(tensor);
+    return Status::OK();
+  }
+}
+
+Status NdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao,
+                                std::shared_ptr<SparseTensorCSR>* out) {
+  if (!PyArray_Check(ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
+
+  // TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
+
+  int ndim = PyArray_NDIM(ndarray);
+
+  // This is also holding the GIL, so don't already draw it.
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
+  std::vector<int64_t> shape(ndim);
+  std::vector<int64_t> strides(ndim);
+
+  {
+    PyAcquireGIL lock;
+    npy_intp* array_strides = PyArray_STRIDES(ndarray);
+    npy_intp* array_shape = PyArray_SHAPE(ndarray);
+    for (int i = 0; i < ndim; ++i) {
+      if (array_strides[i] < 0) {
+        return Status::Invalid("Negative ndarray strides not supported");
+      }
+      shape[i] = array_shape[i];
+      strides[i] = array_strides[i];
+    }
+
+    std::shared_ptr<DataType> type;
+    RETURN_NOT_OK(
+        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+    Tensor tensor(type, data, shape, strides);
+    *out = std::make_shared<SparseTensorCSR>(tensor);
+    return Status::OK();
+  }
+}
+
+Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+                                 const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCOO>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyAcquireGIL lock;
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  PyArrayObject* ndarray_coords = reinterpret_cast<PyArrayObject*>(coords_ao);
+
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<Buffer> coords_buffer = std::make_shared<NumPyBuffer>(coords_ao);
+
+  int coords_ndim = PyArray_NDIM(ndarray_coords);
+
+  std::shared_ptr<DataType> type;
+  npy_intp* coords_array_shape = PyArray_SHAPE(ndarray_coords);
+  std::vector<int64_t> coords_shape(coords_ndim);
+
+  for (int i = 0; i < coords_ndim; ++i) {
+    coords_shape[i] = coords_array_shape[i];
+  }
+
+  std::shared_ptr<NumericTensor<Int64Type>> coords =
+      std::make_shared<NumericTensor<Int64Type>>(coords_buffer, coords_shape);
+
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), &type));
+
+  std::shared_ptr<SparseCOOIndex> sparse_index = std::make_shared<SparseCOOIndex>(coords);
+  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorCSR>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+      !PyArray_Check(indices_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyAcquireGIL lock;
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  PyArrayObject* ndarray_indptr = reinterpret_cast<PyArrayObject*>(indptr_ao);
+  PyArrayObject* ndarray_indices = reinterpret_cast<PyArrayObject*>(indices_ao);
+
+  // This is also holding the GIL, so don't already draw it.
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  std::shared_ptr<Buffer> indptr_buffer = std::make_shared<NumPyBuffer>(indptr_ao);
+  std::shared_ptr<Buffer> indices_buffer = std::make_shared<NumPyBuffer>(indices_ao);
+
+  int indptr_ndim = PyArray_NDIM(ndarray_indptr);
+  int indices_ndim = PyArray_NDIM(ndarray_indices);
+
+  std::shared_ptr<DataType> type;
+  npy_intp* indptr_array_shape = PyArray_SHAPE(ndarray_indptr);
+  npy_intp* indices_array_shape = PyArray_SHAPE(ndarray_indices);
+  std::vector<int64_t> indptr_shape(indptr_ndim);
+  std::vector<int64_t> indices_shape(indices_ndim);
+
+  for (int i = 0; i < indptr_ndim; ++i) {
+    indptr_shape[i] = indptr_array_shape[i];
+  }
+  for (int i = 0; i < indices_ndim; ++i) {
+    indices_shape[i] = indices_array_shape[i];
+  }
+
+  std::shared_ptr<SparseCSRIndex::IndexTensor> indptr =
+      std::make_shared<SparseCSRIndex::IndexTensor>(indptr_buffer, indptr_shape);
+  std::shared_ptr<SparseCSRIndex::IndexTensor> indices =
+      std::make_shared<SparseCSRIndex::IndexTensor>(indices_buffer, indices_shape);
+
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data)), &type));
+
+  std::shared_ptr<SparseCSRIndex> sparse_index =
+      std::make_shared<SparseCSRIndex>(indptr, indices);
+  *out = std::make_shared<SparseTensorImpl<SparseCSRIndex>>(sparse_index, type, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCOO>* out) {
+  {
+    PyAcquireGIL lock;
+    *out = std::make_shared<SparseTensorCOO>(*tensor);
+    return Status::OK();
+  }
+}
+
+Status TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseTensorCSR>* out) {
+  {
+    PyAcquireGIL lock;
+    *out = std::make_shared<SparseTensorCSR>(*tensor);
+    return Status::OK();
+  }
+}
+
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index dce5fe522d6..4eff77bd042 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -25,9 +25,11 @@
 #include <memory>
 #include <string>
+#include <vector>
 
 #include "arrow/buffer.h"
 #include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
 
 namespace arrow {
 
@@ -68,6 +70,38 @@ ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
 ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
                                            PyObject* base, PyObject** out);
 
+ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray(
+    const std::shared_ptr<SparseTensorCSR>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status
+SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+ARROW_PYTHON_EXPORT Status NdarrayToSparseTensorCSR(
+    MemoryPool* pool, PyObject* ao, std::shared_ptr<SparseTensorCSR>* out);
+
+ARROW_PYTHON_EXPORT Status NdarrayToSparseTensorCOO(
+    MemoryPool* pool, PyObject* ao, std::shared_ptr<SparseTensorCOO>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCSR>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO(
+    MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseTensorCOO>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCSR(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCSR>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseTensorCOO(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseTensorCOO>* csparse_tensor);
+
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc
index 1cedc549cfa..e037318bce2 100644
--- a/cpp/src/arrow/python/pyarrow.cc
+++ 
b/cpp/src/arrow/python/pyarrow.cc @@ -123,6 +123,44 @@ PyObject* wrap_tensor(const std::shared_ptr& tensor) { return ::pyarrow_wrap_tensor(tensor); } +bool is_sparse_tensor_csr(PyObject* sparse_tensor) { + return ::pyarrow_is_sparse_tensor_csr(sparse_tensor) != 0; +} + +Status unwrap_sparse_tensor_csr(PyObject* sparse_tensor, + std::shared_ptr* out) { + *out = ::pyarrow_unwrap_sparse_tensor_csr(sparse_tensor); + if (*out) { + return Status::OK(); + } else { + return Status::Invalid( + "Could not unwrap SparseTensorCSR from the passed Python object."); + } +} + +PyObject* wrap_sparse_tensor_csr(const std::shared_ptr& sparse_tensor) { + return ::pyarrow_wrap_sparse_tensor_csr(sparse_tensor); +} + +bool is_sparse_tensor_coo(PyObject* sparse_tensor) { + return ::pyarrow_is_sparse_tensor_coo(sparse_tensor) != 0; +} + +Status unwrap_sparse_tensor_coo(PyObject* sparse_tensor, + std::shared_ptr* out) { + *out = ::pyarrow_unwrap_sparse_tensor_coo(sparse_tensor); + if (*out) { + return Status::OK(); + } else { + return Status::Invalid( + "Could not unwrap SparseTensorCOO from the passed Python object."); + } +} + +PyObject* wrap_sparse_tensor_coo(const std::shared_ptr& sparse_tensor) { + return ::pyarrow_wrap_sparse_tensor_coo(sparse_tensor); +} + bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; } Status unwrap_column(PyObject* column, std::shared_ptr* out) { diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h index ff5bf8f01dd..b4834f79f78 100644 --- a/cpp/src/arrow/python/pyarrow.h +++ b/cpp/src/arrow/python/pyarrow.h @@ -24,6 +24,8 @@ #include "arrow/python/visibility.h" +#include "arrow/sparse_tensor.h" + namespace arrow { class Array; @@ -67,6 +69,18 @@ ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor); ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr* out); ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr& tensor); +ARROW_PYTHON_EXPORT bool is_sparse_tensor_coo(PyObject* sparse_tensor); +ARROW_PYTHON_EXPORT Status +unwrap_sparse_tensor_coo(PyObject* sparse_tensor, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_coo( + const std::shared_ptr& sparse_tensor); + +ARROW_PYTHON_EXPORT bool is_sparse_tensor_csr(PyObject* sparse_tensor); +ARROW_PYTHON_EXPORT Status +unwrap_sparse_tensor_csr(PyObject* sparse_tensor, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_sparse_tensor_csr( + const std::shared_ptr& sparse_tensor); + ARROW_PYTHON_EXPORT bool is_column(PyObject* column); ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr* out); ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr& column); diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h index b76e9614a8a..2d8f71c8c5a 100644 --- a/cpp/src/arrow/python/pyarrow_api.h +++ b/cpp/src/arrow/python/pyarrow_api.h @@ -50,6 +50,10 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr #define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0; #define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr)(std::shared_ptr< arrow::SparseTensorCSR> const &) = 0; +#define pyarrow_wrap_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr +static PyObject 
*(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo)(std::shared_ptr< arrow::SparseTensorCOO> const &) = 0;
+#define pyarrow_wrap_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo
 static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0;
 #define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array
 static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0;
@@ -68,6 +72,10 @@ static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0;
 #define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table
 static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0;
 #define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor
+static std::shared_ptr< arrow::SparseTensorCSR> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr
+static std::shared_ptr< arrow::SparseTensorCOO> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0;
 #define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0;
@@ -84,6 +92,10 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_ptr< arrow::Scalar> const &) = 0;
 #define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0;
 #define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr)(PyObject *) = 0;
+#define pyarrow_is_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo)(PyObject *) = 0;
+#define pyarrow_is_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0;
 #define pyarrow_is_column __pyx_api_f_7pyarrow_3lib_pyarrow_is_column
 static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0;
@@ -167,6 +179,8 @@ static int import_pyarrow__lib(void) {
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad;
   if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr, "PyObject *(std::shared_ptr< arrow::SparseTensorCSR> const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction(module, "pyarrow_wrap_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo, "PyObject *(std::shared_ptr< arrow::SparseTensorCOO> const &)") < 0) 
goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad; @@ -176,6 +190,8 @@ static int import_pyarrow__lib(void) { if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr, "std::shared_ptr< arrow::SparseTensorCSR> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo, "std::shared_ptr< arrow::SparseTensorCOO> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad; @@ -184,6 +200,8 @@ static int import_pyarrow__lib(void) { if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad; diff --git a/cpp/src/arrow/python/pyarrow_lib.h 
b/cpp/src/arrow/python/pyarrow_lib.h index 5f5fc4c6b6f..acf223a64cd 100644 --- a/cpp/src/arrow/python/pyarrow_lib.h +++ b/cpp/src/arrow/python/pyarrow_lib.h @@ -48,6 +48,8 @@ __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &); __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &); __PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_coo(std::shared_ptr< arrow::SparseTensorCOO> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_sparse_tensor_csr(std::shared_ptr< arrow::SparseTensorCSR> const &); __PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *); @@ -57,6 +59,20 @@ __PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unw __PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCOO> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCSR> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr(PyObject *); +__PYX_EXTERN_C int pyarrow_is_buffer(PyObject *); +__PYX_EXTERN_C int pyarrow_is_data_type(PyObject *); +__PYX_EXTERN_C int pyarrow_is_field(PyObject *); +__PYX_EXTERN_C int pyarrow_is_schema(PyObject *); +__PYX_EXTERN_C int pyarrow_is_array(PyObject *); +__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); +__PYX_EXTERN_C int pyarrow_is_tensor(PyObject *); +__PYX_EXTERN_C int pyarrow_is_sparse_tensor_coo(PyObject *); +__PYX_EXTERN_C int pyarrow_is_sparse_tensor_csr(PyObject *); +__PYX_EXTERN_C int pyarrow_is_column(PyObject *); +__PYX_EXTERN_C int pyarrow_is_table(PyObject *); +__PYX_EXTERN_C int pyarrow_is_batch(PyObject *); #endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst index 6b5c9ce1902..f15b1bedbac 100644 --- a/docs/source/python/extending.rst +++ b/docs/source/python/extending.rst @@ -116,6 +116,16 @@ C++ objects. Return whether *obj* wraps an Arrow C++ :class:`Tensor` pointer; in other words, whether *obj* is a :py:class:`pyarrow.Tensor` instance. +.. function:: bool is_sparse_tensor_coo(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`SparseTensorCOO` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.SparseTensorCOO` instance. + +.. function:: bool is_sparse_tensor_csr(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`SparseTensorCSR` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.SparseTensorCSR` instance. + The following functions expect a pyarrow object, unwrap the underlying Arrow C++ API pointer, and put it in the *out* parameter. 
The returned :class:`Status` object must be inspected first to know whether any error @@ -157,6 +167,14 @@ occurred. If successful, *out* is guaranteed to be non-NULL. Unwrap the Arrow C++ :class:`Tensor` pointer from *obj* and put it in *out*. +.. function:: Status unwrap_sparse_tensor_coo(PyObject* obj, std::shared_ptr* out) + + Unwrap the Arrow C++ :class:`SparseTensorCOO` pointer from *obj* and put it in *out*. + +.. function:: Status unwrap_sparse_tensor_csr(PyObject* obj, std::shared_ptr* out) + + Unwrap the Arrow C++ :class:`SparseTensorCSR` pointer from *obj* and put it in *out*. + The following functions take an Arrow C++ API pointer and wrap it in a pyarray object of the corresponding type. A new reference is returned. On error, NULL is returned and a Python exception is set. @@ -197,6 +215,14 @@ On error, NULL is returned and a Python exception is set. Wrap the Arrow C++ *tensor* in a :py:class:`pyarrow.Tensor` instance. +.. function:: PyObject* wrap_sparse_tensor_coo(const std::shared_ptr& sparse_tensor) + + Wrap the Arrow C++ *COO sparse tensor* in a :py:class:`pyarrow.SparseTensorCOO` instance. + +.. function:: PyObject* wrap_sparse_tensor_csr(const std::shared_ptr& sparse_tensor) + + Wrap the Arrow C++ *CSR sparse tensor* in a :py:class:`pyarrow.SparseTensorCSR` instance. + Cython API ---------- @@ -257,6 +283,14 @@ an exception) if the input is not of the right type. Unwrap the Arrow C++ :cpp:class:`Tensor` pointer from *obj*. +.. function:: pyarrow_unwrap_sparse_tensor_coo(obj) -> shared_ptr[CSparseTensorCOO] + + Unwrap the Arrow C++ :cpp:class:`SparseTensorCOO` pointer from *obj*. + +.. function:: pyarrow_unwrap_sparse_tensor_csr(obj) -> shared_ptr[CSparseTensorCSR] + + Unwrap the Arrow C++ :cpp:class:`SparseTensorCSR` pointer from *obj*. + The following functions take a Arrow C++ API pointer and wrap it in a pyarray object of the corresponding type. An exception is raised on error. @@ -300,6 +334,14 @@ pyarray object of the corresponding type. An exception is raised on error. Wrap the Arrow C++ *tensor* in a Python :class:`pyarrow.Tensor` instance. +.. function:: pyarrow_wrap_sparse_tensor_coo(sp_array: const shared_ptr[CSparseTensorCOO]& sparse_tensor) -> object + + Wrap the Arrow C++ *COO sparse tensor* in a Python :class:`pyarrow.SparseTensorCOO` instance. + +.. function:: pyarrow_wrap_sparse_tensor_csr(sp_array: const shared_ptr[CSparseTensorCSR]& sparse_tensor) -> object + + Wrap the Arrow C++ *CSR sparse tensor* in a Python :class:`pyarrow.SparseTensorCSR` instance. 
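+
+As a quick sketch of how these helpers compose in a Cython extension
+(``coo_non_zero_length`` is an illustrative helper, not part of this patch;
+an empty ``shared_ptr`` signals that the input was not of the expected type):
+
+.. code-block:: python
+
+   from pyarrow.lib cimport *
+
+   def coo_non_zero_length(obj):
+       # An empty pointer means obj was not a pyarrow.SparseTensorCOO.
+       cdef shared_ptr[CSparseTensorCOO] sp = \
+           pyarrow_unwrap_sparse_tensor_coo(obj)
+       if sp.get() == NULL:
+           raise TypeError("expected pyarrow.SparseTensorCOO")
+       return sp.get().non_zero_length()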
+ Example ~~~~~~~ diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd index 95cea5ca4fc..cdc918f0385 100644 --- a/python/pyarrow/__init__.pxd +++ b/python/pyarrow/__init__.pxd @@ -20,7 +20,8 @@ from __future__ import absolute_import from libcpp.memory cimport shared_ptr from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType, CField, CRecordBatch, CSchema, - CTable, CTensor) + CTable, CTensor, + CSparseTensorCSR, CSparseTensorCOO) cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py": @@ -31,6 +32,10 @@ cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py": cdef object wrap_schema(const shared_ptr[CSchema]& schema) cdef object wrap_array(const shared_ptr[CArray]& sp_array) cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor) + cdef object wrap_sparse_tensor_coo( + const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor) + cdef object wrap_sparse_tensor_csr( + const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor) cdef object wrap_column(const shared_ptr[CColumn]& ccolumn) cdef object wrap_table(const shared_ptr[CTable]& ctable) cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 487065c2892..bbbd91a9508 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -66,6 +66,7 @@ def parse_git(root, **kwargs): schema, Array, Tensor, array, chunked_array, column, table, + SparseTensorCSR, SparseTensorCOO, infer_type, from_numpy_dtype, NullArray, NumericArray, IntegerArray, FloatingPointArray, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5ae178d8953..20dfae47b91 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -870,6 +870,199 @@ cdef class Array(_PandasConvertible): return res +cdef class SparseTensorCOO: + """ + A sparse COO tensor. 
+ """ + + def __init__(self): + raise TypeError("Do not call SparseTensorCOO's constructor directly, " + "use one of the `pyarrow.SparseTensorCOO.from_*` " + "functions instead.") + + cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor): + self.sp_sparse_tensor = sp_sparse_tensor + self.stp = sp_sparse_tensor.get() + self.type = pyarrow_wrap_data_type(self.stp.type()) + + def __repr__(self): + return """ +type: {0.type} +shape: {0.shape}""".format(self) + + @staticmethod + def from_dense_numpy(obj): + cdef shared_ptr[CSparseTensorCOO] csparse_tensor + with nogil: + check_status(NdarrayToSparseTensorCOO(c_default_memory_pool(), obj, + &csparse_tensor)) + return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) + + @staticmethod + def from_numpy(data, coords, shape, dim_names=None): + cdef shared_ptr[CSparseTensorCOO] csparse_tensor + cdef vector[int64_t] c_shape + cdef vector[c_string] c_dim_names + + for x in shape: + c_shape.push_back(x) + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + with nogil: + check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(), + data, coords, c_shape, c_dim_names, &csparse_tensor)) + return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) + + @staticmethod + def from_tensor(obj): + cdef shared_ptr[CSparseTensorCOO] csparse_tensor + cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) + + with nogil: + check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor)) + + return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) + + def to_numpy(self): + """ + Convert arrow::SparseTensorCOO to numpy.ndarrays with zero copy + """ + cdef PyObject* out_data + cdef PyObject* out_coords + + with nogil: + check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_coords)) + return PyObject_to_object(out_data), PyObject_to_object(out_coords) + + @property + def is_mutable(self): + return self.stp.is_mutable() + + @property + def ndim(self): + return self.stp.ndim() + + @property + def shape(self): + # Cython knows how to convert a vector[T] to a Python list + return tuple(self.stp.shape()) + + @property + def size(self): + return self.stp.size() + + def dim_name(self, i): + return self.stp.dim_name(i) + + @property + def dim_names(self): + return self.stp.dim_names() + + @property + def non_zero_length(self): + return self.stp.non_zero_length() + + +cdef class SparseTensorCSR: + """ + A sparse CSR tensor. 
+ """ + + def __init__(self): + raise TypeError("Do not call SparseTensorCSR's constructor directly, " + "use one of the `pyarrow.SparseTensorCSR.from_*` " + "functions instead.") + + cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor): + self.sp_sparse_tensor = sp_sparse_tensor + self.stp = sp_sparse_tensor.get() + self.type = pyarrow_wrap_data_type(self.stp.type()) + + def __repr__(self): + return """ +type: {0.type} +shape: {0.shape}""".format(self) + + @staticmethod + def from_dense_numpy(obj): + cdef shared_ptr[CSparseTensorCSR] csparse_tensor + with nogil: + check_status(NdarrayToSparseTensorCSR(c_default_memory_pool(), obj, + &csparse_tensor)) + return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) + + @staticmethod + def from_numpy(data, indptr, indices, shape, dim_names=None): + cdef shared_ptr[CSparseTensorCSR] csparse_tensor + cdef vector[int64_t] c_shape + cdef vector[c_string] c_dim_names + + for x in shape: + c_shape.push_back(x) + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + with nogil: + check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(), + data, indptr, indices, c_shape, c_dim_names, + &csparse_tensor)) + return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) + + @staticmethod + def from_tensor(obj): + cdef shared_ptr[CSparseTensorCSR] csparse_tensor + cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) + + with nogil: + check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor)) + + return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) + + def to_numpy(self): + """ + Convert arrow::SparseTensorCSR to numpy.ndarrays with zero copy + """ + cdef PyObject* out_data + cdef PyObject* out_indptr + cdef PyObject* out_indices + + with nogil: + check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_indptr, &out_indices)) + return (PyObject_to_object(out_data), PyObject_to_object(out_indptr), + PyObject_to_object(out_indices)) + + @property + def is_mutable(self): + return self.stp.is_mutable() + + @property + def ndim(self): + return self.stp.ndim() + + @property + def shape(self): + # Cython knows how to convert a vector[T] to a Python list + return tuple(self.stp.shape()) + + @property + def size(self): + return self.stp.size() + + def dim_name(self, i): + return self.stp.dim_name(i) + + @property + def dim_names(self): + return self.stp.dim_names() + + @property + def non_zero_length(self): + return self.stp.non_zero_length() + + cdef class Tensor: """ A n-dimensional array a.k.a Tensor. 
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8798834b5fd..2ba05b20d79 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -600,6 +600,38 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type type_id() c_bool Equals(const CTensor& other) + cdef cppclass CSparseTensorCOO" arrow::SparseTensorCOO": + shared_ptr[CDataType] type() + shared_ptr[CBuffer] data() + + const vector[int64_t]& shape() + int64_t size() + int64_t non_zero_length() + + int ndim() + const vector[c_string]& dim_names() + const c_string& dim_name(int i) + + c_bool is_mutable() + Type type_id() + c_bool Equals(const CSparseTensorCOO& other) + + cdef cppclass CSparseTensorCSR" arrow::SparseTensorCSR": + shared_ptr[CDataType] type() + shared_ptr[CBuffer] data() + + const vector[int64_t]& shape() + int64_t size() + int64_t non_zero_length() + + int ndim() + const vector[c_string]& dim_names() + const c_string& dim_name(int i) + + c_bool is_mutable() + Type type_id() + c_bool Equals(const CSparseTensorCSR& other) + cdef cppclass CScalar" arrow::Scalar": shared_ptr[CDataType] type @@ -1207,6 +1239,38 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base, PyObject** out) + CStatus SparseTensorCOOToNdarray( + const shared_ptr[CSparseTensorCOO]& sparse_tensor, object base, + PyObject** out_data, PyObject** out_coords) + + CStatus SparseTensorCSRToNdarray( + const shared_ptr[CSparseTensorCSR]& sparse_tensor, object base, + PyObject** out_data, PyObject** out_indptr, PyObject** out_indices) + + CStatus NdarraysToSparseTensorCOO(CMemoryPool* pool, object data_ao, + object coords_ao, + const vector[int64_t]& shape, + const vector[c_string]& dim_names, + shared_ptr[CSparseTensorCOO]* out) + + CStatus NdarraysToSparseTensorCSR(CMemoryPool* pool, object data_ao, + object indptr_ao, object indices_ao, + const vector[int64_t]& shape, + const vector[c_string]& dim_names, + shared_ptr[CSparseTensorCSR]* out) + + CStatus NdarrayToSparseTensorCOO(CMemoryPool* pool, object ao, + shared_ptr[CSparseTensorCOO]* out) + + CStatus NdarrayToSparseTensorCSR(CMemoryPool* pool, object ao, + shared_ptr[CSparseTensorCSR]* out) + + CStatus TensorToSparseTensorCOO(shared_ptr[CTensor], + shared_ptr[CSparseTensorCOO]* out) + + CStatus TensorToSparseTensorCSR(shared_ptr[CTensor], + shared_ptr[CSparseTensorCSR]* out) + CStatus ConvertArrayToPandas(const PandasOptions& options, const shared_ptr[CArray]& arr, object py_ref, PyObject** out) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 79ab9478b16..898c70a4bf7 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -231,6 +231,28 @@ cdef class Tensor: cdef void init(self, const shared_ptr[CTensor]& sp_tensor) +cdef class SparseTensorCSR: + cdef: + shared_ptr[CSparseTensorCSR] sp_sparse_tensor + CSparseTensorCSR* stp + + cdef readonly: + DataType type + + cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor) + + +cdef class SparseTensorCOO: + cdef: + shared_ptr[CSparseTensorCOO] sp_sparse_tensor + CSparseTensorCOO* stp + + cdef readonly: + DataType type + + cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor) + + cdef class NullArray(Array): pass @@ -452,6 +474,10 @@ cdef public object pyarrow_wrap_resizable_buffer( cdef public object pyarrow_wrap_schema(const shared_ptr[CSchema]& type) cdef public object pyarrow_wrap_table(const shared_ptr[CTable]& ctable) 
cdef public object pyarrow_wrap_tensor(const shared_ptr[CTensor]& sp_tensor) +cdef public object pyarrow_wrap_sparse_tensor_coo( + const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor) +cdef public object pyarrow_wrap_sparse_tensor_csr( + const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor) cdef public shared_ptr[CArray] pyarrow_unwrap_array(object array) cdef public shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch) @@ -462,3 +488,7 @@ cdef public shared_ptr[CField] pyarrow_unwrap_field(object field) cdef public shared_ptr[CSchema] pyarrow_unwrap_schema(object schema) cdef public shared_ptr[CTable] pyarrow_unwrap_table(object table) cdef public shared_ptr[CTensor] pyarrow_unwrap_tensor(object tensor) +cdef public shared_ptr[CSparseTensorCOO] pyarrow_unwrap_sparse_tensor_coo( + object sparse_tensor) +cdef public shared_ptr[CSparseTensorCSR] pyarrow_unwrap_sparse_tensor_csr( + object sparse_tensor) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 33bc8031804..05c07748f17 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -18,7 +18,8 @@ from libcpp.memory cimport shared_ptr from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, CRecordBatch, CSchema, - CTable, CTensor) + CTable, CTensor, + CSparseTensorCSR, CSparseTensorCOO) # You cannot assign something to a dereferenced pointer in Cython thus these # methods don't use Status to indicate a successful operation. @@ -225,6 +226,7 @@ cdef api object pyarrow_wrap_scalar(const shared_ptr[CScalar]& sp_scalar): scalar.init(sp_scalar) return scalar + cdef api bint pyarrow_is_tensor(object tensor): return isinstance(tensor, Tensor) @@ -248,6 +250,52 @@ cdef api object pyarrow_wrap_tensor( return tensor +cdef api bint pyarrow_is_sparse_tensor_coo(object sparse_tensor): + return isinstance(sparse_tensor, SparseTensorCOO) + +cdef api shared_ptr[CSparseTensorCOO] pyarrow_unwrap_sparse_tensor_coo( + object sparse_tensor): + cdef SparseTensorCOO sten + if pyarrow_is_sparse_tensor_coo(sparse_tensor): + sten = (sparse_tensor) + return sten.sp_sparse_tensor + + return shared_ptr[CSparseTensorCOO]() + +cdef api object pyarrow_wrap_sparse_tensor_coo( + const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor): + if sp_sparse_tensor.get() == NULL: + raise ValueError('SparseTensorCOO was NULL') + + cdef SparseTensorCOO sparse_tensor = SparseTensorCOO.__new__( + SparseTensorCOO) + sparse_tensor.init(sp_sparse_tensor) + return sparse_tensor + + +cdef api bint pyarrow_is_sparse_tensor_csr(object sparse_tensor): + return isinstance(sparse_tensor, SparseTensorCSR) + +cdef api shared_ptr[CSparseTensorCSR] pyarrow_unwrap_sparse_tensor_csr( + object sparse_tensor): + cdef SparseTensorCSR sten + if pyarrow_is_sparse_tensor_csr(sparse_tensor): + sten = (sparse_tensor) + return sten.sp_sparse_tensor + + return shared_ptr[CSparseTensorCSR]() + +cdef api object pyarrow_wrap_sparse_tensor_csr( + const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor): + if sp_sparse_tensor.get() == NULL: + raise ValueError('SparseTensorCSR was NULL') + + cdef SparseTensorCSR sparse_tensor = SparseTensorCSR.__new__( + SparseTensorCSR) + sparse_tensor.init(sp_sparse_tensor) + return sparse_tensor + + cdef api bint pyarrow_is_column(object column): return isinstance(column, Column) diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py new file mode 100644 index 00000000000..4cffb8f8151 --- /dev/null +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -0,0 +1,187 
@@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import numpy as np
+import pyarrow as pa
+
+
+@pytest.mark.parametrize('sparse_tensor_type', [
+    pa.SparseTensorCSR,
+    pa.SparseTensorCOO,
+])
+def test_sparse_tensor_attrs(sparse_tensor_type):
+    data = np.array([
+        [0, 1, 0, 0, 1],
+        [0, 0, 0, 0, 0],
+        [0, 0, 0, 1, 0],
+        [0, 0, 0, 0, 0],
+        [0, 3, 0, 0, 0],
+    ])
+    sparse_tensor = sparse_tensor_type.from_dense_numpy(data)
+
+    assert sparse_tensor.ndim == 2
+    assert sparse_tensor.size == 25
+    assert sparse_tensor.shape == data.shape
+    assert sparse_tensor.is_mutable
+    assert sparse_tensor.dim_name(0) == b''
+    assert sparse_tensor.dim_names == []
+    assert sparse_tensor.non_zero_length == 4
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', [
+    ('i1', pa.int8()),
+    ('i2', pa.int16()),
+    ('i4', pa.int32()),
+    ('i8', pa.int64()),
+    ('u1', pa.uint8()),
+    ('u2', pa.uint16()),
+    ('u4', pa.uint32()),
+    ('u8', pa.uint64()),
+    ('f2', pa.float16()),
+    ('f4', pa.float32()),
+    ('f8', pa.float64())
+])
+def test_sparse_tensor_coo_from_dense(dtype_str, arrow_type):
+    dtype = np.dtype(dtype_str)
+    data_vector = np.array([4, 9, 7, 5]).astype(dtype)
+    coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]])
+    data = np.array([[4, 0, 9, 0],
+                     [0, 7, 0, 0],
+                     [0, 0, 0, 0],
+                     [0, 0, 0, 5]]).astype(dtype)
+    tensor = pa.Tensor.from_numpy(data)
+
+    # Test from numpy array
+    sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(data)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_coords = sparse_tensor.to_numpy()
+    assert (data_vector == result_data).all()
+    assert (result_coords == coords).all()
+
+    # Test from Tensor
+    sparse_tensor = pa.SparseTensorCOO.from_tensor(tensor)
+    repr(sparse_tensor)
+    assert sparse_tensor.type == arrow_type
+    result_data, result_coords = sparse_tensor.to_numpy()
+    assert (data_vector == result_data).all()
+    assert (result_coords == coords).all()
+
+
+@pytest.mark.parametrize('dtype_str,arrow_type', [
+    ('i1', pa.int8()),
+    ('i2', pa.int16()),
+    ('i4', pa.int32()),
+    ('i8', pa.int64()),
+    ('u1', pa.uint8()),
+    ('u2', pa.uint16()),
+    ('u4', pa.uint32()),
+    ('u8', pa.uint64()),
+    ('f2', pa.float16()),
+    ('f4', pa.float32()),
+    ('f8', pa.float64())
+])
+def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type):
+    dtype = np.dtype(dtype_str)
+    dense_data = np.array([[1, 0, 2],
+                           [0, 0, 3],
+                           [4, 5, 6]]).astype(dtype)
+
+    data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype)
+    indptr = np.array([0, 2, 3, 6])
+    indices = np.array([0, 2, 2, 0, 1, 2])
+    tensor = pa.Tensor.from_numpy(dense_data)
+
+    # Test from numpy array
+    sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(dense_data)
+    repr(sparse_tensor)
+    result_data, result_indptr, result_indices = sparse_tensor.to_numpy()
+    assert (data == 
result_data).all() + assert (indptr == result_indptr).all() + assert (indices == result_indices).all() + + # Test from Tensor + sparse_tensor = pa.SparseTensorCSR.from_tensor(tensor) + repr(sparse_tensor) + assert sparse_tensor.type == arrow_type + result_data, result_indptr, result_indices = sparse_tensor.to_numpy() + assert (data == result_data).all() + assert (indptr == result_indptr).all() + assert (indices == result_indices).all() + + +@pytest.mark.skip +@pytest.mark.parametrize('dtype_str,arrow_type', [ + ('i1', pa.int8()), + ('i2', pa.int16()), + ('i4', pa.int32()), + ('i8', pa.int64()), + ('u1', pa.uint8()), + ('u2', pa.uint16()), + ('u4', pa.uint32()), + ('u8', pa.uint64()), + ('f2', pa.float16()), + ('f4', pa.float32()), + ('f8', pa.float64()) +]) +def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type): + dtype = np.dtype(dtype_str) + coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]]).astype('i8') + data = np.array([4, 5, 7, 9]).astype(dtype) + shape = (4, 4) + dim_names = ["x", "y"] + + sparse_tensor = pa.SparseTensorCOO.from_numpy(data, coords, shape, + dim_names) + repr(sparse_tensor) + assert sparse_tensor.type == arrow_type + result_data, result_coords = sparse_tensor.to_numpy() + assert (data == result_data).all() + assert (coords == result_coords).all() + + +@pytest.mark.parametrize('dtype_str,arrow_type', [ + ('i1', pa.int8()), + ('i2', pa.int16()), + ('i4', pa.int32()), + ('i8', pa.int64()), + ('u1', pa.uint8()), + ('u2', pa.uint16()), + ('u4', pa.uint32()), + ('u8', pa.uint64()), + ('f2', pa.float16()), + ('f4', pa.float32()), + ('f8', pa.float64()) +]) +def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type): + dtype = np.dtype(dtype_str) + data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype) + indptr = np.array([0, 2, 3, 6]).astype('i8') + indices = np.array([0, 2, 2, 0, 1, 2]).astype('i8') + shape = (3, 3) + dim_names = ["x", "y"] + + sparse_tensor = pa.SparseTensorCSR.from_numpy(data, indptr, indices, + shape, dim_names) + repr(sparse_tensor) + assert sparse_tensor.type == arrow_type + result_data, result_indptr, result_indices = sparse_tensor.to_numpy() + assert (data == result_data).all() + assert (indptr == result_indptr).all() + assert (indices == result_indices).all() From 4a30487fce5852468ce6595c796e60acd8a6e4e4 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 20 Jun 2019 01:19:00 +0200 Subject: [PATCH 2/8] Set base object in to_numpy methods. 
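
Without a base object, the ndarrays returned by to_numpy() would not
keep the wrapping SparseTensor (and thus the Arrow buffers) alive. A
minimal sketch of the intended behaviour, mirroring the refcount tests
added in this patch:

    import sys
    import numpy as np
    import pyarrow as pa

    st = pa.SparseTensorCOO.from_dense_numpy(np.random.randn(10, 4))
    n = sys.getrefcount(st)
    data, coords = st.to_numpy()  # zero-copy views with st as their base
    assert sys.getrefcount(st) == n + 1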
--- cpp/src/arrow/python/numpy_convert.cc | 25 ++++++++++++++++++---- python/pyarrow/tests/test_sparse_tensor.py | 18 ++++++++++++++-- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 422ff0e644c..75073770851 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -315,17 +315,25 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_t array_flags |= NPY_ARRAY_WRITEABLE; } - PyObject* data = + PyObject* result_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr, mutable_data, array_flags, nullptr); - PyObject* coords = + PyObject* result_coords = PyArray_NewFromDescr(&PyArray_Type, dtype_coords, 2, npy_shape_coords.data(), nullptr, mutable_coords, array_flags, nullptr); RETURN_IF_PYERROR() - *out_data = data; - *out_coords = coords; + if (base == Py_None || base == nullptr) { + base = py::wrap_sparse_tensor_coo(sparse_tensor); + } else { + Py_XINCREF(base); + } + PyArray_SetBaseObject(reinterpret_cast(result_data), base); + PyArray_SetBaseObject(reinterpret_cast(result_coords), base); + + *out_data = result_data; + *out_coords = result_coords; return Status::OK(); } @@ -394,6 +402,15 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr& sparse_t nullptr, mutable_indices, array_flags, nullptr); RETURN_IF_PYERROR() + if (base == Py_None || base == nullptr) { + base = py::wrap_sparse_tensor_csr(sparse_tensor); + } else { + Py_XINCREF(base); + } + PyArray_SetBaseObject(reinterpret_cast(result_data), base); + PyArray_SetBaseObject(reinterpret_cast(result_indptr), base); + PyArray_SetBaseObject(reinterpret_cast(result_indices), base); + *out_data = result_data; *out_indptr = result_indptr; *out_indices = result_indices; diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 4cffb8f8151..4448d7c691a 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -16,6 +16,7 @@ # under the License. 
import pytest +import sys import numpy as np import pyarrow as pa @@ -44,6 +45,20 @@ def test_sparse_tensor_attrs(sparse_tensor_type): assert sparse_tensor.non_zero_length == 4 +def test_sparse_tensor_coo_base_object(): + sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(np.random.randn(10, 4)) + n = sys.getrefcount(sparse_tensor) + data, coords = sparse_tensor.to_numpy() # noqa + assert sys.getrefcount(sparse_tensor) == n + 1 + + +def test_sparse_tensor_csr_base_object(): + sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(np.random.randn(10, 4)) + n = sys.getrefcount(sparse_tensor) + data, indptr, indices = sparse_tensor.to_numpy() # noqa + assert sys.getrefcount(sparse_tensor) == n + 1 + + @pytest.mark.parametrize('dtype_str,arrow_type', [ ('i1', pa.int8()), ('i2', pa.int16()), @@ -126,7 +141,6 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): assert (indices == result_indices).all() -@pytest.mark.skip @pytest.mark.parametrize('dtype_str,arrow_type', [ ('i1', pa.int8()), ('i2', pa.int16()), @@ -142,8 +156,8 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): ]) def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) - coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]]).astype('i8') data = np.array([4, 5, 7, 9]).astype(dtype) + coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]]).astype('i8') shape = (4, 4) dim_names = ["x", "y"] From 3fcc1929e84b627925ea9e301c182395c6fdc0f3 Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 20 Jun 2019 01:19:31 +0200 Subject: [PATCH 3/8] Add equality methods. --- python/pyarrow/array.pxi | 24 +++++++++++++++++++ python/pyarrow/tests/test_sparse_tensor.py | 27 ++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 20dfae47b91..39d0d9ff088 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -936,6 +936,18 @@ shape: {0.shape}""".format(self) &out_data, &out_coords)) return PyObject_to_object(out_data), PyObject_to_object(out_coords) + def equals(self, SparseTensorCOO other): + """ + Return true if sparse tensors contains exactly equal data + """ + return self.stp.Equals(deref(other.stp)) + + def __eq__(self, other): + if isinstance(other, SparseTensorCOO): + return self.equals(other) + else: + return NotImplemented + @property def is_mutable(self): return self.stp.is_mutable() @@ -1034,6 +1046,18 @@ shape: {0.shape}""".format(self) return (PyObject_to_object(out_data), PyObject_to_object(out_indptr), PyObject_to_object(out_indices)) + def equals(self, SparseTensorCSR other): + """ + Return true if sparse tensors contains exactly equal data + """ + return self.stp.Equals(deref(other.stp)) + + def __eq__(self, other): + if isinstance(other, SparseTensorCSR): + return self.equals(other) + else: + return NotImplemented + @property def is_mutable(self): return self.stp.is_mutable() diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 4448d7c691a..3757890d082 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -59,6 +59,32 @@ def test_sparse_tensor_csr_base_object(): assert sys.getrefcount(sparse_tensor) == n + 1 +@pytest.mark.skip +@pytest.mark.parametrize('sparse_tensor_type', [ + pa.SparseTensorCSR, + pa.SparseTensorCOO, +]) +def test_sparse_tensor_equals(sparse_tensor_type): + def eq(a, b): + assert a.equals(b) + assert a == b + assert not (a != b) + + def ne(a, b): + assert not a.equals(b) + 
assert not (a == b) + assert a != b + + data = np.random.randn(10, 6)[::, ::2] + sparse_tensor1 = sparse_tensor_type.from_dense_numpy(data) + sparse_tensor2 = sparse_tensor_type.from_dense_numpy(np.ascontiguousarray(data)) + eq(sparse_tensor1, sparse_tensor2) + data = data.copy() + data[9, 0] = 1.0 + sparse_tensor2 = sparse_tensor_type.from_dense_numpy(np.ascontiguousarray(data)) + ne(sparse_tensor1, sparse_tensor2) + + @pytest.mark.parametrize('dtype_str,arrow_type', [ ('i1', pa.int8()), ('i2', pa.int16()), @@ -141,6 +167,7 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): assert (indices == result_indices).all() +@pytest.mark.skip @pytest.mark.parametrize('dtype_str,arrow_type', [ ('i1', pa.int8()), ('i2', pa.int16()), From e89edc620ef3aadd24bc15bb6205b6f5a51096aa Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 20 Jun 2019 01:57:00 +0200 Subject: [PATCH 4/8] Refactoring to_numpy methods. --- cpp/src/arrow/python/numpy_convert.cc | 24 +++++++++++++++++++--- python/pyarrow/tests/test_sparse_tensor.py | 6 ++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 75073770851..8e8b2d3cace 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -293,8 +293,14 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_t PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords); RETURN_IF_PYERROR(); + const int ndim_coords = sparse_tensor->ndim(); + std::vector npy_shape_coords(ndim_coords); + + for (int i = 0; i < ndim_coords; ++i) { + npy_shape_coords[i] = sparse_index_coords->shape()[i]; + } + std::vector npy_shape_data({sparse_index.non_zero_length(), 1}); - std::vector npy_shape_coords({sparse_index_coords->shape()[0], 2}); const void* immutable_data = nullptr; if (sparse_tensor->data()) { @@ -355,8 +361,20 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr& sparse_t sparse_index.indices(); std::vector npy_shape_data({sparse_index.non_zero_length(), 1}); - std::vector npy_shape_indptr({sparse_index_indptr->shape()[0], 1}); - std::vector npy_shape_indices({sparse_index_indices->shape()[0], 1}); + + const int ndim_indptr = sparse_index_indptr->ndim(); + std::vector npy_shape_indptr(ndim_indptr); + + for (int i = 0; i < ndim_indptr; ++i) { + npy_shape_indptr[i] = sparse_index_indptr->shape()[i]; + } + + const int ndim_indices = sparse_index_indices->ndim(); + std::vector npy_shape_indices(ndim_indices); + + for (int i = 0; i < ndim_indices; ++i) { + npy_shape_indices[i] = sparse_index_indices->shape()[i]; + } const void* immutable_data = nullptr; if (sparse_tensor->data()) { diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 3757890d082..68f0499fe81 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -77,11 +77,13 @@ def ne(a, b): data = np.random.randn(10, 6)[::, ::2] sparse_tensor1 = sparse_tensor_type.from_dense_numpy(data) - sparse_tensor2 = sparse_tensor_type.from_dense_numpy(np.ascontiguousarray(data)) + sparse_tensor2 = sparse_tensor_type.from_dense_numpy( + np.ascontiguousarray(data)) eq(sparse_tensor1, sparse_tensor2) data = data.copy() data[9, 0] = 1.0 - sparse_tensor2 = sparse_tensor_type.from_dense_numpy(np.ascontiguousarray(data)) + sparse_tensor2 = sparse_tensor_type.from_dense_numpy( + np.ascontiguousarray(data)) ne(sparse_tensor1, sparse_tensor2) From 654002afe0ef14217ca7d65efc3dac29ca82570e 
Mon Sep 17 00:00:00 2001 From: Rok Date: Fri, 28 Jun 2019 19:56:44 +0200 Subject: [PATCH 5/8] Partial review feedback implementation. --- cpp/src/arrow/python/numpy_convert.cc | 148 ++++++++++----------- cpp/src/arrow/python/numpy_convert.h | 30 +++-- cpp/src/arrow/python/pyarrow_lib.h | 12 -- python/pyarrow/__init__.pxd | 2 +- python/pyarrow/array.pxi | 92 +++++++++---- python/pyarrow/includes/libarrow.pxd | 10 +- python/pyarrow/tests/test_sparse_tensor.py | 100 ++++++++------ 7 files changed, 220 insertions(+), 174 deletions(-) diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 8e8b2d3cace..b27ed73cd1e 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -281,18 +281,19 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_t PyAcquireGIL lock; int type_num_data; - int type_num_coords; + RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data)); + PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data); + RETURN_IF_PYERROR(); + const auto& sparse_index = arrow::internal::checked_cast( *sparse_tensor->sparse_index()); const std::shared_ptr> sparse_index_coords = sparse_index.indices(); - RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data)); - PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data); - RETURN_NOT_OK(GetNumPyType(*sparse_index_coords->type(), &type_num_coords)); - PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords); RETURN_IF_PYERROR(); + std::vector npy_shape_data({sparse_index.non_zero_length(), 1}); + const int ndim_coords = sparse_tensor->ndim(); std::vector npy_shape_coords(ndim_coords); @@ -300,17 +301,8 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_t npy_shape_coords[i] = sparse_index_coords->shape()[i]; } - std::vector npy_shape_data({sparse_index.non_zero_length(), 1}); - - const void* immutable_data = nullptr; - if (sparse_tensor->data()) { - immutable_data = sparse_tensor->data()->data(); - } - - const void* immutable_coords = nullptr; - if (sparse_index_coords->data()) { - immutable_coords = sparse_index_coords->data()->data(); - } + const void* immutable_data = sparse_tensor->data()->data(); + const void* immutable_coords = sparse_index_coords->data()->data(); // Remove const =( void* mutable_data = const_cast(immutable_data); @@ -321,20 +313,21 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_t array_flags |= NPY_ARRAY_WRITEABLE; } + int type_num_coords; + RETURN_NOT_OK(GetNumPyType(*sparse_index_coords->type(), &type_num_coords)); + PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords); + PyObject* result_data = - PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr, + PyArray_NewFromDescr(&PyArray_Type, dtype_data, 2, npy_shape_data.data(), nullptr, mutable_data, array_flags, nullptr); - PyObject* result_coords = - PyArray_NewFromDescr(&PyArray_Type, dtype_coords, 2, npy_shape_coords.data(), - nullptr, mutable_coords, array_flags, nullptr); - + PyObject* result_coords = PyArray_NewFromDescr(&PyArray_Type, dtype_coords, ndim_coords, + npy_shape_coords.data(), nullptr, + mutable_coords, array_flags, nullptr); RETURN_IF_PYERROR() - if (base == Py_None || base == nullptr) { - base = py::wrap_sparse_tensor_coo(sparse_tensor); - } else { - Py_XINCREF(base); - } + Py_XINCREF(base); + Py_XINCREF(base); + PyArray_SetBaseObject(reinterpret_cast(result_data), base); PyArray_SetBaseObject(reinterpret_cast(result_coords), 
base); @@ -376,20 +369,9 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr& sparse_t npy_shape_indices[i] = sparse_index_indices->shape()[i]; } - const void* immutable_data = nullptr; - if (sparse_tensor->data()) { - immutable_data = sparse_tensor->data()->data(); - } - - const void* immutable_indptr = nullptr; - if (sparse_index_indptr->data()) { - immutable_indptr = sparse_index_indptr->data()->data(); - } - - const void* immutable_indices = nullptr; - if (sparse_index_indices->data()) { - immutable_indices = sparse_index_indices->data()->data(); - } + const void* immutable_data = sparse_tensor->data()->data(); + const void* immutable_indptr = sparse_index_indptr->data()->data(); + const void* immutable_indices = sparse_index_indices->data()->data(); // Remove const =( void* mutable_data = const_cast(immutable_data); @@ -410,21 +392,20 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr& sparse_t PyArray_Descr* dtype_indices = PyArray_DescrNewFromType(type_num_indices); PyObject* result_data = - PyArray_NewFromDescr(&PyArray_Type, dtype_data, 1, npy_shape_data.data(), nullptr, + PyArray_NewFromDescr(&PyArray_Type, dtype_data, 2, npy_shape_data.data(), nullptr, mutable_data, array_flags, nullptr); - PyObject* result_indptr = - PyArray_NewFromDescr(&PyArray_Type, dtype_indptr, 1, npy_shape_indptr.data(), - nullptr, mutable_indptr, array_flags, nullptr); - PyObject* result_indices = - PyArray_NewFromDescr(&PyArray_Type, dtype_indices, 1, npy_shape_indices.data(), - nullptr, mutable_indices, array_flags, nullptr); + PyObject* result_indptr = PyArray_NewFromDescr(&PyArray_Type, dtype_indptr, ndim_indptr, + npy_shape_indptr.data(), nullptr, + mutable_indptr, array_flags, nullptr); + PyObject* result_indices = PyArray_NewFromDescr( + &PyArray_Type, dtype_indices, ndim_indices, npy_shape_indices.data(), nullptr, + mutable_indices, array_flags, nullptr); RETURN_IF_PYERROR() - if (base == Py_None || base == nullptr) { - base = py::wrap_sparse_tensor_csr(sparse_tensor); - } else { - Py_XINCREF(base); - } + Py_XINCREF(base); + Py_XINCREF(base); + Py_XINCREF(base); + PyArray_SetBaseObject(reinterpret_cast(result_data), base); PyArray_SetBaseObject(reinterpret_cast(result_indptr), base); PyArray_SetBaseObject(reinterpret_cast(result_indices), base); @@ -435,19 +416,17 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr& sparse_t return Status::OK(); } -Status NdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao, - std::shared_ptr* out) { +Status DenseNdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao, + const std::vector& dim_names, + std::shared_ptr* out) { if (!PyArray_Check(ao)) { return Status::TypeError("Did not pass ndarray object"); } PyArrayObject* ndarray = reinterpret_cast(ao); - // TODO(wesm): What do we want to do with non-contiguous memory and negative strides? - int ndim = PyArray_NDIM(ndarray); - // This is also holding the GIL, so don't already draw it. 
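+  // NumPyBuffer wraps the ndarray's memory zero-copy and holds a Python
+  // reference to the array, so the buffer stays valid for the lifetime
+  // of the sparse tensor built from it.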
std::shared_ptr data = std::make_shared(ao); std::vector shape(ndim); std::vector strides(ndim); @@ -467,25 +446,23 @@ Status NdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao, std::shared_ptr type; RETURN_NOT_OK( GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray)), &type)); - Tensor tensor(type, data, shape, strides); + Tensor tensor(type, data, shape, strides, dim_names); *out = std::make_shared(tensor); return Status::OK(); } } -Status NdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao, - std::shared_ptr* out) { +Status DenseNdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao, + const std::vector& dim_names, + std::shared_ptr* out) { if (!PyArray_Check(ao)) { return Status::TypeError("Did not pass ndarray object"); } PyArrayObject* ndarray = reinterpret_cast(ao); - // TODO(wesm): What do we want to do with non-contiguous memory and negative strides? - int ndim = PyArray_NDIM(ndarray); - // This is also holding the GIL, so don't already draw it. std::shared_ptr data = std::make_shared(ao); std::vector shape(ndim); std::vector strides(ndim); @@ -505,7 +482,7 @@ Status NdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao, std::shared_ptr type; RETURN_NOT_OK( GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray)), &type)); - Tensor tensor(type, data, shape, strides); + Tensor tensor(type, data, shape, strides, dim_names); *out = std::make_shared(tensor); return Status::OK(); } @@ -529,22 +506,34 @@ Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* int coords_ndim = PyArray_NDIM(ndarray_coords); - std::shared_ptr type; + std::shared_ptr type_data; + std::shared_ptr type_coords; + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), + &type_data)); + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_coords)), + &type_coords)); + ARROW_CHECK_EQ(type_coords->id(), Type::INT64); + + const int64_t i64_size = sizeof(int64_t); npy_intp* coords_array_shape = PyArray_SHAPE(ndarray_coords); std::vector coords_shape(coords_ndim); + std::vector coords_strides(coords_ndim); for (int i = 0; i < coords_ndim; ++i) { coords_shape[i] = coords_array_shape[i]; + if (i == 0) { + coords_strides[i] = i64_size; + } else { + coords_strides[i] = coords_strides[i - 1] * coords_array_shape[i - 1]; + } } std::shared_ptr> coords = - std::make_shared>(coords_buffer, coords_shape); - - RETURN_NOT_OK( - GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), &type)); + std::make_shared>(coords_buffer, coords_shape, + coords_strides); std::shared_ptr sparse_index = std::make_shared(coords); - *out = std::make_shared>(sparse_index, type, data, + *out = std::make_shared>(sparse_index, type_data, data, shape, dim_names); return Status::OK(); } @@ -564,7 +553,6 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* PyArrayObject* ndarray_indptr = reinterpret_cast(indptr_ao); PyArrayObject* ndarray_indices = reinterpret_cast(indices_ao); - // This is also holding the GIL, so don't already draw it. 
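+  // As in the COO path, the data, indptr and indices ndarrays are wrapped
+  // zero-copy; no buffer contents are copied while assembling the tensor.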
std::shared_ptr data = std::make_shared(data_ao); std::shared_ptr indptr_buffer = std::make_shared(indptr_ao); std::shared_ptr indices_buffer = std::make_shared(indices_ao); @@ -572,7 +560,9 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* int indptr_ndim = PyArray_NDIM(ndarray_indptr); int indices_ndim = PyArray_NDIM(ndarray_indices); - std::shared_ptr type; + std::shared_ptr type_data; + std::shared_ptr type_indptr; + std::shared_ptr type_indices; npy_intp* indptr_array_shape = PyArray_SHAPE(ndarray_indptr); npy_intp* indices_array_shape = PyArray_SHAPE(ndarray_indices); std::vector indptr_shape(indptr_ndim); @@ -585,17 +575,23 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* indices_shape[i] = indices_array_shape[i]; } + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), + &type_data)); + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_indptr)), + &type_indptr)); + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_indices)), + &type_indices)); + ARROW_CHECK_EQ(type_indptr->id(), Type::INT64); + ARROW_CHECK_EQ(type_indices->id(), Type::INT64); + std::shared_ptr indptr = std::make_shared(indptr_buffer, indptr_shape); std::shared_ptr indices = std::make_shared(indices_buffer, indices_shape); - RETURN_NOT_OK( - GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), &type)); - std::shared_ptr sparse_index = std::make_shared(indptr, indices); - *out = std::make_shared>(sparse_index, type, data, + *out = std::make_shared>(sparse_index, type_data, data, shape, dim_names); return Status::OK(); } diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h index 4eff77bd042..aa877399461 100644 --- a/cpp/src/arrow/python/numpy_convert.h +++ b/cpp/src/arrow/python/numpy_convert.h @@ -70,23 +70,20 @@ ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr& tensor, PyObject* base, PyObject** out); -ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray( - const std::shared_ptr& sparse_tensor, PyObject* base, - PyObject** out_data, PyObject** out_indptr, PyObject** out_indices); - ARROW_PYTHON_EXPORT Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_tensor, PyObject* base, PyObject** out_data, PyObject** out_coords); -ARROW_PYTHON_EXPORT Status NdarrayToSparseTensorCSR( - MemoryPool* pool, PyObject* ao, std::shared_ptr* out); +ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray( + const std::shared_ptr& sparse_tensor, PyObject* base, + PyObject** out_data, PyObject** out_indptr, PyObject** out_indices); -ARROW_PYTHON_EXPORT Status NdarrayToSparseTensorCOO( - MemoryPool* pool, PyObject* ao, std::shared_ptr* out); +ARROW_PYTHON_EXPORT Status DenseNdarrayToSparseTensorCOO( + MemoryPool* pool, PyObject* ao, const std::vector& dim_names, + std::shared_ptr* out); -ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR( - MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao, - const std::vector& shape, const std::vector& dim_names, +ARROW_PYTHON_EXPORT Status DenseNdarrayToSparseTensorCSR( + MemoryPool* pool, PyObject* ao, const std::vector& dim_names, std::shared_ptr* out); ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO( @@ -94,14 +91,19 @@ ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO( const std::vector& shape, const std::vector& dim_names, std::shared_ptr* out); -ARROW_PYTHON_EXPORT Status 
-TensorToSparseTensorCSR(const std::shared_ptr& tensor, - std::shared_ptr* csparse_tensor); +ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCSR( + MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao, + const std::vector& shape, const std::vector& dim_names, + std::shared_ptr* out); ARROW_PYTHON_EXPORT Status TensorToSparseTensorCOO(const std::shared_ptr& tensor, std::shared_ptr* csparse_tensor); +ARROW_PYTHON_EXPORT Status +TensorToSparseTensorCSR(const std::shared_ptr& tensor, + std::shared_ptr* csparse_tensor); + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pyarrow_lib.h b/cpp/src/arrow/python/pyarrow_lib.h index acf223a64cd..a4bc1039ee8 100644 --- a/cpp/src/arrow/python/pyarrow_lib.h +++ b/cpp/src/arrow/python/pyarrow_lib.h @@ -61,18 +61,6 @@ __PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unw __PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCOO> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_coo(PyObject *); __PYX_EXTERN_C std::shared_ptr< arrow::SparseTensorCSR> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_sparse_tensor_csr(PyObject *); -__PYX_EXTERN_C int pyarrow_is_buffer(PyObject *); -__PYX_EXTERN_C int pyarrow_is_data_type(PyObject *); -__PYX_EXTERN_C int pyarrow_is_field(PyObject *); -__PYX_EXTERN_C int pyarrow_is_schema(PyObject *); -__PYX_EXTERN_C int pyarrow_is_array(PyObject *); -__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); -__PYX_EXTERN_C int pyarrow_is_tensor(PyObject *); -__PYX_EXTERN_C int pyarrow_is_sparse_tensor_coo(PyObject *); -__PYX_EXTERN_C int pyarrow_is_sparse_tensor_csr(PyObject *); -__PYX_EXTERN_C int pyarrow_is_column(PyObject *); -__PYX_EXTERN_C int pyarrow_is_table(PyObject *); -__PYX_EXTERN_C int pyarrow_is_batch(PyObject *); #endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd index cdc918f0385..432880556cc 100644 --- a/python/pyarrow/__init__.pxd +++ b/python/pyarrow/__init__.pxd @@ -22,7 +22,7 @@ from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType, CField, CRecordBatch, CSchema, CTable, CTensor, CSparseTensorCSR, CSparseTensorCOO) - +from pyarrow.compat import frombytes cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py": cdef int import_pyarrow() except -1 diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 39d0d9ff088..bcb0dfac1bf 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -891,15 +891,27 @@ type: {0.type} shape: {0.shape}""".format(self) @staticmethod - def from_dense_numpy(obj): + def from_dense_numpy(obj, dim_names=None): + """ + Convert numpy.ndarray to arrow::SparseTensorCOO + """ cdef shared_ptr[CSparseTensorCOO] csparse_tensor - with nogil: - check_status(NdarrayToSparseTensorCOO(c_default_memory_pool(), obj, - &csparse_tensor)) + cdef vector[c_string] c_dim_names + + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + check_status(DenseNdarrayToSparseTensorCOO(c_default_memory_pool(), + obj, c_dim_names, + &csparse_tensor)) return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) @staticmethod def from_numpy(data, coords, shape, dim_names=None): + """ + Create arrow::SparseTensorCOO from numpy.ndarrays + """ cdef shared_ptr[CSparseTensorCOO] csparse_tensor cdef vector[int64_t] c_shape cdef vector[c_string] c_dim_names @@ -909,18 
+921,23 @@ shape: {0.shape}""".format(self) if dim_names is not None: for x in dim_names: c_dim_names.push_back(tobytes(x)) - with nogil: - check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(), - data, coords, c_shape, c_dim_names, &csparse_tensor)) + + if coords.dtype != 'i8': + coords = coords.astype('i8') + + check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(), + data, coords, c_shape, c_dim_names, &csparse_tensor)) return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) @staticmethod def from_tensor(obj): + """ + Convert arrow::Tensor to arrow::SparseTensorCOO + """ cdef shared_ptr[CSparseTensorCOO] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) - with nogil: - check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor)) + check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor)) return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) @@ -931,9 +948,8 @@ shape: {0.shape}""".format(self) cdef PyObject* out_data cdef PyObject* out_coords - with nogil: - check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self, - &out_data, &out_coords)) + check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_coords)) return PyObject_to_object(out_data), PyObject_to_object(out_coords) def equals(self, SparseTensorCOO other): @@ -966,11 +982,11 @@ shape: {0.shape}""".format(self) return self.stp.size() def dim_name(self, i): - return self.stp.dim_name(i) + return frombytes(self.stp.dim_name(i)) @property def dim_names(self): - return self.stp.dim_names() + return [frombytes(x) for x in tuple(self.stp.dim_names())] @property def non_zero_length(self): @@ -998,15 +1014,27 @@ type: {0.type} shape: {0.shape}""".format(self) @staticmethod - def from_dense_numpy(obj): + def from_dense_numpy(obj, dim_names=None): + """ + Convert numpy.ndarray to arrow::SparseTensorCSR + """ cdef shared_ptr[CSparseTensorCSR] csparse_tensor - with nogil: - check_status(NdarrayToSparseTensorCSR(c_default_memory_pool(), obj, - &csparse_tensor)) + cdef vector[c_string] c_dim_names + + if dim_names is not None: + for x in dim_names: + c_dim_names.push_back(tobytes(x)) + + check_status(DenseNdarrayToSparseTensorCSR(c_default_memory_pool(), + obj, c_dim_names, + &csparse_tensor)) return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) @staticmethod def from_numpy(data, indptr, indices, shape, dim_names=None): + """ + Create arrow::SparseTensorCSR from numpy.ndarrays + """ cdef shared_ptr[CSparseTensorCSR] csparse_tensor cdef vector[int64_t] c_shape cdef vector[c_string] c_dim_names @@ -1016,19 +1044,26 @@ shape: {0.shape}""".format(self) if dim_names is not None: for x in dim_names: c_dim_names.push_back(tobytes(x)) - with nogil: - check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(), - data, indptr, indices, c_shape, c_dim_names, - &csparse_tensor)) + + if indptr.dtype != 'i8': + indptr = indptr.astype('i8') + if indices.dtype != 'i8': + indices = indices.astype('i8') + + check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(), + data, indptr, indices, c_shape, c_dim_names, + &csparse_tensor)) return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) @staticmethod def from_tensor(obj): + """ + Convert arrow::Tensor to arrow::SparseTensorCSR + """ cdef shared_ptr[CSparseTensorCSR] csparse_tensor cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) - with nogil: - check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor)) + check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor)) return 
pyarrow_wrap_sparse_tensor_csr(csparse_tensor) @@ -1040,9 +1075,8 @@ shape: {0.shape}""".format(self) cdef PyObject* out_indptr cdef PyObject* out_indices - with nogil: - check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self, - &out_data, &out_indptr, &out_indices)) + check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self, + &out_data, &out_indptr, &out_indices)) return (PyObject_to_object(out_data), PyObject_to_object(out_indptr), PyObject_to_object(out_indices)) @@ -1076,11 +1110,11 @@ shape: {0.shape}""".format(self) return self.stp.size() def dim_name(self, i): - return self.stp.dim_name(i) + return frombytes(self.stp.dim_name(i)) @property def dim_names(self): - return self.stp.dim_names() + return [frombytes(x) for x in tuple(self.stp.dim_names())] @property def non_zero_length(self): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2ba05b20d79..e0fc502f468 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1259,11 +1259,13 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: const vector[c_string]& dim_names, shared_ptr[CSparseTensorCSR]* out) - CStatus NdarrayToSparseTensorCOO(CMemoryPool* pool, object ao, - shared_ptr[CSparseTensorCOO]* out) + CStatus DenseNdarrayToSparseTensorCOO(CMemoryPool* pool, object ao, + const vector[c_string]& dim_names, + shared_ptr[CSparseTensorCOO]* out) - CStatus NdarrayToSparseTensorCSR(CMemoryPool* pool, object ao, - shared_ptr[CSparseTensorCSR]* out) + CStatus DenseNdarrayToSparseTensorCSR(CMemoryPool* pool, object ao, + const vector[c_string]& dim_names, + shared_ptr[CSparseTensorCSR]* out) CStatus TensorToSparseTensorCOO(shared_ptr[CTensor], shared_ptr[CSparseTensorCOO]* out) diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 68f0499fe81..64efc878c02 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -34,29 +34,52 @@ def test_sparse_tensor_attrs(sparse_tensor_type): [0, 0, 0, 0, 0], [0, 3, 0, 0, 0], ]) - sparse_tensor = sparse_tensor_type.from_dense_numpy(data) + dim_names = ['x', 'y'] + sparse_tensor = sparse_tensor_type.from_dense_numpy(data, dim_names) assert sparse_tensor.ndim == 2 assert sparse_tensor.size == 25 assert sparse_tensor.shape == data.shape assert sparse_tensor.is_mutable - assert sparse_tensor.dim_name(0) == b'' - assert sparse_tensor.dim_names == [] + assert sparse_tensor.dim_name(0) == dim_names[0] + assert sparse_tensor.dim_names == dim_names assert sparse_tensor.non_zero_length == 4 def test_sparse_tensor_coo_base_object(): - sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(np.random.randn(10, 4)) + data = np.array([[4], [9], [7], [5]]) + coords = np.array([[0, 0], [1, 3], [0, 2], [1, 3]]) + array = np.array([[4, 0, 9, 0], + [0, 7, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 5]]) + sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(array) n = sys.getrefcount(sparse_tensor) - data, coords = sparse_tensor.to_numpy() # noqa - assert sys.getrefcount(sparse_tensor) == n + 1 + result_data, result_coords = sparse_tensor.to_numpy() + assert sys.getrefcount(sparse_tensor) == n + 2 + + sparse_tensor = None + assert np.array_equal(data, result_data) + assert np.array_equal(coords, result_coords) def test_sparse_tensor_csr_base_object(): - sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(np.random.randn(10, 4)) + data = np.array([[1], [2], [3], [4], [5], [6]]) + indptr = np.array([0, 2, 3, 6]) + 
indices = np.array([0, 2, 2, 0, 1, 2]) + array = np.array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + + sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(array) n = sys.getrefcount(sparse_tensor) - data, indptr, indices = sparse_tensor.to_numpy() # noqa - assert sys.getrefcount(sparse_tensor) == n + 1 + result_data, result_indptr, result_indices = sparse_tensor.to_numpy() + assert sys.getrefcount(sparse_tensor) == n + 3 + + sparse_tensor = None + assert np.array_equal(data, result_data) + assert np.array_equal(indptr, result_indptr) + assert np.array_equal(indices, result_indices) @pytest.mark.skip @@ -102,29 +125,29 @@ def ne(a, b): ]) def test_sparse_tensor_coo_from_dense(dtype_str, arrow_type): dtype = np.dtype(dtype_str) - data_vector = np.array([4, 9, 7, 5]).astype(dtype) + data = np.array([[4], [9], [7], [5]]).astype(dtype) coords = np.array([[0, 0], [1, 3], [0, 2], [1, 3]]) - data = np.array([[4, 0, 9, 0], - [0, 7, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 5]]).astype(dtype) - tensor = pa.Tensor.from_numpy(data) + array = np.array([[4, 0, 9, 0], + [0, 7, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 5]]).astype(dtype) + tensor = pa.Tensor.from_numpy(array) # Test from numpy array - sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(data) + sparse_tensor = pa.SparseTensorCOO.from_dense_numpy(array) repr(sparse_tensor) assert sparse_tensor.type == arrow_type result_data, result_coords = sparse_tensor.to_numpy() - assert (data_vector == result_data).all() - assert (result_coords == coords).all() + assert np.array_equal(data, result_data) + assert np.array_equal(coords, result_coords) # Test from Tensor sparse_tensor = pa.SparseTensorCOO.from_tensor(tensor) repr(sparse_tensor) assert sparse_tensor.type == arrow_type result_data, result_coords = sparse_tensor.to_numpy() - assert (data_vector == result_data).all() - assert (result_coords == coords).all() + assert np.array_equal(data, result_data) + assert np.array_equal(coords, result_coords) @pytest.mark.parametrize('dtype_str,arrow_type', [ @@ -146,7 +169,7 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): [0, 0, 3], [4, 5, 6]]).astype(dtype) - data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype) + data = np.array([[1], [2], [3], [4], [5], [6]]) indptr = np.array([0, 2, 3, 6]) indices = np.array([0, 2, 2, 0, 1, 2]) tensor = pa.Tensor.from_numpy(dense_data) @@ -155,21 +178,20 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): sparse_tensor = pa.SparseTensorCSR.from_dense_numpy(dense_data) repr(sparse_tensor) result_data, result_indptr, result_indices = sparse_tensor.to_numpy() - assert (data == result_data).all() - assert (indptr == result_indptr).all() - assert (indices == result_indices).all() + assert np.array_equal(data, result_data) + assert np.array_equal(indptr, result_indptr) + assert np.array_equal(indices, result_indices) # Test from Tensor sparse_tensor = pa.SparseTensorCSR.from_tensor(tensor) repr(sparse_tensor) assert sparse_tensor.type == arrow_type result_data, result_indptr, result_indices = sparse_tensor.to_numpy() - assert (data == result_data).all() - assert (indptr == result_indptr).all() - assert (indices == result_indices).all() + assert np.array_equal(data, result_data) + assert np.array_equal(indptr, result_indptr) + assert np.array_equal(indices, result_indices) -@pytest.mark.skip @pytest.mark.parametrize('dtype_str,arrow_type', [ ('i1', pa.int8()), ('i2', pa.int16()), @@ -185,8 +207,8 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): ]) def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, 
arrow_type): dtype = np.dtype(dtype_str) - data = np.array([4, 5, 7, 9]).astype(dtype) - coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]]).astype('i8') + data = np.array([[4], [9], [7], [5]]).astype(dtype) + coords = np.array([[0, 0], [3, 3], [1, 1], [0, 2]]) shape = (4, 4) dim_names = ["x", "y"] @@ -195,8 +217,9 @@ def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type): repr(sparse_tensor) assert sparse_tensor.type == arrow_type result_data, result_coords = sparse_tensor.to_numpy() - assert (data == result_data).all() - assert (coords == result_coords).all() + assert np.array_equal(data, result_data) + assert np.array_equal(coords, result_coords) + assert sparse_tensor.dim_names == dim_names @pytest.mark.parametrize('dtype_str,arrow_type', [ @@ -214,9 +237,9 @@ def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type): ]) def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) - data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype) - indptr = np.array([0, 2, 3, 6]).astype('i8') - indices = np.array([0, 2, 2, 0, 1, 2]).astype('i8') + data = np.array([[1], [2], [3], [4], [5], [6]]).astype(dtype) + indptr = np.array([0, 2, 3, 6]) + indices = np.array([0, 2, 2, 0, 1, 2]) shape = (3, 3) dim_names = ["x", "y"] @@ -225,6 +248,7 @@ def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type): repr(sparse_tensor) assert sparse_tensor.type == arrow_type result_data, result_indptr, result_indices = sparse_tensor.to_numpy() - assert (data == result_data).all() - assert (indptr == result_indptr).all() - assert (indices == result_indices).all() + assert np.array_equal(data, result_data) + assert np.array_equal(indptr, result_indptr) + assert np.array_equal(indices, result_indices) + assert sparse_tensor.dim_names == dim_names From c31b8eb32930fd34fca5830c246b8cd5ad086848 Mon Sep 17 00:00:00 2001 From: Rok Date: Sun, 30 Jun 2019 03:14:38 +0200 Subject: [PATCH 6/8] Enabling SparseTensor.Equals checks. 
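
SparseTensorEqualsImpl returned the raw memcmp() result: memcmp() yields
0 exactly when the two buffers are identical, so the final byte
comparison returned false for equal tensors and true for differing ones.
The fix compares the result against 0 before returning it. A minimal
sketch of the pitfall, outside any Arrow API (illustrative only, not
part of the diff below):

    #include <cstdint>
    #include <cstring>

    // Returns true when the two buffers hold identical bytes.
    bool BuffersEqual(const uint8_t* a, const uint8_t* b, std::size_t n) {
      // memcmp() returns 0 on equality, so test against 0 explicitly;
      // returning the raw value would invert the check.
      return std::memcmp(a, b, n) == 0;
    }

New TensorEquality tests cover the SparseCOOTensor and SparseCSRMatrix
paths, and the previously skipped Python-side equality test is
re-enabled.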
--- cpp/src/arrow/compare.cc | 3 +- cpp/src/arrow/sparse_tensor-test.cc | 39 ++++++++++++++++++++++ python/pyarrow/tests/test_sparse_tensor.py | 1 - 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 12991b94aeb..4ae5d897917 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1026,9 +1026,8 @@ struct SparseTensorEqualsImpl { const uint8_t* left_data = left.data()->data(); const uint8_t* right_data = right.data()->data(); - return memcmp(left_data, right_data, - static_cast(byte_width * left.non_zero_length())); + static_cast(byte_width * left.non_zero_length())) == 0; } }; diff --git a/cpp/src/arrow/sparse_tensor-test.cc b/cpp/src/arrow/sparse_tensor-test.cc index daff0194fe5..69ec4ca5c60 100644 --- a/cpp/src/arrow/sparse_tensor-test.cc +++ b/cpp/src/arrow/sparse_tensor-test.cc @@ -182,6 +182,25 @@ TEST(TestSparseCOOTensor, CreationFromNonContiguousTensor) { AssertCOOIndex(sidx, 11, {1, 2, 3}); } +TEST(TestSparseCOOTensor, TensorEquality) { + std::vector shape = {2, 3, 4}; + std::vector values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::vector values2 = {0, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::shared_ptr buffer1 = Buffer::Wrap(values1); + std::shared_ptr buffer2 = Buffer::Wrap(values2); + NumericTensor tensor1(buffer1, shape); + NumericTensor tensor2(buffer1, shape); + NumericTensor tensor3(buffer2, shape); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + SparseTensorImpl st3(tensor3); + + ASSERT_TRUE(st1.Equals(st2)); + ASSERT_TRUE(!st1.Equals(st3)); +} + TEST(TestSparseCSRMatrix, CreationFromNumericTensor2D) { std::vector shape = {6, 4}; std::vector values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, @@ -269,4 +288,24 @@ TEST(TestSparseCSRMatrix, CreationFromNonContiguousTensor) { ASSERT_EQ(std::vector({0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}), indices_values); } +TEST(TestSparseCSRMatrix, TensorEquality) { + std::vector shape = {6, 4}; + std::vector values1 = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, + 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16}; + std::vector values2 = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + std::shared_ptr buffer1 = Buffer::Wrap(values1); + std::shared_ptr buffer2 = Buffer::Wrap(values2); + NumericTensor tensor1(buffer1, shape); + NumericTensor tensor2(buffer1, shape); + NumericTensor tensor3(buffer2, shape); + SparseTensorImpl st1(tensor1); + SparseTensorImpl st2(tensor2); + SparseTensorImpl st3(tensor3); + + ASSERT_TRUE(st1.Equals(st2)); + ASSERT_TRUE(!st1.Equals(st3)); +} + } // namespace arrow diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 64efc878c02..09d6b852692 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -82,7 +82,6 @@ def test_sparse_tensor_csr_base_object(): assert np.array_equal(indices, result_indices) -@pytest.mark.skip @pytest.mark.parametrize('sparse_tensor_type', [ pa.SparseTensorCSR, pa.SparseTensorCOO, From 9e0363afef5199ffa33666782b3131f267936716 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 1 Jul 2019 11:39:13 +0200 Subject: [PATCH 7/8] Polish code --- cpp/src/arrow/python/numpy_convert.cc | 361 +++++--------------- cpp/src/arrow/python/numpy_convert.h | 9 +- cpp/src/arrow/python/serialize.cc | 2 +- python/pyarrow/array.pxi | 349 -------------------- python/pyarrow/includes/libarrow.pxd | 
10 +-
 python/pyarrow/lib.pyx                     |   3 +
 python/pyarrow/tensor.pxi                  | 367 +++++++++++++++++++++
 python/pyarrow/tests/test_sparse_tensor.py |  76 ++---
 python/pyarrow/tests/test_tensor.py        |  46 +--
 9 files changed, 497 insertions(+), 726 deletions(-)
 create mode 100644 python/pyarrow/tensor.pxi

diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index b27ed73cd1e..209cea5faa9 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -28,6 +28,7 @@
 #include "arrow/sparse_tensor.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/util/logging.h"
 
 #include "arrow/python/common.h"
 #include "arrow/python/pyarrow.h"
@@ -187,7 +188,9 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out)
 
 #undef TO_ARROW_TYPE_CASE
 
-Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>* out) {
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
   if (!PyArray_Check(ao)) {
     return Status::TypeError("Did not pass ndarray object");
   }
@@ -198,35 +201,29 @@ Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, std::shared_ptr<Tensor>*
 
   int ndim = PyArray_NDIM(ndarray);
 
-  // This is also holding the GIL, so don't already draw it.
   std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
   std::vector<int64_t> shape(ndim);
   std::vector<int64_t> strides(ndim);
 
-  {
-    PyAcquireGIL lock;
-    npy_intp* array_strides = PyArray_STRIDES(ndarray);
-    npy_intp* array_shape = PyArray_SHAPE(ndarray);
-    for (int i = 0; i < ndim; ++i) {
-      if (array_strides[i] < 0) {
-        return Status::Invalid("Negative ndarray strides not supported");
-      }
-      shape[i] = array_shape[i];
-      strides[i] = array_strides[i];
+  npy_intp* array_strides = PyArray_STRIDES(ndarray);
+  npy_intp* array_shape = PyArray_SHAPE(ndarray);
+  for (int i = 0; i < ndim; ++i) {
+    if (array_strides[i] < 0) {
+      return Status::Invalid("Negative ndarray strides not supported");
     }
-
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(
-        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
-    *out = std::make_shared<Tensor>(type, data, shape, strides);
-    return Status::OK();
+    shape[i] = array_shape[i];
+    strides[i] = array_strides[i];
   }
+
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
+  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  return Status::OK();
 }
 
 Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
                        PyObject** out) {
-  PyAcquireGIL lock;
-
   int type_num;
   RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
   PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
@@ -275,63 +272,48 @@ Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
   return Status::OK();
 }
 
-Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
-                                PyObject* base, PyObject** out_data,
-                                PyObject** out_coords) {
-  PyAcquireGIL lock;
-
+// Wrap the dense data of a sparse tensor in an ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
+                                        std::vector<npy_intp> data_shape, PyObject* base,
+                                        PyObject** out_data) {
   int type_num_data;
-  RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data));
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
   PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
   RETURN_IF_PYERROR();
 
-  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
-      *sparse_tensor->sparse_index());
-  const std::shared_ptr<NumericTensor<Int64Type>> sparse_index_coords =
-      sparse_index.indices();
-
-  RETURN_IF_PYERROR();
-
-  std::vector<npy_intp> npy_shape_data({sparse_index.non_zero_length(), 1});
-
-  const int ndim_coords = sparse_tensor->ndim();
-  std::vector<npy_intp> npy_shape_coords(ndim_coords);
-
-  for (int i = 0; i < ndim_coords; ++i) {
-    npy_shape_coords[i] = sparse_index_coords->shape()[i];
-  }
-
-  const void* immutable_data = sparse_tensor->data()->data();
-  const void* immutable_coords = sparse_index_coords->data()->data();
-
+  const void* immutable_data = sparse_tensor.data()->data();
   // Remove const =(
   void* mutable_data = const_cast<void*>(immutable_data);
-  void* mutable_coords = const_cast<void*>(immutable_coords);
-
-  int array_flags = 0;
-  if (sparse_tensor->is_mutable()) {
+  int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
+  if (sparse_tensor.is_mutable()) {
     array_flags |= NPY_ARRAY_WRITEABLE;
   }
 
-  int type_num_coords;
-  RETURN_NOT_OK(GetNumPyType(*sparse_index_coords->type(), &type_num_coords));
-  PyArray_Descr* dtype_coords = PyArray_DescrNewFromType(type_num_coords);
-
-  PyObject* result_data =
-      PyArray_NewFromDescr(&PyArray_Type, dtype_data, 2, npy_shape_data.data(), nullptr,
-                           mutable_data, array_flags, nullptr);
-  PyObject* result_coords = PyArray_NewFromDescr(&PyArray_Type, dtype_coords, ndim_coords,
-                                                 npy_shape_coords.data(), nullptr,
-                                                 mutable_coords, array_flags, nullptr);
+  *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+                                   static_cast<int>(data_shape.size()), data_shape.data(),
+                                   nullptr, mutable_data, array_flags, nullptr);
   RETURN_IF_PYERROR()
-
-  Py_XINCREF(base);
   Py_XINCREF(base);
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+  return Status::OK();
+}
+
+Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_coords) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
 
-  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_data), base);
-  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_coords), base);
+  // Wrap indices
+  PyObject* result_coords;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
 
-  *out_data = result_data;
+  *out_data = result_data.detach();
   *out_coords = result_coords;
   return Status::OK();
 }
 
@@ -339,155 +321,26 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr<SparseTensorCOO>& sparse_t
 
 Status SparseTensorCSRToNdarray(const std::shared_ptr<SparseTensorCSR>& sparse_tensor,
                                 PyObject* base, PyObject** out_data,
                                 PyObject** out_indptr, PyObject** out_indices) {
-  PyAcquireGIL lock;
-
-  int type_num_data;
-  RETURN_NOT_OK(GetNumPyType(*sparse_tensor->type(), &type_num_data));
-  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
-  RETURN_IF_PYERROR();
-
   const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
       *sparse_tensor->sparse_index());
-  const std::shared_ptr<NumericTensor<Int64Type>> sparse_index_indptr =
-      sparse_index.indptr();
-  const std::shared_ptr<NumericTensor<Int64Type>> sparse_index_indices =
-      sparse_index.indices();
-
-  std::vector<npy_intp> npy_shape_data({sparse_index.non_zero_length(), 1});
-
-  const int ndim_indptr = sparse_index_indptr->ndim();
-  std::vector<npy_intp> npy_shape_indptr(ndim_indptr);
-
-  for (int i = 0; i < ndim_indptr; ++i) {
-    npy_shape_indptr[i] = sparse_index_indptr->shape()[i];
-  }
-
-  const int ndim_indices = sparse_index_indices->ndim();
-  std::vector<npy_intp> npy_shape_indices(ndim_indices);
-
-  for (int i = 0; i < ndim_indices; ++i) {
-    npy_shape_indices[i] = sparse_index_indices->shape()[i];
-  }
-
-  const void* immutable_data = sparse_tensor->data()->data();
-  const void* immutable_indptr = sparse_index_indptr->data()->data();
-  const void* immutable_indices = sparse_index_indices->data()->data();
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref()));
 
-  // Remove const =(
-  void* mutable_data = const_cast<void*>(immutable_data);
-  void* mutable_indptr = const_cast<void*>(immutable_indptr);
-  void* mutable_indices = const_cast<void*>(immutable_indices);
+  // Wrap indices
+  OwnedRef result_indptr;
+  OwnedRef result_indices;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
 
-  int array_flags = 0;
-  if (sparse_tensor->is_mutable()) {
-    array_flags |= NPY_ARRAY_WRITEABLE;
-  }
-
-  int type_num_indptr;
-  RETURN_NOT_OK(GetNumPyType(*sparse_index_indptr->type(), &type_num_indptr));
-  PyArray_Descr* dtype_indptr = PyArray_DescrNewFromType(type_num_indptr);
-
-  int type_num_indices;
-  RETURN_NOT_OK(GetNumPyType(*sparse_index_indptr->type(), &type_num_indices));
-  PyArray_Descr* dtype_indices = PyArray_DescrNewFromType(type_num_indices);
-
-  PyObject* result_data =
-      PyArray_NewFromDescr(&PyArray_Type, dtype_data, 2, npy_shape_data.data(), nullptr,
-                           mutable_data, array_flags, nullptr);
-  PyObject* result_indptr = PyArray_NewFromDescr(&PyArray_Type, dtype_indptr, ndim_indptr,
-                                                 npy_shape_indptr.data(), nullptr,
-                                                 mutable_indptr, array_flags, nullptr);
-  PyObject* result_indices = PyArray_NewFromDescr(
-      &PyArray_Type, dtype_indices, ndim_indices, npy_shape_indices.data(), nullptr,
-      mutable_indices, array_flags, nullptr);
-  RETURN_IF_PYERROR()
-
-  Py_XINCREF(base);
-  Py_XINCREF(base);
-  Py_XINCREF(base);
-
-  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_data), base);
-  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_indptr), base);
-  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result_indices), base);
-
-  *out_data = result_data;
-  *out_indptr = result_indptr;
-  *out_indices = result_indices;
+  *out_data = result_data.detach();
+  *out_indptr = result_indptr.detach();
+  *out_indices = result_indices.detach();
   return Status::OK();
 }
 
-Status DenseNdarrayToSparseTensorCOO(MemoryPool* pool, PyObject* ao,
-                                     const std::vector<std::string>& dim_names,
-                                     std::shared_ptr<SparseTensorCOO>* out) {
-  if (!PyArray_Check(ao)) {
-    return Status::TypeError("Did not pass ndarray object");
-  }
-
-  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
-
-  int ndim = PyArray_NDIM(ndarray);
-
-  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
-  std::vector<int64_t> shape(ndim);
-  std::vector<int64_t> strides(ndim);
-
-  {
-    PyAcquireGIL lock;
-    npy_intp* array_strides = PyArray_STRIDES(ndarray);
-    npy_intp* array_shape = PyArray_SHAPE(ndarray);
-    for (int i = 0; i < ndim; ++i) {
-      if (array_strides[i] < 0) {
-        return Status::Invalid("Negative ndarray strides not supported");
-      }
-      shape[i] = array_shape[i];
-      strides[i] = array_strides[i];
-    }
-
-    std::shared_ptr<DataType> type;
-    RETURN_NOT_OK(
-        GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray)), &type));
-    Tensor tensor(type, data, shape, strides, dim_names);
-    *out = std::make_shared<SparseTensorCOO>(tensor);
-    return Status::OK();
-  }
-}
-
-Status DenseNdarrayToSparseTensorCSR(MemoryPool* pool, PyObject* ao,
-                                     const std::vector<std::string>& dim_names,
-                                     std::shared_ptr<SparseTensorCSR>* out) {
-  if (!PyArray_Check(ao)) {
-    return Status::TypeError("Did not pass ndarray object");
-  }
-
-  PyArrayObject* ndarray =
reinterpret_cast(ao); - - int ndim = PyArray_NDIM(ndarray); - - std::shared_ptr data = std::make_shared(ao); - std::vector shape(ndim); - std::vector strides(ndim); - - { - PyAcquireGIL lock; - npy_intp* array_strides = PyArray_STRIDES(ndarray); - npy_intp* array_shape = PyArray_SHAPE(ndarray); - for (int i = 0; i < ndim; ++i) { - if (array_strides[i] < 0) { - return Status::Invalid("Negative ndarray strides not supported"); - } - shape[i] = array_shape[i]; - strides[i] = array_strides[i]; - } - - std::shared_ptr type; - RETURN_NOT_OK( - GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray)), &type)); - Tensor tensor(type, data, shape, strides, dim_names); - *out = std::make_shared(tensor); - return Status::OK(); - } -} - Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao, const std::vector& shape, const std::vector& dim_names, @@ -496,43 +349,18 @@ Status NdarraysToSparseTensorCOO(MemoryPool* pool, PyObject* data_ao, PyObject* return Status::TypeError("Did not pass ndarray object"); } - PyAcquireGIL lock; - PyArrayObject* ndarray_data = reinterpret_cast(data_ao); - PyArrayObject* ndarray_coords = reinterpret_cast(coords_ao); - std::shared_ptr data = std::make_shared(data_ao); - std::shared_ptr coords_buffer = std::make_shared(coords_ao); - - int coords_ndim = PyArray_NDIM(ndarray_coords); - std::shared_ptr type_data; - std::shared_ptr type_coords; RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), &type_data)); - RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_coords)), - &type_coords)); - ARROW_CHECK_EQ(type_coords->id(), Type::INT64); - - const int64_t i64_size = sizeof(int64_t); - npy_intp* coords_array_shape = PyArray_SHAPE(ndarray_coords); - std::vector coords_shape(coords_ndim); - std::vector coords_strides(coords_ndim); - - for (int i = 0; i < coords_ndim; ++i) { - coords_shape[i] = coords_array_shape[i]; - if (i == 0) { - coords_strides[i] = i64_size; - } else { - coords_strides[i] = coords_strides[i - 1] * coords_array_shape[i - 1]; - } - } - std::shared_ptr> coords = - std::make_shared>(coords_buffer, coords_shape, - coords_strides); + std::shared_ptr coords; + RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords)); + ARROW_CHECK_EQ(coords->type_id(), Type::INT64); // Should be ensured by caller - std::shared_ptr sparse_index = std::make_shared(coords); + std::shared_ptr sparse_index = std::make_shared( + std::static_pointer_cast>(coords)); *out = std::make_shared>(sparse_index, type_data, data, shape, dim_names); return Status::OK(); @@ -547,50 +375,21 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* return Status::TypeError("Did not pass ndarray object"); } - PyAcquireGIL lock; - PyArrayObject* ndarray_data = reinterpret_cast(data_ao); - PyArrayObject* ndarray_indptr = reinterpret_cast(indptr_ao); - PyArrayObject* ndarray_indices = reinterpret_cast(indices_ao); - std::shared_ptr data = std::make_shared(data_ao); - std::shared_ptr indptr_buffer = std::make_shared(indptr_ao); - std::shared_ptr indices_buffer = std::make_shared(indices_ao); - - int indptr_ndim = PyArray_NDIM(ndarray_indptr); - int indices_ndim = PyArray_NDIM(ndarray_indices); - std::shared_ptr type_data; - std::shared_ptr type_indptr; - std::shared_ptr type_indices; - npy_intp* indptr_array_shape = PyArray_SHAPE(ndarray_indptr); - npy_intp* indices_array_shape = PyArray_SHAPE(ndarray_indices); - std::vector indptr_shape(indptr_ndim); - std::vector indices_shape(indices_ndim); - - for (int i = 
0; i < indptr_ndim; ++i) { - indptr_shape[i] = indptr_array_shape[i]; - } - for (int i = 0; i < indices_ndim; ++i) { - indices_shape[i] = indices_array_shape[i]; - } - RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), &type_data)); - RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_indptr)), - &type_indptr)); - RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_indices)), - &type_indices)); - ARROW_CHECK_EQ(type_indptr->id(), Type::INT64); - ARROW_CHECK_EQ(type_indices->id(), Type::INT64); - - std::shared_ptr indptr = - std::make_shared(indptr_buffer, indptr_shape); - std::shared_ptr indices = - std::make_shared(indices_buffer, indices_shape); - - std::shared_ptr sparse_index = - std::make_shared(indptr, indices); + + std::shared_ptr indptr, indices; + RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr)); + RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices)); + ARROW_CHECK_EQ(indptr->type_id(), Type::INT64); // Should be ensured by caller + ARROW_CHECK_EQ(indices->type_id(), Type::INT64); // Should be ensured by caller + + auto sparse_index = std::make_shared( + std::static_pointer_cast>(indptr), + std::static_pointer_cast>(indices)); *out = std::make_shared>(sparse_index, type_data, data, shape, dim_names); return Status::OK(); @@ -598,20 +397,14 @@ Status NdarraysToSparseTensorCSR(MemoryPool* pool, PyObject* data_ao, PyObject* Status TensorToSparseTensorCOO(const std::shared_ptr& tensor, std::shared_ptr* out) { - { - PyAcquireGIL lock; - *out = std::make_shared(*tensor); - return Status::OK(); - } + *out = std::make_shared(*tensor); + return Status::OK(); } Status TensorToSparseTensorCSR(const std::shared_ptr& tensor, std::shared_ptr* out) { - { - PyAcquireGIL lock; - *out = std::make_shared(*tensor); - return Status::OK(); - } + *out = std::make_shared(*tensor); + return Status::OK(); } } // namespace py diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h index aa877399461..5fa1326f52b 100644 --- a/cpp/src/arrow/python/numpy_convert.h +++ b/cpp/src/arrow/python/numpy_convert.h @@ -65,6 +65,7 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out); Status GetNumPyType(const DataType& type, int* type_num); ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, + const std::vector& dim_names, std::shared_ptr* out); ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr& tensor, @@ -78,14 +79,6 @@ ARROW_PYTHON_EXPORT Status SparseTensorCSRToNdarray( const std::shared_ptr& sparse_tensor, PyObject* base, PyObject** out_data, PyObject** out_indptr, PyObject** out_indices); -ARROW_PYTHON_EXPORT Status DenseNdarrayToSparseTensorCOO( - MemoryPool* pool, PyObject* ao, const std::vector& dim_names, - std::shared_ptr* out); - -ARROW_PYTHON_EXPORT Status DenseNdarrayToSparseTensorCSR( - MemoryPool* pool, PyObject* ao, const std::vector& dim_names, - std::shared_ptr* out); - ARROW_PYTHON_EXPORT Status NdarraysToSparseTensorCOO( MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao, const std::vector& shape, const std::vector& dim_names, diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 8ff0e01480f..d93e3954e41 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -515,7 +515,7 @@ Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* bui builder->AppendNdarray(static_cast(blobs_out->ndarrays.size()))); std::shared_ptr tensor; 
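+      // Serialized ndarrays carry no dimension names; the empty braces pass
+      // an empty dim_names vector to the new NdarrayToTensor signature.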
RETURN_NOT_OK(NdarrayToTensor(default_memory_pool(), - reinterpret_cast(array), &tensor)); + reinterpret_cast(array), {}, &tensor)); blobs_out->ndarrays.push_back(tensor); } break; default: { diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index bcb0dfac1bf..15905a18507 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -870,355 +870,6 @@ cdef class Array(_PandasConvertible): return res -cdef class SparseTensorCOO: - """ - A sparse COO tensor. - """ - - def __init__(self): - raise TypeError("Do not call SparseTensorCOO's constructor directly, " - "use one of the `pyarrow.SparseTensorCOO.from_*` " - "functions instead.") - - cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor): - self.sp_sparse_tensor = sp_sparse_tensor - self.stp = sp_sparse_tensor.get() - self.type = pyarrow_wrap_data_type(self.stp.type()) - - def __repr__(self): - return """ -type: {0.type} -shape: {0.shape}""".format(self) - - @staticmethod - def from_dense_numpy(obj, dim_names=None): - """ - Convert numpy.ndarray to arrow::SparseTensorCOO - """ - cdef shared_ptr[CSparseTensorCOO] csparse_tensor - cdef vector[c_string] c_dim_names - - if dim_names is not None: - for x in dim_names: - c_dim_names.push_back(tobytes(x)) - - check_status(DenseNdarrayToSparseTensorCOO(c_default_memory_pool(), - obj, c_dim_names, - &csparse_tensor)) - return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) - - @staticmethod - def from_numpy(data, coords, shape, dim_names=None): - """ - Create arrow::SparseTensorCOO from numpy.ndarrays - """ - cdef shared_ptr[CSparseTensorCOO] csparse_tensor - cdef vector[int64_t] c_shape - cdef vector[c_string] c_dim_names - - for x in shape: - c_shape.push_back(x) - if dim_names is not None: - for x in dim_names: - c_dim_names.push_back(tobytes(x)) - - if coords.dtype != 'i8': - coords = coords.astype('i8') - - check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(), - data, coords, c_shape, c_dim_names, &csparse_tensor)) - return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) - - @staticmethod - def from_tensor(obj): - """ - Convert arrow::Tensor to arrow::SparseTensorCOO - """ - cdef shared_ptr[CSparseTensorCOO] csparse_tensor - cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) - - check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor)) - - return pyarrow_wrap_sparse_tensor_coo(csparse_tensor) - - def to_numpy(self): - """ - Convert arrow::SparseTensorCOO to numpy.ndarrays with zero copy - """ - cdef PyObject* out_data - cdef PyObject* out_coords - - check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self, - &out_data, &out_coords)) - return PyObject_to_object(out_data), PyObject_to_object(out_coords) - - def equals(self, SparseTensorCOO other): - """ - Return true if sparse tensors contains exactly equal data - """ - return self.stp.Equals(deref(other.stp)) - - def __eq__(self, other): - if isinstance(other, SparseTensorCOO): - return self.equals(other) - else: - return NotImplemented - - @property - def is_mutable(self): - return self.stp.is_mutable() - - @property - def ndim(self): - return self.stp.ndim() - - @property - def shape(self): - # Cython knows how to convert a vector[T] to a Python list - return tuple(self.stp.shape()) - - @property - def size(self): - return self.stp.size() - - def dim_name(self, i): - return frombytes(self.stp.dim_name(i)) - - @property - def dim_names(self): - return [frombytes(x) for x in tuple(self.stp.dim_names())] - - @property - def non_zero_length(self): - return 
self.stp.non_zero_length() - - -cdef class SparseTensorCSR: - """ - A sparse CSR tensor. - """ - - def __init__(self): - raise TypeError("Do not call SparseTensorCSR's constructor directly, " - "use one of the `pyarrow.SparseTensorCSR.from_*` " - "functions instead.") - - cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor): - self.sp_sparse_tensor = sp_sparse_tensor - self.stp = sp_sparse_tensor.get() - self.type = pyarrow_wrap_data_type(self.stp.type()) - - def __repr__(self): - return """ -type: {0.type} -shape: {0.shape}""".format(self) - - @staticmethod - def from_dense_numpy(obj, dim_names=None): - """ - Convert numpy.ndarray to arrow::SparseTensorCSR - """ - cdef shared_ptr[CSparseTensorCSR] csparse_tensor - cdef vector[c_string] c_dim_names - - if dim_names is not None: - for x in dim_names: - c_dim_names.push_back(tobytes(x)) - - check_status(DenseNdarrayToSparseTensorCSR(c_default_memory_pool(), - obj, c_dim_names, - &csparse_tensor)) - return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) - - @staticmethod - def from_numpy(data, indptr, indices, shape, dim_names=None): - """ - Create arrow::SparseTensorCSR from numpy.ndarrays - """ - cdef shared_ptr[CSparseTensorCSR] csparse_tensor - cdef vector[int64_t] c_shape - cdef vector[c_string] c_dim_names - - for x in shape: - c_shape.push_back(x) - if dim_names is not None: - for x in dim_names: - c_dim_names.push_back(tobytes(x)) - - if indptr.dtype != 'i8': - indptr = indptr.astype('i8') - if indices.dtype != 'i8': - indices = indices.astype('i8') - - check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(), - data, indptr, indices, c_shape, c_dim_names, - &csparse_tensor)) - return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) - - @staticmethod - def from_tensor(obj): - """ - Convert arrow::Tensor to arrow::SparseTensorCSR - """ - cdef shared_ptr[CSparseTensorCSR] csparse_tensor - cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj) - - check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor)) - - return pyarrow_wrap_sparse_tensor_csr(csparse_tensor) - - def to_numpy(self): - """ - Convert arrow::SparseTensorCSR to numpy.ndarrays with zero copy - """ - cdef PyObject* out_data - cdef PyObject* out_indptr - cdef PyObject* out_indices - - check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self, - &out_data, &out_indptr, &out_indices)) - return (PyObject_to_object(out_data), PyObject_to_object(out_indptr), - PyObject_to_object(out_indices)) - - def equals(self, SparseTensorCSR other): - """ - Return true if sparse tensors contains exactly equal data - """ - return self.stp.Equals(deref(other.stp)) - - def __eq__(self, other): - if isinstance(other, SparseTensorCSR): - return self.equals(other) - else: - return NotImplemented - - @property - def is_mutable(self): - return self.stp.is_mutable() - - @property - def ndim(self): - return self.stp.ndim() - - @property - def shape(self): - # Cython knows how to convert a vector[T] to a Python list - return tuple(self.stp.shape()) - - @property - def size(self): - return self.stp.size() - - def dim_name(self, i): - return frombytes(self.stp.dim_name(i)) - - @property - def dim_names(self): - return [frombytes(x) for x in tuple(self.stp.dim_names())] - - @property - def non_zero_length(self): - return self.stp.non_zero_length() - - -cdef class Tensor: - """ - A n-dimensional array a.k.a Tensor. 
- """ - - def __init__(self): - raise TypeError("Do not call Tensor's constructor directly, use one " - "of the `pyarrow.Tensor.from_*` functions instead.") - - cdef void init(self, const shared_ptr[CTensor]& sp_tensor): - self.sp_tensor = sp_tensor - self.tp = sp_tensor.get() - self.type = pyarrow_wrap_data_type(self.tp.type()) - - def __repr__(self): - return """ -type: {0.type} -shape: {0.shape} -strides: {0.strides}""".format(self) - - @staticmethod - def from_numpy(obj): - cdef shared_ptr[CTensor] ctensor - with nogil: - check_status(NdarrayToTensor(c_default_memory_pool(), obj, - &ctensor)) - return pyarrow_wrap_tensor(ctensor) - - def to_numpy(self): - """ - Convert arrow::Tensor to numpy.ndarray with zero copy - """ - cdef PyObject* out - - with nogil: - check_status(TensorToNdarray(self.sp_tensor, self, &out)) - return PyObject_to_object(out) - - def equals(self, Tensor other): - """ - Return true if the tensors contains exactly equal data - """ - return self.tp.Equals(deref(other.tp)) - - def __eq__(self, other): - if isinstance(other, Tensor): - return self.equals(other) - else: - return NotImplemented - - @property - def is_mutable(self): - return self.tp.is_mutable() - - @property - def is_contiguous(self): - return self.tp.is_contiguous() - - @property - def ndim(self): - return self.tp.ndim() - - @property - def size(self): - return self.tp.size() - - @property - def shape(self): - # Cython knows how to convert a vector[T] to a Python list - return tuple(self.tp.shape()) - - @property - def strides(self): - return tuple(self.tp.strides()) - - def __getbuffer__(self, cp.Py_buffer* buffer, int flags): - buffer.buf = self.tp.data().get().data() - pep3118_format = self.type.pep3118_format - if pep3118_format is None: - raise NotImplementedError("type %s not supported for buffer " - "protocol" % (self.type,)) - buffer.format = pep3118_format - buffer.itemsize = self.type.bit_width // 8 - buffer.internal = NULL - buffer.len = self.tp.size() * buffer.itemsize - buffer.ndim = self.tp.ndim() - buffer.obj = self - if self.tp.is_mutable(): - buffer.readonly = 0 - else: - buffer.readonly = 1 - # NOTE: This assumes Py_ssize_t == int64_t, and that the shape - # and strides arrays lifetime is tied to the tensor's - buffer.shape = &self.tp.shape()[0] - buffer.strides = &self.tp.strides()[0] - buffer.suboffsets = NULL - - cdef wrap_array_output(PyObject* output): cdef object obj = PyObject_to_object(output) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e0fc502f468..93a75945ce3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -593,6 +593,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int64_t size() int ndim() + const vector[c_string]& dim_names() const c_string& dim_name(int i) c_bool is_mutable() @@ -1234,6 +1235,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, + const vector[c_string]& dim_names, shared_ptr[CTensor]* out) CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base, @@ -1259,14 +1261,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: const vector[c_string]& dim_names, shared_ptr[CSparseTensorCSR]* out) - CStatus DenseNdarrayToSparseTensorCOO(CMemoryPool* pool, object ao, - const vector[c_string]& dim_names, - shared_ptr[CSparseTensorCOO]* out) - - CStatus DenseNdarrayToSparseTensorCSR(CMemoryPool* pool, object ao, - const 
vector[c_string]& dim_names,
-                                          shared_ptr[CSparseTensorCSR]* out)
-
     CStatus TensorToSparseTensorCOO(shared_ptr[CTensor],
                                     shared_ptr[CSparseTensorCOO]* out)

diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 783e2b2731a..2da5a8301bc 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -121,6 +121,9 @@ include "builder.pxi"
 # Column, Table, Record Batch
 include "table.pxi"
 
+# Tensors
+include "tensor.pxi"
+
 # File IO
 include "io.pxi"
 include "io-hdfs.pxi"
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
new file mode 100644
index 00000000000..17554e61740
--- /dev/null
+++ b/python/pyarrow/tensor.pxi
@@ -0,0 +1,367 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+cdef class Tensor:
+    """
+    An n-dimensional array, a.k.a. Tensor.
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call Tensor's constructor directly, use one "
+                        "of the `pyarrow.Tensor.from_*` functions instead.")
+
+    cdef void init(self, const shared_ptr[CTensor]& sp_tensor):
+        self.sp_tensor = sp_tensor
+        self.tp = sp_tensor.get()
+        self.type = pyarrow_wrap_data_type(self.tp.type())
+
+    def __repr__(self):
+        return """<pyarrow.Tensor>
+type: {0.type}
+shape: {0.shape}
+strides: {0.strides}""".format(self)
+
+    @staticmethod
+    def from_numpy(obj, dim_names=None):
+        cdef:
+            vector[c_string] c_dim_names
+            shared_ptr[CTensor] ctensor
+
+        if dim_names is not None:
+            for x in dim_names:
+                c_dim_names.push_back(tobytes(x))
+
+        check_status(NdarrayToTensor(c_default_memory_pool(), obj,
+                                     c_dim_names, &ctensor))
+        return pyarrow_wrap_tensor(ctensor)
+
+    def to_numpy(self):
+        """
+        Convert arrow::Tensor to numpy.ndarray with zero copy
+        """
+        cdef PyObject* out
+
+        check_status(TensorToNdarray(self.sp_tensor, self, &out))
+        return PyObject_to_object(out)
+
+    def equals(self, Tensor other):
+        """
+        Return True if the tensors contain exactly equal data
+        """
+        return self.tp.Equals(deref(other.tp))
+
+    def __eq__(self, other):
+        if isinstance(other, Tensor):
+            return self.equals(other)
+        else:
+            return NotImplemented
+
+    def dim_name(self, i):
+        return frombytes(self.tp.dim_name(i))
+
+    @property
+    def dim_names(self):
+        return [frombytes(x) for x in tuple(self.tp.dim_names())]
+
+    @property
+    def is_mutable(self):
+        return self.tp.is_mutable()
+
+    @property
+    def is_contiguous(self):
+        return self.tp.is_contiguous()
+
+    @property
+    def ndim(self):
+        return self.tp.ndim()
+
+    @property
+    def size(self):
+        return self.tp.size()
+
+    @property
+    def shape(self):
+        # Cython knows how to convert a vector[T] to a Python list
+        return tuple(self.tp.shape())
+
+    @property
+    def strides(self):
+        return tuple(self.tp.strides())
+
+    def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
+        buffer.buf = <char *> self.tp.data().get().data()
+        pep3118_format = self.type.pep3118_format
+        if pep3118_format is None:
+            raise NotImplementedError("type %s not supported for buffer "
+                                      "protocol" % (self.type,))
+        buffer.format = pep3118_format
+        buffer.itemsize = self.type.bit_width // 8
+        buffer.internal = NULL
+        buffer.len = self.tp.size() * buffer.itemsize
+        buffer.ndim = self.tp.ndim()
+        buffer.obj = self
+        if self.tp.is_mutable():
+            buffer.readonly = 0
+        else:
+            buffer.readonly = 1
+        # NOTE: This assumes Py_ssize_t == int64_t, and that the shape
+        # and strides arrays lifetime is tied to the tensor's
+        buffer.shape = <Py_ssize_t *> &self.tp.shape()[0]
+        buffer.strides = <Py_ssize_t *> &self.tp.strides()[0]
+        buffer.suboffsets = NULL
+
+
+cdef class SparseTensorCOO:
+    """
+    A sparse COO tensor.
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call SparseTensorCOO's constructor directly, "
+                        "use one of the `pyarrow.SparseTensorCOO.from_*` "
+                        "functions instead.")
+
+    cdef void init(self, const shared_ptr[CSparseTensorCOO]& sp_sparse_tensor):
+        self.sp_sparse_tensor = sp_sparse_tensor
+        self.stp = sp_sparse_tensor.get()
+        self.type = pyarrow_wrap_data_type(self.stp.type())
+
+    def __repr__(self):
+        return """<pyarrow.SparseTensorCOO>
+type: {0.type}
+shape: {0.shape}""".format(self)
+
+    @classmethod
+    def from_dense_numpy(cls, obj, dim_names=None):
+        """
+        Convert numpy.ndarray to arrow::SparseTensorCOO
+        """
+        return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))
+
+    @staticmethod
+    def from_numpy(data, coords, shape, dim_names=None):
+        """
+        Create arrow::SparseTensorCOO from numpy.ndarrays
+        """
+        cdef shared_ptr[CSparseTensorCOO] csparse_tensor
+        cdef vector[int64_t] c_shape
+        cdef vector[c_string] c_dim_names
+
+        for x in shape:
+            c_shape.push_back(x)
+        if dim_names is not None:
+            for x in dim_names:
+                c_dim_names.push_back(tobytes(x))
+
+        # Enforce precondition for SparseTensorCOO indices
+        coords = np.require(coords, dtype='i8', requirements='F')
+        if coords.ndim != 2:
+            raise ValueError("Expected 2-dimensional array for "
+                             "SparseTensorCOO indices")
+
+        check_status(NdarraysToSparseTensorCOO(c_default_memory_pool(),
+                                               data, coords, c_shape,
+                                               c_dim_names, &csparse_tensor))
+        return pyarrow_wrap_sparse_tensor_coo(csparse_tensor)
+
+    @staticmethod
+    def from_tensor(obj):
+        """
+        Convert arrow::Tensor to arrow::SparseTensorCOO
+        """
+        cdef shared_ptr[CSparseTensorCOO] csparse_tensor
+        cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)
+
+        with nogil:
+            check_status(TensorToSparseTensorCOO(ctensor, &csparse_tensor))
+
+        return pyarrow_wrap_sparse_tensor_coo(csparse_tensor)
+
+    def to_numpy(self):
+        """
+        Convert arrow::SparseTensorCOO to numpy.ndarrays with zero copy
+        """
+        cdef PyObject* out_data
+        cdef PyObject* out_coords
+
+        check_status(SparseTensorCOOToNdarray(self.sp_sparse_tensor, self,
+                                              &out_data, &out_coords))
+        return PyObject_to_object(out_data), PyObject_to_object(out_coords)
+
+    def equals(self, SparseTensorCOO other):
+        """
+        Return True if sparse tensors contain exactly equal data
+        """
+        return self.stp.Equals(deref(other.stp))
+
+    def __eq__(self, other):
+        if isinstance(other, SparseTensorCOO):
+            return self.equals(other)
+        else:
+            return NotImplemented
+
+    @property
+    def is_mutable(self):
+        return self.stp.is_mutable()
+
+    @property
+    def ndim(self):
+        return self.stp.ndim()
+
+    @property
+    def shape(self):
+        # Cython knows how to convert a vector[T] to a Python list
+        return tuple(self.stp.shape())
+
+    @property
+    def size(self):
+        return self.stp.size()
+
+    def dim_name(self, i):
+        return frombytes(self.stp.dim_name(i))
+
+    @property
+    def dim_names(self):
+        return [frombytes(x) for x in tuple(self.stp.dim_names())]
+
+    @property
+    def non_zero_length(self):
+        return self.stp.non_zero_length()
+
+
+cdef class SparseTensorCSR:
+    """
+    A sparse CSR tensor.
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call SparseTensorCSR's constructor directly, "
+                        "use one of the `pyarrow.SparseTensorCSR.from_*` "
+                        "functions instead.")
+
+    cdef void init(self, const shared_ptr[CSparseTensorCSR]& sp_sparse_tensor):
+        self.sp_sparse_tensor = sp_sparse_tensor
+        self.stp = sp_sparse_tensor.get()
+        self.type = pyarrow_wrap_data_type(self.stp.type())
+
+    def __repr__(self):
+        return """<pyarrow.SparseTensorCSR>
+type: {0.type}
+shape: {0.shape}""".format(self)
+
+    @classmethod
+    def from_dense_numpy(cls, obj, dim_names=None):
+        """
+        Convert numpy.ndarray to arrow::SparseTensorCSR
+        """
+        return cls.from_tensor(Tensor.from_numpy(obj, dim_names=dim_names))
+
+    @staticmethod
+    def from_numpy(data, indptr, indices, shape, dim_names=None):
+        """
+        Create arrow::SparseTensorCSR from numpy.ndarrays
+        """
+        cdef shared_ptr[CSparseTensorCSR] csparse_tensor
+        cdef vector[int64_t] c_shape
+        cdef vector[c_string] c_dim_names
+
+        for x in shape:
+            c_shape.push_back(x)
+        if dim_names is not None:
+            for x in dim_names:
+                c_dim_names.push_back(tobytes(x))
+
+        # Enforce precondition for SparseTensorCSR indices
+        indptr = np.require(indptr, dtype='i8')
+        indices = np.require(indices, dtype='i8')
+        if indptr.ndim != 1:
+            raise ValueError("Expected 1-dimensional array for "
+                             "SparseTensorCSR indptr")
+        if indices.ndim != 1:
+            raise ValueError("Expected 1-dimensional array for "
+                             "SparseTensorCSR indices")
+
+        check_status(NdarraysToSparseTensorCSR(c_default_memory_pool(),
+                                               data, indptr, indices, c_shape,
+                                               c_dim_names, &csparse_tensor))
+        return pyarrow_wrap_sparse_tensor_csr(csparse_tensor)
+
+    @staticmethod
+    def from_tensor(obj):
+        """
+        Convert arrow::Tensor to arrow::SparseTensorCSR
+        """
+        cdef shared_ptr[CSparseTensorCSR] csparse_tensor
+        cdef shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)
+
+        with nogil:
+            check_status(TensorToSparseTensorCSR(ctensor, &csparse_tensor))
+
+        return pyarrow_wrap_sparse_tensor_csr(csparse_tensor)
+
+    def to_numpy(self):
+        """
+        Convert arrow::SparseTensorCSR to numpy.ndarrays with zero copy
+        """
+        cdef PyObject* out_data
+        cdef PyObject* out_indptr
+        cdef PyObject* out_indices
+
+        check_status(SparseTensorCSRToNdarray(self.sp_sparse_tensor, self,
+                                              &out_data, &out_indptr,
+                                              &out_indices))
+        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
+                PyObject_to_object(out_indices))
+
+    def equals(self, SparseTensorCSR other):
+        """
+        Return True if sparse tensors contain exactly equal data
+        """
+        return self.stp.Equals(deref(other.stp))
+
+    def __eq__(self, other):
+        if isinstance(other, SparseTensorCSR):
+            return self.equals(other)
+        else:
+            return NotImplemented
+
+    @property
+    def is_mutable(self):
+        return self.stp.is_mutable()
+
+    @property
+    def ndim(self):
+        return self.stp.ndim()
+
+    @property
+    def shape(self):
+        # Cython knows how to convert a vector[T] to a Python list
+        return tuple(self.stp.shape())
+
+    @property
+    def size(self):
+        return self.stp.size()
+
+    def dim_name(self, i):
+        return frombytes(self.stp.dim_name(i))
+
+    @property
+    def dim_names(self):
+        return [frombytes(x) for x in tuple(self.stp.dim_names())]
+
+    @property
+    def non_zero_length(self):
+        return self.stp.non_zero_length()
diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py
index 09d6b852692..68564dacf4b
100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -22,6 +22,21 @@ import pyarrow as pa +tensor_type_pairs = [ + ('i1', pa.int8()), + ('i2', pa.int16()), + ('i4', pa.int32()), + ('i8', pa.int64()), + ('u1', pa.uint8()), + ('u2', pa.uint16()), + ('u4', pa.uint32()), + ('u8', pa.uint64()), + ('f2', pa.float16()), + ('f4', pa.float32()), + ('f8', pa.float64()) +] + + @pytest.mark.parametrize('sparse_tensor_type', [ pa.SparseTensorCSR, pa.SparseTensorCOO, @@ -48,7 +63,7 @@ def test_sparse_tensor_attrs(sparse_tensor_type): def test_sparse_tensor_coo_base_object(): data = np.array([[4], [9], [7], [5]]) - coords = np.array([[0, 0], [1, 3], [0, 2], [1, 3]]) + coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]]) array = np.array([[4, 0, 9, 0], [0, 7, 0, 0], [0, 0, 0, 0], @@ -61,6 +76,7 @@ def test_sparse_tensor_coo_base_object(): sparse_tensor = None assert np.array_equal(data, result_data) assert np.array_equal(coords, result_coords) + assert result_coords.flags.f_contiguous # column-major def test_sparse_tensor_csr_base_object(): @@ -109,23 +125,11 @@ def ne(a, b): ne(sparse_tensor1, sparse_tensor2) -@pytest.mark.parametrize('dtype_str,arrow_type', [ - ('i1', pa.int8()), - ('i2', pa.int16()), - ('i4', pa.int32()), - ('i8', pa.int64()), - ('u1', pa.uint8()), - ('u2', pa.uint16()), - ('u4', pa.uint32()), - ('u8', pa.uint64()), - ('f2', pa.float16()), - ('f4', pa.float32()), - ('f8', pa.float64()) -]) +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_sparse_tensor_coo_from_dense(dtype_str, arrow_type): dtype = np.dtype(dtype_str) data = np.array([[4], [9], [7], [5]]).astype(dtype) - coords = np.array([[0, 0], [1, 3], [0, 2], [1, 3]]) + coords = np.array([[0, 0], [0, 2], [1, 1], [3, 3]]) array = np.array([[4, 0, 9, 0], [0, 7, 0, 0], [0, 0, 0, 0], @@ -149,19 +153,7 @@ def test_sparse_tensor_coo_from_dense(dtype_str, arrow_type): assert np.array_equal(coords, result_coords) -@pytest.mark.parametrize('dtype_str,arrow_type', [ - ('i1', pa.int8()), - ('i2', pa.int16()), - ('i4', pa.int32()), - ('i8', pa.int64()), - ('u1', pa.uint8()), - ('u2', pa.uint16()), - ('u4', pa.uint32()), - ('u8', pa.uint64()), - ('f2', pa.float16()), - ('f4', pa.float32()), - ('f8', pa.float64()) -]) +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): dtype = np.dtype(dtype_str) dense_data = np.array([[1, 0, 2], @@ -191,19 +183,7 @@ def test_sparse_tensor_csr_from_dense(dtype_str, arrow_type): assert np.array_equal(indices, result_indices) -@pytest.mark.parametrize('dtype_str,arrow_type', [ - ('i1', pa.int8()), - ('i2', pa.int16()), - ('i4', pa.int32()), - ('i8', pa.int64()), - ('u1', pa.uint8()), - ('u2', pa.uint16()), - ('u4', pa.uint32()), - ('u8', pa.uint64()), - ('f2', pa.float16()), - ('f4', pa.float32()), - ('f8', pa.float64()) -]) +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) data = np.array([[4], [9], [7], [5]]).astype(dtype) @@ -221,19 +201,7 @@ def test_sparse_tensor_coo_numpy_roundtrip(dtype_str, arrow_type): assert sparse_tensor.dim_names == dim_names -@pytest.mark.parametrize('dtype_str,arrow_type', [ - ('i1', pa.int8()), - ('i2', pa.int16()), - ('i4', pa.int32()), - ('i8', pa.int64()), - ('u1', pa.uint8()), - ('u2', pa.uint16()), - ('u4', pa.uint32()), - ('u8', pa.uint64()), - ('f2', pa.float16()), - ('f4', pa.float32()), - ('f8', 
pa.float64()) -]) +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_sparse_tensor_csr_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) data = np.array([[1], [2], [3], [4], [5], [6]]).astype(dtype) diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py index 188a4a5e1a5..13f05d27489 100644 --- a/python/pyarrow/tests/test_tensor.py +++ b/python/pyarrow/tests/test_tensor.py @@ -23,12 +23,28 @@ import pyarrow as pa +tensor_type_pairs = [ + ('i1', pa.int8()), + ('i2', pa.int16()), + ('i4', pa.int32()), + ('i8', pa.int64()), + ('u1', pa.uint8()), + ('u2', pa.uint16()), + ('u4', pa.uint32()), + ('u8', pa.uint64()), + ('f2', pa.float16()), + ('f4', pa.float32()), + ('f8', pa.float64()) +] + + def test_tensor_attrs(): data = np.random.randn(10, 4) tensor = pa.Tensor.from_numpy(data) assert tensor.ndim == 2 + assert tensor.dim_names == [] assert tensor.size == 40 assert tensor.shape == data.shape assert tensor.strides == data.strides @@ -42,6 +58,13 @@ def test_tensor_attrs(): tensor = pa.Tensor.from_numpy(data2) assert not tensor.is_mutable + # With dim_names + tensor = pa.Tensor.from_numpy(data, dim_names=('x', 'y')) + assert tensor.ndim == 2 + assert tensor.dim_names == ['x', 'y'] + assert tensor.dim_name(0) == 'x' + assert tensor.dim_name(1) == 'y' + def test_tensor_base_object(): tensor = pa.Tensor.from_numpy(np.random.randn(10, 4)) @@ -50,19 +73,7 @@ def test_tensor_base_object(): assert sys.getrefcount(tensor) == n + 1 -@pytest.mark.parametrize('dtype_str,arrow_type', [ - ('i1', pa.int8()), - ('i2', pa.int16()), - ('i4', pa.int32()), - ('i8', pa.int64()), - ('u1', pa.uint8()), - ('u2', pa.uint16()), - ('u4', pa.uint32()), - ('u8', pa.uint64()), - ('f2', pa.float16()), - ('f4', pa.float32()), - ('f8', pa.float64()) -]) +@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_tensor_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) data = (100 * np.random.randn(10, 4)).astype(dtype) @@ -76,15 +87,6 @@ def test_tensor_numpy_roundtrip(dtype_str, arrow_type): assert (data == result).all() -def _try_delete(path): - import gc - gc.collect() - try: - os.remove(path) - except os.error: - pass - - def test_tensor_ipc_roundtrip(tmpdir): data = np.random.randn(10, 4) tensor = pa.Tensor.from_numpy(data) From db5d620fe19715f259cafc66cc5957a119a448e3 Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 1 Jul 2019 16:39:13 +0200 Subject: [PATCH 8/8] Typo. 
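Rename the misspelled internal helper SparseTendorDataToNdarray to
SparseTensorDataToNdarray in cpp/src/arrow/python/numpy_convert.cc.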
--- cpp/src/arrow/python/numpy_convert.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 209cea5faa9..515864ae287 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -273,7 +273,7 @@ Status TensorToNdarray(const std::shared_ptr& tensor, PyObject* base, } // Wrap the dense data of a sparse tensor in a ndarray -static Status SparseTendorDataToNdarray(const SparseTensor& sparse_tensor, +static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor, std::vector data_shape, PyObject* base, PyObject** out_data) { int type_num_data; @@ -306,7 +306,7 @@ Status SparseTensorCOOToNdarray(const std::shared_ptr& sparse_t // Wrap tensor data OwnedRef result_data; - RETURN_NOT_OK(SparseTendorDataToNdarray( + RETURN_NOT_OK(SparseTensorDataToNdarray( *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref())); // Wrap indices @@ -326,7 +326,7 @@ Status SparseTensorCSRToNdarray(const std::shared_ptr& sparse_t // Wrap tensor data OwnedRef result_data; - RETURN_NOT_OK(SparseTendorDataToNdarray( + RETURN_NOT_OK(SparseTensorDataToNdarray( *sparse_tensor, {sparse_index.non_zero_length(), 1}, base, result_data.ref())); // Wrap indices
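Reviewer note: a minimal usage sketch of the Python API this series adds,
mirroring the round-trip tests above. It assumes a pyarrow build with these
patches applied; the (nnz, 1) data shape and (nnz, ndim) coords shape follow
from the npy_shape_* vectors in SparseTensorCOOToNdarray.

    import numpy as np
    import pyarrow as pa

    # Dense matrix with four non-zeros, as in the tests above.
    dense = np.array([[4, 0, 9, 0],
                      [0, 7, 0, 0],
                      [0, 0, 0, 0],
                      [0, 0, 0, 5]], dtype='f8')

    # Dense ndarray -> sparse, via the Tensor.from_numpy + from_tensor path.
    coo = pa.SparseTensorCOO.from_dense_numpy(dense, dim_names=('x', 'y'))
    csr = pa.SparseTensorCSR.from_dense_numpy(dense)

    assert coo.shape == (4, 4)
    assert coo.non_zero_length == 4
    assert coo.dim_names == ['x', 'y']

    # Zero-copy views back into numpy: data comes back as (nnz, 1),
    # coords as (nnz, ndim); for CSR you get data, indptr, and indices.
    data, coords = coo.to_numpy()
    data_csr, indptr, indices = csr.to_numpy()

    # Tensors also gain optional dimension names in this series.
    tensor = pa.Tensor.from_numpy(dense, dim_names=('x', 'y'))
    assert tensor.dim_name(0) == 'x'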