From 5e09bfe0ac6b25a54880bf17074872a3532bac34 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 26 Mar 2016 22:33:46 -0700 Subject: [PATCH 1/6] Compiling, but untested draft of pandas <-> arrow converters --- cpp/src/arrow/array.h | 4 + cpp/src/arrow/types/string.cc | 10 + cpp/src/arrow/types/string.h | 4 +- cpp/src/arrow/util/buffer.h | 42 ++ python/CMakeLists.txt | 4 + python/src/pyarrow/adapters/pandas.cc | 727 ++++++++++++++++++++++++++ 6 files changed, 788 insertions(+), 3 deletions(-) create mode 100644 python/src/pyarrow/adapters/pandas.cc diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 133adf32cbd..f2992741852 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -60,6 +60,10 @@ class Array { return null_bitmap_; } + const uint8_t* null_bitmap_data() const { + return null_bitmap_data_; + } + bool EqualsExact(const Array& arr) const; virtual bool Equals(const std::shared_ptr& arr) const = 0; diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index dea42e102b0..80b075cdfbb 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -20,8 +20,18 @@ #include #include +#include "arrow/type.h" + namespace arrow { +const std::shared_ptr STRING(new StringType()); + +StringArray::StringArray(int32_t length, + const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap) : + StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + std::string CharType::ToString() const { std::stringstream s; s << "char(" << size << ")"; diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index fda722ba6de..84cd0326ec8 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -79,9 +79,7 @@ class StringArray : public ListArray { const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) : - StringArray(std::make_shared(), length, offsets, values, - null_count, null_bitmap) {} + const std::shared_ptr& null_bitmap = nullptr); // Compute the pointer t const uint8_t* GetValue(int i, int32_t* out_length) const { diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 0c3e210abd9..2bf604f1160 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -18,11 +18,13 @@ #ifndef ARROW_UTIL_BUFFER_H #define ARROW_UTIL_BUFFER_H +#include #include #include #include #include "arrow/util/macros.h" +#include "arrow/util/status.h" namespace arrow { @@ -146,6 +148,46 @@ class PoolBuffer : public ResizableBuffer { MemoryPool* pool_; }; +static constexpr int64_t MIN_BUFFER_CAPACITY = 1024; + +class BufferBuilder { + public: + BufferBuilder(MemoryPool* pool) : + pool_(pool), + capacity_(0), + size_(0) {} + + Status Append(const uint8_t* data, int length) { + if (capacity_ < length + size_) { + if (capacity_ == 0) { + buffer_ = std::make_shared(pool_); + } + capacity_ = std::max(MIN_BUFFER_CAPACITY, capacity_); + while (capacity_ < length + size_) { + capacity_ *= 2; + } + RETURN_NOT_OK(buffer_->Resize(capacity_)); + data_ = buffer_->mutable_data(); + } + memcpy(data_ + size_, data, length); + size_ += length; + return Status::OK(); + } + + std::shared_ptr Finish() { + auto result = buffer_; + buffer_ = nullptr; + return result; + } + + private: + std::shared_ptr buffer_; + MemoryPool* pool_; + uint8_t* data_; + int64_t capacity_; + int64_t size_; +}; + } // namespace arrow #endif // ARROW_UTIL_BUFFER_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0ecafc7202e..dc8d87888ff 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -220,9 +220,12 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") ## Python and libraries find_package(PythonLibsNew REQUIRED) +find_package(NumPy REQUIRED) include(UseCython) include_directories(SYSTEM + ${NUMPY_INCLUDE_DIRS} + ${PYTHON_INCLUDE_DIRS} src) ############################################################ @@ -414,6 +417,7 @@ set(PYARROW_SRCS src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc + src/pyarrow/adapters/pandas.cc ) set(LINK_LIBS diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc new file mode 100644 index 00000000000..c8dc3ba9532 --- /dev/null +++ b/python/src/pyarrow/adapters/pandas.cc @@ -0,0 +1,727 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for pandas conversion via NumPy + +#include + +#include +#include +#include +#include +#include + +#include + +#ifdef NPY_1_7_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#else +#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED +#define NPY_ARRAY_ALIGNED NPY_ALIGNED +#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE +#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY +#endif + +#include + +#include "arrow/api.h" +#include "arrow/util/bit-util.h" + +#include "pyarrow/common.h" +#include "pyarrow/status.h" + +namespace pyarrow { + +using arrow::Array; +namespace util = arrow::util; + +// ---------------------------------------------------------------------- +// Serialization + +template +struct npy_traits { +}; + +template <> +struct npy_traits { + typedef uint8_t value_type; + using ArrayType = arrow::BooleanArray; + + static constexpr bool supports_nulls = false; + static inline bool isnull(uint8_t v) { + return false; + } +}; + +#define NPY_INT_DECL(TYPE, CapType, T) \ + template <> \ + struct npy_traits { \ + typedef T value_type; \ + using ArrayType = arrow::CapType##Array; \ + \ + static constexpr bool supports_nulls = false; \ + static inline bool isnull(T v) { \ + return false; \ + } \ + }; + +NPY_INT_DECL(INT8, Int8, int8_t); +NPY_INT_DECL(INT16, Int16, int16_t); +NPY_INT_DECL(INT32, Int32, int32_t); +NPY_INT_DECL(INT64, Int64, int64_t); +NPY_INT_DECL(UINT8, UInt8, uint8_t); +NPY_INT_DECL(UINT16, UInt16, uint16_t); +NPY_INT_DECL(UINT32, UInt32, uint32_t); +NPY_INT_DECL(UINT64, UInt64, uint64_t); + +template <> +struct npy_traits { + typedef float value_type; + using ArrayType = arrow::FloatArray; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(float v) { + return v != v; + } +}; + +template <> +struct npy_traits { + typedef double value_type; + using ArrayType = arrow::DoubleArray; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(double v) { + return v != v; + } +}; + +template <> +struct npy_traits { + typedef PyObject* value_type; + static constexpr bool supports_nulls = true; +}; + +template +class ArrowSerializer { + public: + ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) : + pool_(pool), + arr_(arr), + mask_(mask) { + length_ = PyArray_SIZE(arr_); + } + + Status Convert(std::shared_ptr* out); + + int stride() const { + return PyArray_STRIDES(arr_)[0]; + } + + Status InitNullBitmap() { + int null_bytes = util::bytes_for_bits(length_); + + null_bitmap_ = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(null_bitmap_->Resize(null_bytes)); + + null_bitmap_data_ = null_bitmap_->mutable_data(); + memset(null_bitmap_data_, 0, null_bytes); + + return Status::OK(); + } + + bool is_strided() const { + npy_intp* astrides = PyArray_STRIDES(arr_); + return astrides[0] != PyArray_DESCR(arr_)->elsize; + } + + private: + Status ConvertData(); + + Status ConvertObjectStrings(std::shared_ptr* out) { + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + auto offsets_buffer = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(offsets_buffer->Resize(sizeof(int32_t) * (length_ + 1))); + int32_t* offsets = reinterpret_cast(offsets_buffer->mutable_data()); + + arrow::BufferBuilder data_builder(pool_); + arrow::Status s; + PyObject* obj; + int length; + int offset = 0; + int64_t null_count = 0; + for (int64_t i = 0; i < length_; ++i) { + obj = objects[i]; + if (PyUnicode_Check(obj)) { + obj = PyUnicode_AsUTF8String(obj); + if (obj == NULL) { + PyErr_Clear(); + return Status::TypeError("failed converting unicode to UTF8"); + } + length = PyBytes_GET_SIZE(obj); + s = data_builder.Append( + reinterpret_cast(PyBytes_AS_STRING(obj)), length); + Py_DECREF(obj); + if (!s.ok()) { + return Status::ArrowError(s.ToString()); + } + util::set_bit(null_bitmap_data_, i); + } else if (PyBytes_Check(obj)) { + length = PyBytes_GET_SIZE(obj); + RETURN_ARROW_NOT_OK(data_builder.Append( + reinterpret_cast(PyBytes_AS_STRING(obj)), length)); + util::set_bit(null_bitmap_data_, i); + } else { + // NULL + // No change to offset + length = 0; + ++null_count; + } + offsets[i] = offset; + offset += length; + } + // End offset + offsets[length_] = offset; + + std::shared_ptr data_buffer = data_builder.Finish(); + + auto values = std::make_shared(data_buffer->size(), + data_buffer); + *out = std::shared_ptr( + new arrow::StringArray(length_, offsets_buffer, values, null_count, + null_bitmap_)); + + return Status::OK(); + } + + Status ConvertBooleans(std::shared_ptr* out) { + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + int nbytes = util::bytes_for_bits(length_); + auto data = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(data->Resize(nbytes)); + uint8_t* bitmap = data->mutable_data(); + memset(bitmap, 0, nbytes); + + int64_t null_count = 0; + for (int64_t i = 0; i < length_; ++i) { + if (objects[i] == Py_True) { + util::set_bit(bitmap, i); + util::set_bit(null_bitmap_data_, i); + } else if (objects[i] != Py_False) { + ++null_count; + } else { + util::set_bit(null_bitmap_data_, i); + } + } + + *out = std::make_shared(length_, data, null_count, + null_bitmap_); + + return Status::OK(); + } + + arrow::MemoryPool* pool_; + + PyArrayObject* arr_; + PyArrayObject* mask_; + + int64_t length_; + + std::shared_ptr data_; + std::shared_ptr null_bitmap_; + uint8_t* null_bitmap_data_; +}; + +// Returns null count +static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { + int64_t null_count = 0; + const uint8_t* mask_values = static_cast(PyArray_DATA(mask)); + // TODO(wesm): strided null mask + for (int i = 0; i < length; ++i) { + if (mask_values[i]) { + ++null_count; + } else { + util::set_bit(bitmap, i); + } + } + return null_count; +} + +template +static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) { + typedef npy_traits traits; + typedef typename traits::value_type T; + + int64_t null_count = 0; + const T* values = reinterpret_cast(data); + + // TODO(wesm): striding + for (int i = 0; i < length; ++i) { + if (traits::isnull(values[i])) { + ++null_count; + } else { + util::set_bit(bitmap, i); + } + } + + return null_count; +} + +template +inline Status ArrowSerializer::Convert(std::shared_ptr* out) { + typedef npy_traits traits; + + if (mask_ != nullptr || traits::supports_nulls) { + RETURN_NOT_OK(InitNullBitmap()); + } + + int64_t null_count = 0; + if (mask_ != nullptr) { + null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); + } else if (traits::supports_nulls) { + null_count = ValuesToBitmap(PyArray_DATA(arr_), length_, null_bitmap_data_); + } + + RETURN_NOT_OK(ConvertData()); + *out = std::make_shared(length_, data_, null_count, + null_bitmap_); + + return Status::OK(); +} + +PyObject* numpy_nan = nullptr; + +static inline bool PyObject_is_null(const PyObject* obj) { + return obj == Py_None || obj == numpy_nan; +} + +static inline bool PyObject_is_string(const PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + return PyString_Check(obj) || PyBytes_Check(obj); +#else + return PyString_Check(obj) || PyUnicode_Check(obj); +#endif +} + +static inline bool PyObject_is_bool(const PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + return PyString_Check(obj) || PyBytes_Check(obj); +#else + return PyString_Check(obj) || PyUnicode_Check(obj); +#endif +} + +template <> +inline Status ArrowSerializer::Convert(std::shared_ptr* out) { + // Python object arrays are annoying, since we could have one of: + // + // * Strings + // * Booleans with nulls + // * Mixed type (not supported at the moment by arrow format) + // + // Additionally, nulls may be encoded either as np.nan or None. So we have to + // do some type inference and conversion + + RETURN_NOT_OK(InitNullBitmap()); + + // TODO: mask not supported here + const PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + for (int64_t i = 0; i < length_; ++i) { + if (PyObject_is_null(objects[i])) { + continue; + } else if (PyObject_is_string(objects[i])) { + return ConvertObjectStrings(out); + } else if (PyBool_Check(objects[i])) { + return ConvertBooleans(out); + } else { + return Status::TypeError("unhandled python type"); + } + } + + return Status::TypeError("Unable to infer type of object array, were all null"); +} + +class NumPyBuffer : public arrow::Buffer { + public: + NumPyBuffer(PyArrayObject* arr) : + Buffer(nullptr, 0) { + arr_ = arr; + Py_INCREF(arr); + + data_ = reinterpret_cast(PyArray_DATA(arr_)); + size_ = PyArray_SIZE(arr_); + } + + virtual ~NumPyBuffer() { + Py_XDECREF(arr_); + } + + private: + PyArrayObject* arr_; +}; + +template +inline Status ArrowSerializer::ConvertData() { + // TODO(wesm): strided arrays + if (is_strided()) { + return Status::ValueError("no support for strided data yet"); + } + + data_ = std::make_shared(arr_); + return Status::OK(); +} + +template <> +inline Status ArrowSerializer::ConvertData() { + if (is_strided()) { + return Status::ValueError("no support for strided data yet"); + } + + int nbytes = util::bytes_for_bits(length_); + auto buffer = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(buffer->Resize(nbytes)); + + const uint8_t* values = reinterpret_cast(PyArray_DATA(arr_)); + + uint8_t* bitmap = buffer->mutable_data(); + + memset(bitmap, 0, nbytes); + for (int i = 0; i < length_; ++i) { + if (values[i] > 0) { + util::set_bit(bitmap, i); + } + } + + data_ = buffer; + + return Status::OK(); +} + +template <> +inline Status ArrowSerializer::ConvertData() { + return Status::TypeError("NYI"); +} + + +#define TO_ARROW_CASE(TYPE) \ + case NPY_##TYPE: \ + { \ + ArrowSerializer converter(pool, arr, mask); \ + RETURN_NOT_OK(converter.Convert(out)); \ + } \ + break; + +Status pandas_masked_to_primitive(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, + std::shared_ptr* out) { + PyArrayObject* arr = reinterpret_cast(ao); + PyArrayObject* mask = nullptr; + + if (mo != nullptr) { + mask = reinterpret_cast(mo); + } + + if (PyArray_NDIM(arr) != 1) { + return Status::ValueError("only handle 1-dimensional arrays"); + } + + switch(PyArray_DESCR(arr)->type_num) { + TO_ARROW_CASE(BOOL); + TO_ARROW_CASE(INT8); + TO_ARROW_CASE(INT16); + TO_ARROW_CASE(INT32); + TO_ARROW_CASE(INT64); + TO_ARROW_CASE(UINT8); + TO_ARROW_CASE(UINT16); + TO_ARROW_CASE(UINT32); + TO_ARROW_CASE(UINT64); + TO_ARROW_CASE(FLOAT32); + TO_ARROW_CASE(FLOAT64); + TO_ARROW_CASE(OBJECT); + default: + std::stringstream ss; + ss << "unsupported type " << PyArray_DESCR(arr)->type_num + << std::endl; + return Status::NotImplemented(ss.str()); + } + return Status::OK(); +} + +Status pandas_to_primitive(arrow::MemoryPool* pool, PyObject* ao, + std::shared_ptr* out) { + return pandas_masked_to_primitive(pool, ao, nullptr, out); +} + +// ---------------------------------------------------------------------- +// Deserialization + +template +struct arrow_traits { +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_BOOL; + static constexpr bool supports_nulls = false; + static constexpr bool is_boolean = true; + static constexpr bool is_integer = false; + static constexpr bool is_floating = false; +}; + +#define INT_DECL(TYPE) \ + template <> \ + struct arrow_traits { \ + static constexpr int npy_type = NPY_##TYPE; \ + static constexpr bool supports_nulls = false; \ + static constexpr double na_value = NAN; \ + static constexpr bool is_boolean = false; \ + static constexpr bool is_integer = true; \ + static constexpr bool is_floating = false; \ + typedef typename npy_traits::value_type T; \ + }; + +INT_DECL(INT8); +INT_DECL(INT16); +INT_DECL(INT32); +INT_DECL(INT64); +INT_DECL(UINT8); +INT_DECL(UINT16); +INT_DECL(UINT32); +INT_DECL(UINT64); + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT32; + static constexpr bool supports_nulls = true; + static constexpr float na_value = NAN; + static constexpr bool is_boolean = false; + static constexpr bool is_integer = false; + static constexpr bool is_floating = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT64; + static constexpr bool supports_nulls = true; + static constexpr double na_value = NAN; + static constexpr bool is_boolean = false; + static constexpr bool is_integer = false; + static constexpr bool is_floating = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + static constexpr bool is_boolean = false; + static constexpr bool is_integer = false; + static constexpr bool is_floating = false; +}; + + +static inline PyObject* make_pystring(const uint8_t* data, int32_t length) { +#if PY_MAJOR_VERSION >= 3 + return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); +#else + return PyString_FromStringAndSize(reinterpret_cast(data), length); +#endif +} + +template +class ArrowDeserializer { + public: + ArrowDeserializer(const std::shared_ptr& arr) : + arr_(arr) {} + + PyObject* Convert() { + ConvertValues(); + return reinterpret_cast(out_); + } + + template + inline typename std::enable_if::is_floating, void>::type + ConvertValues() { + typedef typename arrow_traits::T T; + + arrow::PrimitiveArray* prim_arr = static_cast( + arr_.get()); + + npy_intp dims[1] = {arr_->length()}; + out_ = reinterpret_cast( + PyArray_SimpleNew(1, dims, arrow_traits::npy_type)); + + if (out_ == NULL) { + // Error occurred, trust that SimpleNew set the error state + return; + } + + if (arr_->null_count() > 0) { + T* out_values = reinterpret_cast(PyArray_DATA(out_)); + const T* in_values = reinterpret_cast(prim_arr->data()->data()); + for (int64_t i = 0; i < arr_->length(); ++i) { + out_values[i] = arr_->IsNull(i) ? NAN : in_values[i]; + } + } else { + memcpy(PyArray_DATA(out_), prim_arr->data()->data(), + arr_->length() * arr_->type()->value_size()); + } + } + + // Integer specialization + template + inline typename std::enable_if::is_integer, void>::type + ConvertValues() { + typedef typename arrow_traits::T T; + + arrow::PrimitiveArray* prim_arr = static_cast( + arr_.get()); + + const T* in_values = reinterpret_cast(prim_arr->data()->data()); + + npy_intp dims[1] = {arr_->length()}; + if (arr_->null_count() > 0) { + out_ = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_FLOAT64)); + if (out_ == NULL) return; + + // Upcast to double, set NaN as appropriate + double* out_values = reinterpret_cast(PyArray_DATA(out_)); + for (int i = 0; i < arr_->length(); ++i) { + out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i]; + } + } else { + out_ = reinterpret_cast( + PyArray_SimpleNew(1, dims, arrow_traits::npy_type)); + if (out_ == NULL) return; + memcpy(PyArray_DATA(out_), in_values, + arr_->length() * arr_->type()->value_size()); + } + } + + // Boolean specialization + template + inline typename std::enable_if::is_boolean, void>::type + ConvertValues() { + npy_intp dims[1] = {arr_->length()}; + + arrow::BooleanArray* bool_arr = static_cast(arr_.get()); + + if (arr_->null_count() > 0) { + out_ = reinterpret_cast( + PyArray_SimpleNew(1, dims, NPY_OBJECT)); + if (out_ == NULL) return; + PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); + for (int64_t i = 0; i < arr_->length(); ++i) { + if (bool_arr->IsNull(i)) { + Py_INCREF(Py_None); + out_values[i] = Py_None; + } else if (bool_arr->Value(i)) { + // True + Py_INCREF(Py_True); + out_values[i] = Py_True; + } else { + // False + Py_INCREF(Py_False); + out_values[i] = Py_False; + } + } + } else { + out_ = reinterpret_cast( + PyArray_SimpleNew(1, dims, arrow_traits::npy_type)); + if (out_ == NULL) return; + + uint8_t* out_values = reinterpret_cast(PyArray_DATA(out_)); + for (int64_t i = 0; i < arr_->length(); ++i) { + out_values[i] = static_cast(bool_arr->Value(i)); + } + } + } + + // UTF8 + template + inline typename std::enable_if::type + ConvertValues() { + npy_intp dims[1] = {arr_->length()}; + out_ = reinterpret_cast( + PyArray_SimpleNew(1, dims, NPY_OBJECT)); + if (out_ == NULL) return; + PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); + + arrow::StringArray* string_arr = static_cast(arr_.get()); + + const uint8_t* data; + int32_t length; + if (arr_->null_count() > 0) { + for (int64_t i = 0; i < arr_->length(); ++i) { + if (string_arr->IsNull(i)) { + Py_INCREF(Py_None); + out_values[i] = Py_None; + } else { + data = string_arr->GetValue(i, &length); + out_values[i] = make_pystring(data, length); + if (out_values[i] == nullptr) return; + } + } + } else { + for (int64_t i = 0; i < arr_->length(); ++i) { + data = string_arr->GetValue(i, &length); + out_values[i] = make_pystring(data, length); + if (out_values[i] == nullptr) return; + } + } + } + private: + std::shared_ptr arr_; + PyArrayObject* out_; +}; + +#define FROM_ARROW_CASE(TYPE) \ + case arrow::Type::TYPE: \ + { \ + ArrowDeserializer converter(arr); \ + return converter.Convert(); \ + } \ + break; + +PyObject* primitive_to_pandas(const std::shared_ptr& arr) { + switch(arr->type_enum()) { + FROM_ARROW_CASE(BOOL); + FROM_ARROW_CASE(INT8); + FROM_ARROW_CASE(INT16); + FROM_ARROW_CASE(INT32); + FROM_ARROW_CASE(INT64); + FROM_ARROW_CASE(UINT8); + FROM_ARROW_CASE(UINT16); + FROM_ARROW_CASE(UINT32); + FROM_ARROW_CASE(UINT64); + FROM_ARROW_CASE(FLOAT); + FROM_ARROW_CASE(DOUBLE); + FROM_ARROW_CASE(STRING); + default: + break; + } + PyErr_SetString(PyExc_NotImplementedError, + "Arrow type reading not implemented"); + return NULL; +} + +} // namespace pyarrow From f0cc45184b8b90528ec6c7c70237c1b6fa27de38 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 27 Mar 2016 15:23:34 -0700 Subject: [PATCH 2/6] Give libpyarrow a reference to numpy.nan --- python/CMakeLists.txt | 2 +- python/pyarrow/config.pyx | 6 ++++- python/src/pyarrow/adapters/pandas.cc | 18 ++----------- python/src/pyarrow/{init.cc => config.cc} | 11 +++++++- python/src/pyarrow/{init.h => config.h} | 33 ++++++++++++++++++++--- 5 files changed, 48 insertions(+), 22 deletions(-) rename python/src/pyarrow/{init.cc => config.cc} (84%) rename python/src/pyarrow/{init.h => config.h} (57%) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index dc8d87888ff..ebe825f65c4 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -412,8 +412,8 @@ add_subdirectory(src/pyarrow/util) set(PYARROW_SRCS src/pyarrow/common.cc + src/pyarrow/config.cc src/pyarrow/helpers.cc - src/pyarrow/init.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx index 521bc066cd4..2ba8cbf05bd 100644 --- a/python/pyarrow/config.pyx +++ b/python/pyarrow/config.pyx @@ -2,7 +2,11 @@ # distutils: language = c++ # cython: embedsignature = True -cdef extern from 'pyarrow/init.h' namespace 'pyarrow': +cdef extern from 'pyarrow/config.h' namespace 'pyarrow': void pyarrow_init() + void pyarrow_set_numpy_nan(object o) pyarrow_init() + +import numpy as np +pyarrow_set_numpy_nan(np.nan) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index c8dc3ba9532..56de6e4d2b2 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -25,23 +25,11 @@ #include #include -#include - -#ifdef NPY_1_7_API_VERSION -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#else -#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED -#define NPY_ARRAY_ALIGNED NPY_ALIGNED -#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE -#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY -#endif - -#include - #include "arrow/api.h" #include "arrow/util/bit-util.h" #include "pyarrow/common.h" +#include "pyarrow/config.h" #include "pyarrow/status.h" namespace pyarrow { @@ -307,15 +295,13 @@ inline Status ArrowSerializer::Convert(std::shared_ptr* out) { return Status::OK(); } -PyObject* numpy_nan = nullptr; - static inline bool PyObject_is_null(const PyObject* obj) { return obj == Py_None || obj == numpy_nan; } static inline bool PyObject_is_string(const PyObject* obj) { #if PY_MAJOR_VERSION >= 3 - return PyString_Check(obj) || PyBytes_Check(obj); + return PyUnicode_Check(obj) || PyBytes_Check(obj); #else return PyString_Check(obj) || PyUnicode_Check(obj); #endif diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/config.cc similarity index 84% rename from python/src/pyarrow/init.cc rename to python/src/pyarrow/config.cc index acd851e1687..730d2db99a5 100644 --- a/python/src/pyarrow/init.cc +++ b/python/src/pyarrow/config.cc @@ -15,11 +15,20 @@ // specific language governing permissions and limitations // under the License. -#include "pyarrow/init.h" +#include + +#include "pyarrow/config.h" namespace pyarrow { void pyarrow_init() { } +PyObject* numpy_nan = nullptr; + +void pyarrow_set_numpy_nan(PyObject* obj) { + Py_INCREF(obj); + numpy_nan = obj; +} + } // namespace pyarrow diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/config.h similarity index 57% rename from python/src/pyarrow/init.h rename to python/src/pyarrow/config.h index 71e67a20c1c..85fbe24355f 100644 --- a/python/src/pyarrow/init.h +++ b/python/src/pyarrow/config.h @@ -15,13 +15,40 @@ // specific language governing permissions and limitations // under the License. -#ifndef PYARROW_INIT_H -#define PYARROW_INIT_H +#ifndef PYARROW_CONFIG_H +#define PYARROW_CONFIG_H + +#include + +#include + +#ifdef NPY_1_7_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#else +#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED +#define NPY_ARRAY_ALIGNED NPY_ALIGNED +#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE +#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY +#endif + +#if PY_MAJOR_VERSION >= 3 + #define PyString_Check PyUnicode_Check +#endif + +#include + +#if PY_MAJOR_VERSION >= 3 + #define PyString_Check PyUnicode_Check +#endif namespace pyarrow { +extern PyObject* numpy_nan; + void pyarrow_init(); +void pyarrow_set_numpy_nan(PyObject* obj); + } // namespace pyarrow -#endif // PYARROW_INIT_H +#endif // PYARROW_CONFIG_H From d1f05c5807e56ec33f9d604d820488454f2fec96 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 27 Mar 2016 15:26:09 -0700 Subject: [PATCH 3/6] cpplint --- cpp/src/arrow/util/buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 2bf604f1160..c15f9b630cd 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -152,7 +152,7 @@ static constexpr int64_t MIN_BUFFER_CAPACITY = 1024; class BufferBuilder { public: - BufferBuilder(MemoryPool* pool) : + explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {} From 8475a0ebb4e45a6de381e8264ffee4136279102e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 27 Mar 2016 22:16:39 -0700 Subject: [PATCH 4/6] More pandas conversion scaffolding, enable libpyarrow to use the NumPy C API globally --- python/pyarrow/__init__.py | 6 +- python/pyarrow/array.pyx | 30 ++++ python/pyarrow/config.pyx | 7 + python/pyarrow/includes/common.pxd | 2 + python/pyarrow/includes/pyarrow.pxd | 7 + python/pyarrow/tests/test_convert_pandas.py | 168 ++++++++++++++++++++ python/src/pyarrow/adapters/pandas.cc | 119 +++++++------- python/src/pyarrow/adapters/pandas.h | 20 +++ python/src/pyarrow/common.h | 23 ++- python/src/pyarrow/config.h | 17 +- python/src/pyarrow/do_import_numpy.h | 21 +++ python/src/pyarrow/numpy_interop.h | 58 +++++++ 12 files changed, 397 insertions(+), 81 deletions(-) create mode 100644 python/pyarrow/tests/test_convert_pandas.py create mode 100644 python/src/pyarrow/do_import_numpy.h create mode 100644 python/src/pyarrow/numpy_interop.h diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9a080709beb..79a9f06f995 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -17,7 +17,11 @@ # flake8: noqa -from pyarrow.array import (Array, from_pylist, total_allocated_bytes, +import pyarrow.config + +from pyarrow.array import (Array, + from_pandas_series, from_pylist, + total_allocated_bytes, BooleanArray, NumericArray, Int8Array, UInt8Array, ListArray, StringArray) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index c5d40ddd7a4..e2c51d0512f 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -22,6 +22,8 @@ from pyarrow.includes.libarrow cimport * cimport pyarrow.includes.pyarrow as pyarrow +import pyarrow.config + from pyarrow.compat import frombytes, tobytes from pyarrow.error cimport check_status @@ -194,6 +196,34 @@ def from_pylist(object list_obj, DataType type=None): return box_arrow_array(sp_array) + +def from_pandas_series(object series, object mask=None): + cdef: + shared_ptr[CArray] out + + series_values = series_as_ndarray(series) + + if mask is None: + check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(), + series_values, &out)) + else: + mask = series_as_ndarray(mask) + check_status(pyarrow.PandasMaskedToArrow( + pyarrow.GetMemoryPool(), series_values, mask, &out)) + + return box_arrow_array(out) + + +cdef object series_as_ndarray(object obj): + import pandas as pd + + if isinstance(obj, pd.Series): + result = obj.values + else: + result = obj + + return result + #---------------------------------------------------------------------- # Table-like data structures diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx index 2ba8cbf05bd..1047a472fe3 100644 --- a/python/pyarrow/config.pyx +++ b/python/pyarrow/config.pyx @@ -2,10 +2,17 @@ # distutils: language = c++ # cython: embedsignature = True +cdef extern from 'pyarrow/do_import_numpy.h': + pass + +cdef extern from 'pyarrow/numpy_interop.h' namespace 'pyarrow': + int import_numpy() + cdef extern from 'pyarrow/config.h' namespace 'pyarrow': void pyarrow_init() void pyarrow_set_numpy_nan(object o) +import_numpy() pyarrow_init() import numpy as np diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 839427a6990..6ab67a50e4a 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -22,6 +22,8 @@ from libcpp cimport bool as c_bool from libcpp.string cimport string as c_string from libcpp.vector cimport vector +from cpython cimport PyObject + # This must be included for cerr and other things to work cdef extern from "": pass diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index eedfc854468..9cd54035a12 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -41,4 +41,11 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: shared_ptr[CDataType] GetPrimitiveType(Type type) Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + Status PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CArray]* out) + Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, + shared_ptr[CArray]* out) + + Status ArrowToPandas(const shared_ptr[CArray]& arr, + PyObject** out) + MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py new file mode 100644 index 00000000000..dd70349785a --- /dev/null +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest + +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + +import pyarrow as A + + +class TestPandasConversion(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + pass + + def _check_pandas_roundtrip(self, df, expected=None): + table = A.from_pandas_dataframe(df) + result = table.to_pandas() + if expected is None: + expected = df + tm.assert_frame_equal(result, expected) + + def test_float_no_nulls(self): + data = {} + numpy_dtypes = ['f4', 'f8'] + num_values = 100 + + for dtype in numpy_dtypes: + values = np.random.randn(num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + self._check_pandas_roundtrip(df) + + def test_float_nulls(self): + num_values = 100 + + null_mask = np.random.randint(0, 10, size=num_values) < 3 + dtypes = ['f4', 'f8'] + expected_cols = [] + + arrays = [] + for name in dtypes: + values = np.random.randn(num_values).astype(name) + + arr = A.from_pandas_series(values, null_mask) + arrays.append(arr) + + values[null_mask] = np.nan + + expected_cols.append((name, values)) + + ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), + columns=dtypes) + + result = A.Table.from_arrays(dtypes, arrays) + tm.assert_frame_equal(result, ex_frame) + + def test_integer_no_nulls(self): + data = {} + + numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + num_values = 100 + + for dtype in numpy_dtypes: + info = np.iinfo(dtype) + values = np.random.randint(info.min, + min(info.max, np.iinfo('i8').max), + size=num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + self._check_pandas_roundtrip(df) + + def test_integer_with_nulls(self): + # pandas requires upcast to float dtype + + int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + num_values = 100 + + null_mask = np.random.randint(0, 10, size=num_values) < 3 + + expected_cols = [] + arrays = [] + for name in int_dtypes: + values = np.random.randint(0, 100, size=num_values) + + arr = A.from_pandas_series(values, null_mask) + arrays.append(arr) + + expected = values.astype('f8') + expected[null_mask] = np.nan + + expected_cols.append(expected) + + ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), + columns=int_dtypes) + + result = A.Table.from_arrays(int_dtypes, arrays) + + tm.assert_frame_equal(result, ex_frame) + + def test_boolean_no_nulls(self): + num_values = 100 + + np.random.seed(0) + + df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) + self._check_pandas_roundtrip(df) + + def test_boolean_nulls(self): + # pandas requires upcast to object dtype + num_values = 100 + np.random.seed(0) + + mask = np.random.randint(0, 10, size=num_values) < 3 + values = np.random.randint(0, 10, size=num_values) < 5 + + arr = A.from_pandas_series(values, mask) + + expected = values.astype(object) + expected[mask] = None + + ex_frame = pd.DataFrame({'bools': expected}) + + result = A.Table.from_arrays(['bools'], [arr]) + tm.assert_frame_equal(result, ex_frame) + + def test_boolean_object_nulls(self): + arr = np.array([False, None, True] * 100, dtype=object) + df = pd.DataFrame({'bools': arr}) + self._check_pandas_roundtrip(df) + + def test_strings(self): + repeats = 1000 + values = [b'foo', None, u'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + + values = ['foo', None, u'bar', 'qux', None] + expected = pd.DataFrame({'strings': values * repeats}) + self._check_pandas_roundtrip(df, expected) + + # def test_category(self): + # repeats = 1000 + # values = [b'foo', None, u'bar', 'qux', np.nan] + # df = pd.DataFrame({'strings': values * repeats}) + # df['strings'] = df['strings'].astype('category') + # self._check_pandas_roundtrip(df) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 56de6e4d2b2..c25efb2fdb3 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -19,6 +19,8 @@ #include +#include "pyarrow/numpy_interop.h" + #include #include #include @@ -346,25 +348,6 @@ inline Status ArrowSerializer::Convert(std::shared_ptr* out) return Status::TypeError("Unable to infer type of object array, were all null"); } -class NumPyBuffer : public arrow::Buffer { - public: - NumPyBuffer(PyArrayObject* arr) : - Buffer(nullptr, 0) { - arr_ = arr; - Py_INCREF(arr); - - data_ = reinterpret_cast(PyArray_DATA(arr_)); - size_ = PyArray_SIZE(arr_); - } - - virtual ~NumPyBuffer() { - Py_XDECREF(arr_); - } - - private: - PyArrayObject* arr_; -}; - template inline Status ArrowSerializer::ConvertData() { // TODO(wesm): strided arrays @@ -416,7 +399,7 @@ inline Status ArrowSerializer::ConvertData() { } \ break; -Status pandas_masked_to_primitive(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, +Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out) { PyArrayObject* arr = reinterpret_cast(ao); PyArrayObject* mask = nullptr; @@ -451,9 +434,9 @@ Status pandas_masked_to_primitive(arrow::MemoryPool* pool, PyObject* ao, PyObjec return Status::OK(); } -Status pandas_to_primitive(arrow::MemoryPool* pool, PyObject* ao, +Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr* out) { - return pandas_masked_to_primitive(pool, ao, nullptr, out); + return PandasMaskedToArrow(pool, ao, nullptr, out); } // ---------------------------------------------------------------------- @@ -539,27 +522,34 @@ class ArrowDeserializer { ArrowDeserializer(const std::shared_ptr& arr) : arr_(arr) {} - PyObject* Convert() { - ConvertValues(); - return reinterpret_cast(out_); + Status Convert(PyObject** out) { + RETURN_NOT_OK(ConvertValues()); + *out = reinterpret_cast(out_); + return Status::OK(); + } + + Status AllocateOutput(int type) { + npy_intp dims[1] = {arr_->length()}; + out_ = reinterpret_cast(PyArray_SimpleNew(1, dims, type)); + + if (out_ == NULL) { + // Error occurred, trust that SimpleNew set the error state + return Status::OK(); + } + + return Status::OK(); } template - inline typename std::enable_if::is_floating, void>::type + inline typename std::enable_if< + arrow_traits::is_floating, Status>::type ConvertValues() { typedef typename arrow_traits::T T; arrow::PrimitiveArray* prim_arr = static_cast( arr_.get()); - npy_intp dims[1] = {arr_->length()}; - out_ = reinterpret_cast( - PyArray_SimpleNew(1, dims, arrow_traits::npy_type)); - - if (out_ == NULL) { - // Error occurred, trust that SimpleNew set the error state - return; - } + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); if (arr_->null_count() > 0) { T* out_values = reinterpret_cast(PyArray_DATA(out_)); @@ -571,11 +561,14 @@ class ArrowDeserializer { memcpy(PyArray_DATA(out_), prim_arr->data()->data(), arr_->length() * arr_->type()->value_size()); } + + return Status::OK(); } // Integer specialization template - inline typename std::enable_if::is_integer, void>::type + inline typename std::enable_if< + arrow_traits::is_integer, Status>::type ConvertValues() { typedef typename arrow_traits::T T; @@ -584,10 +577,8 @@ class ArrowDeserializer { const T* in_values = reinterpret_cast(prim_arr->data()->data()); - npy_intp dims[1] = {arr_->length()}; if (arr_->null_count() > 0) { - out_ = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_FLOAT64)); - if (out_ == NULL) return; + RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); // Upcast to double, set NaN as appropriate double* out_values = reinterpret_cast(PyArray_DATA(out_)); @@ -595,26 +586,25 @@ class ArrowDeserializer { out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i]; } } else { - out_ = reinterpret_cast( - PyArray_SimpleNew(1, dims, arrow_traits::npy_type)); - if (out_ == NULL) return; + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + memcpy(PyArray_DATA(out_), in_values, arr_->length() * arr_->type()->value_size()); } + + return Status::OK(); } // Boolean specialization template - inline typename std::enable_if::is_boolean, void>::type + inline typename std::enable_if< + arrow_traits::is_boolean, Status>::type ConvertValues() { - npy_intp dims[1] = {arr_->length()}; - arrow::BooleanArray* bool_arr = static_cast(arr_.get()); if (arr_->null_count() > 0) { - out_ = reinterpret_cast( - PyArray_SimpleNew(1, dims, NPY_OBJECT)); - if (out_ == NULL) return; + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); for (int64_t i = 0; i < arr_->length(); ++i) { if (bool_arr->IsNull(i)) { @@ -631,25 +621,24 @@ class ArrowDeserializer { } } } else { - out_ = reinterpret_cast( - PyArray_SimpleNew(1, dims, arrow_traits::npy_type)); - if (out_ == NULL) return; + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); uint8_t* out_values = reinterpret_cast(PyArray_DATA(out_)); for (int64_t i = 0; i < arr_->length(); ++i) { out_values[i] = static_cast(bool_arr->Value(i)); } } + + return Status::OK(); } // UTF8 template - inline typename std::enable_if::type + inline typename std::enable_if< + T2 == arrow::Type::STRING, Status>::type ConvertValues() { - npy_intp dims[1] = {arr_->length()}; - out_ = reinterpret_cast( - PyArray_SimpleNew(1, dims, NPY_OBJECT)); - if (out_ == NULL) return; + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); arrow::StringArray* string_arr = static_cast(arr_.get()); @@ -663,17 +652,23 @@ class ArrowDeserializer { out_values[i] = Py_None; } else { data = string_arr->GetValue(i, &length); + out_values[i] = make_pystring(data, length); - if (out_values[i] == nullptr) return; + if (out_values[i] == nullptr) { + return Status::OK(); + } } } } else { for (int64_t i = 0; i < arr_->length(); ++i) { data = string_arr->GetValue(i, &length); out_values[i] = make_pystring(data, length); - if (out_values[i] == nullptr) return; + if (out_values[i] == nullptr) { + return Status::OK(); + } } } + return Status::OK(); } private: std::shared_ptr arr_; @@ -684,11 +679,11 @@ class ArrowDeserializer { case arrow::Type::TYPE: \ { \ ArrowDeserializer converter(arr); \ - return converter.Convert(); \ + return converter.Convert(out); \ } \ break; -PyObject* primitive_to_pandas(const std::shared_ptr& arr) { +Status ArrowToPandas(const std::shared_ptr& arr, PyObject** out) { switch(arr->type_enum()) { FROM_ARROW_CASE(BOOL); FROM_ARROW_CASE(INT8); @@ -703,11 +698,9 @@ PyObject* primitive_to_pandas(const std::shared_ptr& arr) { FROM_ARROW_CASE(DOUBLE); FROM_ARROW_CASE(STRING); default: - break; + return Status::NotImplemented("Arrow type reading not implemented"); } - PyErr_SetString(PyExc_NotImplementedError, - "Arrow type reading not implemented"); - return NULL; + return Status::OK(); } } // namespace pyarrow diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index a4f41638087..53200e59da6 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -21,8 +21,28 @@ #ifndef PYARROW_ADAPTERS_PANDAS_H #define PYARROW_ADAPTERS_PANDAS_H +#include + +#include + +namespace arrow { + +class Array; + +} // namespace arrow + namespace pyarrow { +class Status; + +Status ArrowToPandas(const std::shared_ptr& arr, PyObject** out); + +Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, + std::shared_ptr* out); + +Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, + std::shared_ptr* out); + } // namespace pyarrow #endif // PYARROW_ADAPTERS_PANDAS_H diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index db6361384c1..cc9ad9ec5bb 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -18,7 +18,9 @@ #ifndef PYARROW_COMMON_H #define PYARROW_COMMON_H -#include +#include "pyarrow/config.h" + +#include "arrow/util/buffer.h" namespace arrow { class MemoryPool; } @@ -90,6 +92,25 @@ struct PyObjectStringify { arrow::MemoryPool* GetMemoryPool(); +class NumPyBuffer : public arrow::Buffer { + public: + NumPyBuffer(PyArrayObject* arr) : + Buffer(nullptr, 0) { + arr_ = arr; + Py_INCREF(arr); + + data_ = reinterpret_cast(PyArray_DATA(arr_)); + size_ = PyArray_SIZE(arr_); + } + + virtual ~NumPyBuffer() { + Py_XDECREF(arr_); + } + + private: + PyArrayObject* arr_; +}; + } // namespace pyarrow #endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h index 85fbe24355f..48ae715d842 100644 --- a/python/src/pyarrow/config.h +++ b/python/src/pyarrow/config.h @@ -20,22 +20,7 @@ #include -#include - -#ifdef NPY_1_7_API_VERSION -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#else -#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED -#define NPY_ARRAY_ALIGNED NPY_ALIGNED -#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE -#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY -#endif - -#if PY_MAJOR_VERSION >= 3 - #define PyString_Check PyUnicode_Check -#endif - -#include +#include "pyarrow/numpy_interop.h" #if PY_MAJOR_VERSION >= 3 #define PyString_Check PyUnicode_Check diff --git a/python/src/pyarrow/do_import_numpy.h b/python/src/pyarrow/do_import_numpy.h new file mode 100644 index 00000000000..bb4a3829591 --- /dev/null +++ b/python/src/pyarrow/do_import_numpy.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Trick borrowed from dynd-python for initializing the NumPy array API + +// Trigger the array import (inversion of NO_IMPORT_ARRAY) +#define NUMPY_IMPORT_ARRAY diff --git a/python/src/pyarrow/numpy_interop.h b/python/src/pyarrow/numpy_interop.h new file mode 100644 index 00000000000..882d287c7c5 --- /dev/null +++ b/python/src/pyarrow/numpy_interop.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_NUMPY_INTEROP_H +#define PYARROW_NUMPY_INTEROP_H + +#include + +#include + +// Don't use the deprecated Numpy functions +#ifdef NPY_1_7_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#else +#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED +#define NPY_ARRAY_ALIGNED NPY_ALIGNED +#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE +#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY +#endif + +// This is required to be able to access the NumPy C API properly in C++ files +// other than this main one +#define PY_ARRAY_UNIQUE_SYMBOL pyarrow_ARRAY_API +#ifndef NUMPY_IMPORT_ARRAY +#define NO_IMPORT_ARRAY +#endif + +#include +#include + +namespace pyarrow { + +inline int import_numpy() { +#ifdef NUMPY_IMPORT_ARRAY + import_array1(-1); + import_umath1(-1); +#endif + + return 0; +} + +} // namespace pyarrow + +#endif // PYARROW_NUMPY_INTEROP_H From 4c9f76627887f201159322d5f5ff084bf0c2c59c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 28 Mar 2016 00:12:38 -0700 Subject: [PATCH 5/6] More scaffolding. Table wrapper. Initial unit tests passing --- cpp/README.md | 6 +- cpp/src/arrow/array.h | 9 +- python/pyarrow/__init__.py | 2 +- python/pyarrow/array.pyx | 105 ++++++++++++++++++++ python/pyarrow/includes/common.pxd | 4 + python/pyarrow/includes/libarrow.pxd | 52 +++++++--- python/pyarrow/includes/pyarrow.pxd | 6 +- python/pyarrow/tests/test_convert_pandas.py | 12 ++- python/src/pyarrow/adapters/pandas.cc | 66 ++++++------ python/src/pyarrow/adapters/pandas.h | 3 +- 10 files changed, 204 insertions(+), 61 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 542cce43a13..9026cf963f8 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -42,12 +42,12 @@ Detailed unit test logs will be placed in the build directory under `build/test- ### Building/Running benchmarks -Follow the directions for simple build except run cmake +Follow the directions for simple build except run cmake with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: cmake -DARROW_BUILD_BENCHMARKS=ON .. -and instead of make unittest run either `make; ctest` to run both unit tests +and instead of make unittest run either `make; ctest` to run both unit tests and benchmarks or `make runbenchmark` to run only the benchmark tests. Benchmark logs will be placed in the build directory under `build/benchmark-logs`. @@ -60,4 +60,4 @@ variables * Googletest: `GTEST_HOME` (only required to build the unit tests) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) - +* Flatbuffers: `FLATBUFFERS_HOME` (only required for the IPC extensions) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index f2992741852..097634d74f8 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -34,13 +34,10 @@ class Buffer; // // The base class is only required to have a null bitmap buffer if the null // count is greater than 0 -// -// Any buffers used to initialize the array have their references "stolen". If -// you wish to use the buffer beyond the lifetime of the array, you need to -// explicitly increment its reference count class Array { public: - Array(const TypePtr& type, int32_t length, int32_t null_count = 0, + Array(const std::shared_ptr& type, int32_t length, + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); virtual ~Array() {} @@ -68,7 +65,7 @@ class Array { virtual bool Equals(const std::shared_ptr& arr) const = 0; protected: - TypePtr type_; + std::shared_ptr type_; int32_t null_count_; int32_t length_; diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 79a9f06f995..c343f5ba5f1 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -41,4 +41,4 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.array import RowBatch +from pyarrow.array import RowBatch, Table, from_pandas_dataframe diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index e2c51d0512f..88770cdaa96 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -46,6 +46,10 @@ cdef class Array: self.type = DataType() self.type.init(self.sp_array.get().type()) + @staticmethod + def from_pandas(obj, mask=None): + return from_pandas_series(obj, mask) + property null_count: def __get__(self): @@ -162,7 +166,15 @@ cdef class StringArray(Array): cdef dict _array_classes = { Type_NA: NullArray, Type_BOOL: BooleanArray, + Type_UINT8: UInt8Array, + Type_UINT16: UInt16Array, + Type_UINT32: UInt32Array, + Type_UINT64: UInt64Array, + Type_INT8: Int8Array, + Type_INT16: Int16Array, + Type_INT32: Int32Array, Type_INT64: Int64Array, + Type_FLOAT: FloatArray, Type_DOUBLE: DoubleArray, Type_LIST: ListArray, Type_STRING: StringArray, @@ -214,6 +226,21 @@ def from_pandas_series(object series, object mask=None): return box_arrow_array(out) +def from_pandas_dataframe(object df, name=None): + cdef: + list names = [] + list arrays = [] + + for name in df.columns: + col = df[name] + arr = from_pandas_series(col) + + names.append(name) + arrays.append(arr) + + return Table.from_arrays(names, arrays, name=name) + + cdef object series_as_ndarray(object obj): import pandas as pd @@ -255,3 +282,81 @@ cdef class RowBatch: def __getitem__(self, i): return self.arrays[i] + + +cdef class Table: + ''' + Do not call this class's constructor directly. + ''' + cdef: + shared_ptr[CTable] sp_table + CTable* table + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CTable]& table): + self.sp_table = table + self.table = table.get() + + @staticmethod + def from_pandas(df, name=None): + pass + + @staticmethod + def from_arrays(names, arrays, name=None): + cdef: + Array arr + Table result + c_string c_name + vector[shared_ptr[CField]] fields + vector[shared_ptr[CColumn]] columns + shared_ptr[CSchema] schema + shared_ptr[CTable] table + + cdef int K = len(arrays) + + fields.resize(K) + columns.resize(K) + for i in range(K): + arr = arrays[i] + c_name = tobytes(names[i]) + + fields[i].reset(new CField(c_name, arr.type.sp_type, True)) + columns[i].reset(new CColumn(fields[i], arr.sp_array)) + + if name is None: + c_name = '' + else: + c_name = tobytes(name) + + schema.reset(new CSchema(fields)) + table.reset(new CTable(c_name, schema, columns)) + + result = Table() + result.init(table) + + return result + + def to_pandas(self): + """ + Convert the arrow::Table to a pandas DataFrame + """ + cdef: + PyObject* arr + shared_ptr[CColumn] col + + import pandas as pd + + names = [] + data = [] + for i in range(self.table.num_columns()): + col = self.table.column(i) + check_status(pyarrow.ArrowToPandas(col, &arr)) + names.append(frombytes(col.get().name())) + data.append( arr) + + # One ref count too many + Py_XDECREF(arr) + + return pd.DataFrame(dict(zip(names, data)), columns=names) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 6ab67a50e4a..e86d5d77e8b 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -23,11 +23,15 @@ from libcpp.string cimport string as c_string from libcpp.vector cimport vector from cpython cimport PyObject +cimport cpython # This must be included for cerr and other things to work cdef extern from "": pass +cdef extern from "": + void Py_XDECREF(PyObject* o) + cdef extern from "" namespace "std" nogil: cdef cppclass shared_ptr[T]: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 943a08f84a0..42f1f25073d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -19,6 +19,25 @@ from pyarrow.includes.common cimport * +cdef extern from "arrow/api.h" namespace "arrow" nogil: + # We can later add more of the common status factory methods as needed + cdef CStatus CStatus_OK "Status::OK"() + + cdef cppclass CStatus "arrow::Status": + CStatus() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsNotImplemented() + c_bool IsInvalid() + + cdef cppclass Buffer: + uint8_t* data() + int64_t size() + cdef extern from "arrow/api.h" namespace "arrow" nogil: enum Type" arrow::Type::type": @@ -129,25 +148,30 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStringArray" arrow::StringArray"(CListArray): c_string GetString(int i) + cdef cppclass CChunkedArray" arrow::ChunkedArray": + pass -cdef extern from "arrow/api.h" namespace "arrow" nogil: - # We can later add more of the common status factory methods as needed - cdef CStatus CStatus_OK "Status::OK"() + cdef cppclass CColumn" arrow::Column": + CColumn(const shared_ptr[CField]& field, + const shared_ptr[CArray]& data) - cdef cppclass CStatus "arrow::Status": - CStatus() + int64_t length() + int64_t null_count() + const c_string& name() + const shared_ptr[CDataType]& type() + const shared_ptr[CChunkedArray]& data() - c_string ToString() + cdef cppclass CTable" arrow::Table": + CTable(const c_string& name, const shared_ptr[CSchema]& schema, + const vector[shared_ptr[CColumn]]& columns) - c_bool ok() - c_bool IsOutOfMemory() - c_bool IsKeyError() - c_bool IsNotImplemented() - c_bool IsInvalid() + int num_columns() + int num_rows() - cdef cppclass Buffer: - uint8_t* data() - int64_t size() + const c_string& name() + + const shared_ptr[CSchema]& schema() + const shared_ptr[CColumn]& column(int i) cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil: diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 9cd54035a12..1066b8034be 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,7 +18,8 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport CArray, CDataType, Type, MemoryPool +from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, + Type, MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -45,7 +46,6 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, shared_ptr[CArray]* out) - Status ArrowToPandas(const shared_ptr[CArray]& arr, - PyObject** out) + Status ArrowToPandas(const shared_ptr[CColumn]& arr, PyObject** out) MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index dd70349785a..6dc9c689e24 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -68,12 +68,13 @@ def test_float_nulls(self): values[null_mask] = np.nan - expected_cols.append((name, values)) + expected_cols.append(values) ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), columns=dtypes) - result = A.Table.from_arrays(dtypes, arrays) + table = A.Table.from_arrays(dtypes, arrays) + result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) def test_integer_no_nulls(self): @@ -116,7 +117,8 @@ def test_integer_with_nulls(self): ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), columns=int_dtypes) - result = A.Table.from_arrays(int_dtypes, arrays) + table = A.Table.from_arrays(int_dtypes, arrays) + result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) @@ -143,7 +145,9 @@ def test_boolean_nulls(self): ex_frame = pd.DataFrame({'bools': expected}) - result = A.Table.from_arrays(['bools'], [arr]) + table = A.Table.from_arrays(['bools'], [arr]) + result = table.to_pandas() + tm.assert_frame_equal(result, ex_frame) def test_boolean_object_nulls(self): diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index c25efb2fdb3..22f1d7575f8 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -37,6 +37,7 @@ namespace pyarrow { using arrow::Array; +using arrow::Column; namespace util = arrow::util; // ---------------------------------------------------------------------- @@ -519,17 +520,24 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) { template class ArrowDeserializer { public: - ArrowDeserializer(const std::shared_ptr& arr) : - arr_(arr) {} + ArrowDeserializer(const std::shared_ptr& col) : + col_(col) {} Status Convert(PyObject** out) { - RETURN_NOT_OK(ConvertValues()); + const std::shared_ptr data = col_->data(); + if (data->num_chunks() > 1) { + return Status::NotImplemented("Chunked column conversion NYI"); + } + + auto chunk = data->chunk(0); + + RETURN_NOT_OK(ConvertValues(chunk)); *out = reinterpret_cast(out_); return Status::OK(); } Status AllocateOutput(int type) { - npy_intp dims[1] = {arr_->length()}; + npy_intp dims[1] = {col_->length()}; out_ = reinterpret_cast(PyArray_SimpleNew(1, dims, type)); if (out_ == NULL) { @@ -543,23 +551,23 @@ class ArrowDeserializer { template inline typename std::enable_if< arrow_traits::is_floating, Status>::type - ConvertValues() { + ConvertValues(const std::shared_ptr& arr) { typedef typename arrow_traits::T T; arrow::PrimitiveArray* prim_arr = static_cast( - arr_.get()); + arr.get()); RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); - if (arr_->null_count() > 0) { + if (arr->null_count() > 0) { T* out_values = reinterpret_cast(PyArray_DATA(out_)); const T* in_values = reinterpret_cast(prim_arr->data()->data()); - for (int64_t i = 0; i < arr_->length(); ++i) { - out_values[i] = arr_->IsNull(i) ? NAN : in_values[i]; + for (int64_t i = 0; i < arr->length(); ++i) { + out_values[i] = arr->IsNull(i) ? NAN : in_values[i]; } } else { memcpy(PyArray_DATA(out_), prim_arr->data()->data(), - arr_->length() * arr_->type()->value_size()); + arr->length() * arr->type()->value_size()); } return Status::OK(); @@ -569,27 +577,27 @@ class ArrowDeserializer { template inline typename std::enable_if< arrow_traits::is_integer, Status>::type - ConvertValues() { + ConvertValues(const std::shared_ptr& arr) { typedef typename arrow_traits::T T; arrow::PrimitiveArray* prim_arr = static_cast( - arr_.get()); + arr.get()); const T* in_values = reinterpret_cast(prim_arr->data()->data()); - if (arr_->null_count() > 0) { + if (arr->null_count() > 0) { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); // Upcast to double, set NaN as appropriate double* out_values = reinterpret_cast(PyArray_DATA(out_)); - for (int i = 0; i < arr_->length(); ++i) { + for (int i = 0; i < arr->length(); ++i) { out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i]; } } else { RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); memcpy(PyArray_DATA(out_), in_values, - arr_->length() * arr_->type()->value_size()); + arr->length() * arr->type()->value_size()); } return Status::OK(); @@ -599,14 +607,14 @@ class ArrowDeserializer { template inline typename std::enable_if< arrow_traits::is_boolean, Status>::type - ConvertValues() { - arrow::BooleanArray* bool_arr = static_cast(arr_.get()); + ConvertValues(const std::shared_ptr& arr) { + arrow::BooleanArray* bool_arr = static_cast(arr.get()); - if (arr_->null_count() > 0) { + if (arr->null_count() > 0) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); - for (int64_t i = 0; i < arr_->length(); ++i) { + for (int64_t i = 0; i < arr->length(); ++i) { if (bool_arr->IsNull(i)) { Py_INCREF(Py_None); out_values[i] = Py_None; @@ -624,7 +632,7 @@ class ArrowDeserializer { RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); uint8_t* out_values = reinterpret_cast(PyArray_DATA(out_)); - for (int64_t i = 0; i < arr_->length(); ++i) { + for (int64_t i = 0; i < arr->length(); ++i) { out_values[i] = static_cast(bool_arr->Value(i)); } } @@ -636,17 +644,17 @@ class ArrowDeserializer { template inline typename std::enable_if< T2 == arrow::Type::STRING, Status>::type - ConvertValues() { + ConvertValues(const std::shared_ptr& arr) { RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); - arrow::StringArray* string_arr = static_cast(arr_.get()); + arrow::StringArray* string_arr = static_cast(arr.get()); const uint8_t* data; int32_t length; - if (arr_->null_count() > 0) { - for (int64_t i = 0; i < arr_->length(); ++i) { + if (arr->null_count() > 0) { + for (int64_t i = 0; i < arr->length(); ++i) { if (string_arr->IsNull(i)) { Py_INCREF(Py_None); out_values[i] = Py_None; @@ -660,7 +668,7 @@ class ArrowDeserializer { } } } else { - for (int64_t i = 0; i < arr_->length(); ++i) { + for (int64_t i = 0; i < arr->length(); ++i) { data = string_arr->GetValue(i, &length); out_values[i] = make_pystring(data, length); if (out_values[i] == nullptr) { @@ -671,20 +679,20 @@ class ArrowDeserializer { return Status::OK(); } private: - std::shared_ptr arr_; + std::shared_ptr col_; PyArrayObject* out_; }; #define FROM_ARROW_CASE(TYPE) \ case arrow::Type::TYPE: \ { \ - ArrowDeserializer converter(arr); \ + ArrowDeserializer converter(col); \ return converter.Convert(out); \ } \ break; -Status ArrowToPandas(const std::shared_ptr& arr, PyObject** out) { - switch(arr->type_enum()) { +Status ArrowToPandas(const std::shared_ptr& col, PyObject** out) { + switch(col->type()->type) { FROM_ARROW_CASE(BOOL); FROM_ARROW_CASE(INT8); FROM_ARROW_CASE(INT16); diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index 53200e59da6..58eb3ca61cd 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -28,6 +28,7 @@ namespace arrow { class Array; +class Column; } // namespace arrow @@ -35,7 +36,7 @@ namespace pyarrow { class Status; -Status ArrowToPandas(const std::shared_ptr& arr, PyObject** out); +Status ArrowToPandas(const std::shared_ptr& col, PyObject** out); Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out); From 0a9e747fc7b84ffd84fff44142bcf88fba921951 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 28 Mar 2016 00:23:32 -0700 Subject: [PATCH 6/6] Invoke py.test with python -m pytest --- ci/travis_script_python.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index af6b0085724..d45b895d8cf 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -48,17 +48,11 @@ python_version_tests() { python setup.py build_ext --inplace - py.test -vv -r sxX pyarrow + python -m pytest -vv -r sxX pyarrow } # run tests for python 2.7 and 3.5 python_version_tests 2.7 python_version_tests 3.5 -# if [ $TRAVIS_OS_NAME == "linux" ]; then -# valgrind --tool=memcheck py.test -vv -r sxX arrow -# else -# py.test -vv -r sxX arrow -# fi - popd