diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 8d4308065c2..7672861d447 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -68,6 +68,9 @@ class Ndarray1DIndexer { int64_t stride_; }; +// Handling of Numpy Types by their static numbers +// (the NPY_TYPES enum and related defines) + static inline std::string GetNumPyTypeName(int npy_type) { #define TYPE_CASE(TYPE, NAME) \ case NPY_##TYPE: \ @@ -79,14 +82,20 @@ static inline std::string GetNumPyTypeName(int npy_type) { TYPE_CASE(INT16, "int16") TYPE_CASE(INT32, "int32") TYPE_CASE(INT64, "int64") -#if (NPY_INT64 != NPY_LONGLONG) +#if !NPY_INT32_IS_INT + TYPE_CASE(INT, "intc") +#endif +#if !NPY_INT64_IS_LONG_LONG TYPE_CASE(LONGLONG, "longlong") #endif TYPE_CASE(UINT8, "uint8") TYPE_CASE(UINT16, "uint16") TYPE_CASE(UINT32, "uint32") TYPE_CASE(UINT64, "uint64") -#if (NPY_UINT64 != NPY_ULONGLONG) +#if !NPY_INT32_IS_INT + TYPE_CASE(UINT, "uintc") +#endif +#if !NPY_INT64_IS_LONG_LONG TYPE_CASE(ULONGLONG, "ulonglong") #endif TYPE_CASE(FLOAT16, "float16") @@ -100,9 +109,48 @@ static inline std::string GetNumPyTypeName(int npy_type) { } #undef TYPE_CASE - return "unrecognized type in GetNumPyTypeName"; + std::stringstream ss; + ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName"; + return ss.str(); } +#define TYPE_VISIT_INLINE(TYPE) \ + case NPY_##TYPE: \ + return visitor->template Visit(arr); + +template +inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { + switch (PyArray_TYPE(arr)) { + TYPE_VISIT_INLINE(BOOL); + TYPE_VISIT_INLINE(INT8); + TYPE_VISIT_INLINE(UINT8); + TYPE_VISIT_INLINE(INT16); + TYPE_VISIT_INLINE(UINT16); + TYPE_VISIT_INLINE(INT32); + TYPE_VISIT_INLINE(UINT32); + TYPE_VISIT_INLINE(INT64); + TYPE_VISIT_INLINE(UINT64); +#if !NPY_INT32_IS_INT + TYPE_VISIT_INLINE(INT); + TYPE_VISIT_INLINE(UINT); +#endif +#if !NPY_INT64_IS_LONG_LONG + TYPE_VISIT_INLINE(LONGLONG); + TYPE_VISIT_INLINE(ULONGLONG); +#endif + TYPE_VISIT_INLINE(FLOAT16); + TYPE_VISIT_INLINE(FLOAT32); + TYPE_VISIT_INLINE(FLOAT64); + TYPE_VISIT_INLINE(DATETIME); + TYPE_VISIT_INLINE(OBJECT); + } + std::stringstream ss; + ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr)); + return Status::NotImplemented(ss.str()); +} + +#undef TYPE_VISIT_INLINE + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/numpy_interop.h b/cpp/src/arrow/python/numpy_interop.h index 8c569e232c1..0715c66c50d 100644 --- a/cpp/src/arrow/python/numpy_interop.h +++ b/cpp/src/arrow/python/numpy_interop.h @@ -43,6 +43,31 @@ #include #include +// A bit subtle. Numpy has 5 canonical integer types: +// (or, rather, type pairs: signed and unsigned) +// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG +// It also has 4 fixed-width integer aliases. +// When mapping Arrow integer types to these 4 fixed-width aliases, +// we always miss one of the canonical types (even though it may +// have the same width as one of the aliases). +// Which one depends on the platform... +// On a LP64 system, NPY_INT64 maps to NPY_LONG and +// NPY_LONGLONG needs to be handled separately. +// On a LLP64 system, NPY_INT32 maps to NPY_LONG and +// NPY_INT needs to be handled separately. + +#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64 +#define NPY_INT64_IS_LONG_LONG 1 +#else +#define NPY_INT64_IS_LONG_LONG 0 +#endif + +#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64 +#define NPY_INT32_IS_INT 1 +#else +#define NPY_INT32_IS_INT 0 +#endif + namespace arrow { namespace py { diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 04a71c1f640..6ddc4a7bee0 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -84,6 +84,38 @@ inline bool PyObject_is_integer(PyObject* obj) { return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj); } +Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { + if (PyArray_NDIM(numpy_array) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + const int received_type = PyArray_DESCR(numpy_array)->type_num; + if (received_type != np_type) { + std::stringstream ss; + ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got " + << GetNumPyTypeName(received_type); + return Status::Invalid(ss.str()); + } + + return Status::OK(); +} + +Status AllocateNullBitmap(MemoryPool* pool, int64_t length, + std::shared_ptr* out) { + int64_t null_bytes = BitUtil::BytesForBits(length); + std::shared_ptr null_bitmap; + + null_bitmap = std::make_shared(pool); + RETURN_NOT_OK(null_bitmap->Resize(null_bytes)); + + memset(null_bitmap->mutable_data(), 0, static_cast(null_bytes)); + *out = null_bitmap; + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Conversion from NumPy-in-Pandas to Arrow null bitmap + template inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef internal::npy_traits traits; @@ -103,6 +135,55 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { return null_count; } +class NumPyNullsConverter { + public: + /// Convert the given array's null values to a null bitmap. + /// The null bitmap is only allocated if null values are ever possible. + static Status Convert(MemoryPool* pool, PyArrayObject* arr, + bool use_pandas_null_sentinels, + std::shared_ptr* out_null_bitmap_, + int64_t* out_null_count) { + NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels); + RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); + *out_null_bitmap_ = converter.null_bitmap_; + *out_null_count = converter.null_count_; + return Status::OK(); + } + + template + Status Visit(PyArrayObject* arr) { + typedef internal::npy_traits traits; + + const bool null_sentinels_possible = + // Always treat Numpy's NaT as null + TYPE == NPY_DATETIME || + // Observing pandas's null sentinels + (use_pandas_null_sentinels_ && traits::supports_nulls); + + if (null_sentinels_possible) { + RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_)); + null_count_ = ValuesToBitmap(arr, null_bitmap_->mutable_data()); + } + return Status::OK(); + } + + protected: + NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, + bool use_pandas_null_sentinels) + : pool_(pool), + arr_(arr), + use_pandas_null_sentinels_(use_pandas_null_sentinels), + null_bitmap_data_(nullptr), + null_count_(0) {} + + MemoryPool* pool_; + PyArrayObject* arr_; + bool use_pandas_null_sentinels_; + std::shared_ptr null_bitmap_; + uint8_t* null_bitmap_data_; + int64_t null_count_; +}; + // Returns null count int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { int64_t null_count = 0; @@ -119,22 +200,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { return null_count; } -Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { - if (PyArray_NDIM(numpy_array) != 1) { - return Status::Invalid("only handle 1-dimensional arrays"); - } - - const int received_type = PyArray_DESCR(numpy_array)->type_num; - if (received_type != np_type) { - std::stringstream ss; - ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got " - << GetNumPyTypeName(received_type); - return Status::Invalid(ss.str()); - } - - return Status::OK(); -} - } // namespace /// Append as many string objects from NumPy arrays to a `StringBuilder` as we @@ -301,7 +366,9 @@ class NumPyConverter { dtype_(PyArray_DESCR(arr_)), mask_(nullptr), use_pandas_null_sentinels_(use_pandas_null_sentinels), - decimal_type_() { + decimal_type_(), + null_bitmap_data_(nullptr), + null_count_(0) { if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast(mo); } @@ -356,14 +423,8 @@ class NumPyConverter { protected: Status InitNullBitmap() { - int64_t null_bytes = BitUtil::BytesForBits(length_); - - null_bitmap_ = std::make_shared(pool_); - RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); - + RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_)); null_bitmap_data_ = null_bitmap_->mutable_data(); - memset(null_bitmap_data_, 0, static_cast(null_bytes)); - return Status::OK(); } @@ -414,32 +475,18 @@ class NumPyConverter { template Status VisitNative() { - using traits = internal::arrow_traits; - - const bool null_sentinels_possible = - // NumPy has a NaT type - (ArrowType::type_id == Type::TIMESTAMP || ArrowType::type_id == Type::DATE32) || - - // Observing pandas's null sentinels - ((use_pandas_null_sentinels_ && traits::supports_nulls)); - - if (mask_ != nullptr || null_sentinels_possible) { + if (mask_ != nullptr) { RETURN_NOT_OK(InitNullBitmap()); + null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); + } else { + RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, use_pandas_null_sentinels_, + &null_bitmap_, &null_count_)); } std::shared_ptr data; RETURN_NOT_OK(ConvertData(&data)); - int64_t null_count = 0; - if (mask_ != nullptr) { - null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); - } else if (null_sentinels_possible) { - // TODO(wesm): this presumes the NumPy C type and arrow C type are the - // same - null_count = ValuesToBitmap(arr_, null_bitmap_data_); - } - - auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count, 0); + auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); return PushArray(arr_data); } @@ -493,6 +540,7 @@ class NumPyConverter { std::shared_ptr null_bitmap_; uint8_t* null_bitmap_data_; + int64_t null_count_; }; Status NumPyConverter::Convert() { @@ -659,12 +707,10 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d Status s = StaticCastBuffer(**data, length_, pool_, data); RETURN_NOT_OK(s); } else { - // TODO(wesm): This is redundant, and recomputed in VisitNative() - const int64_t null_count = ValuesToBitmap(arr_, null_bitmap_data_); - RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count, + // The null bitmap was already computed in VisitNative() + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, pool_, data)); } } diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index 587b27c9a60..ff39aad672e 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -34,6 +34,9 @@ namespace arrow { namespace py { namespace internal { +// +// Type traits for Numpy -> Arrow equivalence +// template struct npy_traits {}; @@ -68,7 +71,11 @@ NPY_INT_DECL(UINT16, UInt16, uint16_t); NPY_INT_DECL(UINT32, UInt32, uint32_t); NPY_INT_DECL(UINT64, UInt64, uint64_t); -#if NPY_INT64 != NPY_LONGLONG +#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 +NPY_INT_DECL(INT, Int32, int32_t); +NPY_INT_DECL(UINT, UInt32, uint32_t); +#endif +#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 NPY_INT_DECL(LONGLONG, Int64, int64_t); NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); #endif @@ -127,8 +134,14 @@ template <> struct npy_traits { typedef PyObject* value_type; static constexpr bool supports_nulls = true; + + static inline bool isnull(PyObject* v) { return v == Py_None; } }; +// +// Type traits for Arrow -> Numpy equivalence +// Note *supports_nulls* means the equivalent Numpy type support nulls +// template struct arrow_traits {}; @@ -252,30 +265,27 @@ struct arrow_traits { static inline int NumPyTypeSize(int npy_type) { switch (npy_type) { case NPY_BOOL: - return 1; case NPY_INT8: - return 1; - case NPY_INT16: - return 2; - case NPY_INT32: - return 4; - case NPY_INT64: - return 8; -#if (NPY_INT64 != NPY_LONGLONG) - case NPY_LONGLONG: - return 8; -#endif case NPY_UINT8: return 1; + case NPY_INT16: case NPY_UINT16: return 2; + case NPY_INT32: case NPY_UINT32: return 4; + case NPY_INT64: case NPY_UINT64: return 8; -#if (NPY_UINT64 != NPY_ULONGLONG) +#if !NPY_INT32_IS_INT + case NPY_INT: + case NPY_UINT: + return NPY_BITSOF_INT / 8; +#endif +#if !NPY_INT64_IS_LONG_LONG + case NPY_LONGLONG: case NPY_ULONGLONG: - return 8; + return NPY_BITSOF_LONGLONG / 8; #endif case NPY_FLOAT16: return 2; diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 5abc026bf9b..aef3dca1252 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -501,6 +501,14 @@ def test_float_nulls(self): result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) + def test_float_nulls_to_ints(self): + # ARROW-2135 + df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]}) + schema = pa.schema([pa.field("a", pa.int16(), nullable=True)]) + table = pa.Table.from_pandas(df, schema=schema) + assert table[0].to_pylist() == [1, 2, None] + tm.assert_frame_equal(df, table.to_pandas()) + def test_integer_no_nulls(self): data = OrderedDict() fields = [] @@ -526,6 +534,17 @@ def test_integer_no_nulls(self): schema = pa.schema(fields) _check_pandas_roundtrip(df, expected_schema=schema) + def test_all_integer_types(self): + # Test all Numpy integer aliases + data = OrderedDict() + numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', + 'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc', + 'int_', 'uint', 'longlong', 'ulonglong'] + for dtype in numpy_dtypes: + data[dtype] = np.arange(12, dtype=dtype) + df = pd.DataFrame(data) + _check_pandas_roundtrip(df) + def test_integer_with_nulls(self): # pandas requires upcast to float dtype