Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 51 additions & 3 deletions cpp/src/arrow/python/numpy-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ class Ndarray1DIndexer {
int64_t stride_;
};

// Handling of Numpy Types by their static numbers
// (the NPY_TYPES enum and related defines)

static inline std::string GetNumPyTypeName(int npy_type) {
#define TYPE_CASE(TYPE, NAME) \
case NPY_##TYPE: \
Expand All @@ -79,14 +82,20 @@ static inline std::string GetNumPyTypeName(int npy_type) {
TYPE_CASE(INT16, "int16")
TYPE_CASE(INT32, "int32")
TYPE_CASE(INT64, "int64")
#if (NPY_INT64 != NPY_LONGLONG)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For some reason (macro expansion?) these #ifs wouldn't work correctly here, even though NPY_INT64 is defined to NPY_LONG.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, actually, that must be because NPY_LONGLONG is not a macro...

#if !NPY_INT32_IS_INT
TYPE_CASE(INT, "intc")
#endif
#if !NPY_INT64_IS_LONG_LONG
TYPE_CASE(LONGLONG, "longlong")
#endif
TYPE_CASE(UINT8, "uint8")
TYPE_CASE(UINT16, "uint16")
TYPE_CASE(UINT32, "uint32")
TYPE_CASE(UINT64, "uint64")
#if (NPY_UINT64 != NPY_ULONGLONG)
#if !NPY_INT32_IS_INT
TYPE_CASE(UINT, "uintc")
#endif
#if !NPY_INT64_IS_LONG_LONG
TYPE_CASE(ULONGLONG, "ulonglong")
#endif
TYPE_CASE(FLOAT16, "float16")
Expand All @@ -100,9 +109,48 @@ static inline std::string GetNumPyTypeName(int npy_type) {
}

#undef TYPE_CASE
return "unrecognized type in GetNumPyTypeName";
std::stringstream ss;
ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
return ss.str();
}

// Expands to a single `case` label that forwards `arr` to the visitor's
// Visit<> member instantiated for the given NumPy type number.  The statement
// terminator is supplied at the point of use.
#define TYPE_VISIT_INLINE(TYPE) \
  case NPY_##TYPE:              \
    return visitor->template Visit<NPY_##TYPE>(arr)

/// Dispatch on the NumPy type number of `arr` and call
/// `visitor->Visit<NPY_xxx>(arr)` for the matching type.
///
/// \param arr the array whose type number selects the Visit<> instantiation
/// \param visitor object providing `template <int TYPE> Status Visit(PyArrayObject*)`
/// \return the Status produced by the visitor, or Status::NotImplemented
///         (naming the type) when the type number has no case below
template <typename VISITOR>
inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
  const int npy_type = PyArray_TYPE(arr);
  switch (npy_type) {
    TYPE_VISIT_INLINE(BOOL);
    TYPE_VISIT_INLINE(INT8);
    TYPE_VISIT_INLINE(UINT8);
    TYPE_VISIT_INLINE(INT16);
    TYPE_VISIT_INLINE(UINT16);
    TYPE_VISIT_INLINE(INT32);
    TYPE_VISIT_INLINE(UINT32);
    TYPE_VISIT_INLINE(INT64);
    TYPE_VISIT_INLINE(UINT64);
#if !NPY_INT32_IS_INT
    // NPY_INT is a distinct type number from NPY_INT32 on this platform
    TYPE_VISIT_INLINE(INT);
    TYPE_VISIT_INLINE(UINT);
#endif
#if !NPY_INT64_IS_LONG_LONG
    // NPY_LONGLONG is a distinct type number from NPY_INT64 on this platform
    TYPE_VISIT_INLINE(LONGLONG);
    TYPE_VISIT_INLINE(ULONGLONG);
#endif
    TYPE_VISIT_INLINE(FLOAT16);
    TYPE_VISIT_INLINE(FLOAT32);
    TYPE_VISIT_INLINE(FLOAT64);
    TYPE_VISIT_INLINE(DATETIME);
    TYPE_VISIT_INLINE(OBJECT);
    default:
      // Unhandled type number: report it below
      break;
  }
  std::stringstream message;
  message << "NumPy type not implemented: " << GetNumPyTypeName(npy_type);
  return Status::NotImplemented(message.str());
}

#undef TYPE_VISIT_INLINE

} // namespace py
} // namespace arrow

Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/python/numpy_interop.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,31 @@
#include <numpy/arrayscalars.h>
#include <numpy/ufuncobject.h>

// A bit subtle. Numpy has 5 canonical integer types:
// (or, rather, type pairs: signed and unsigned)
// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
// It also has 4 fixed-width integer aliases.
// When mapping Arrow integer types to these 4 fixed-width aliases,
// we always miss one of the canonical types (even though it may
// have the same width as one of the aliases).
// Which one depends on the platform...
// On an LP64 system, NPY_INT64 maps to NPY_LONG and
// NPY_LONGLONG needs to be handled separately.
// On an LLP64 system, NPY_INT32 maps to NPY_LONG and
// NPY_INT needs to be handled separately.

// NPY_INT64_IS_LONG_LONG is 1 when `long` is 32 bits wide and `long long` is
// 64 bits wide, and 0 otherwise.  Code switching on NumPy type numbers adds
// explicit NPY_LONGLONG / NPY_ULONGLONG cases only when this is 0.
#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
#define NPY_INT64_IS_LONG_LONG 1
#else
#define NPY_INT64_IS_LONG_LONG 0
#endif

// NPY_INT32_IS_INT is 1 when `int` is 32 bits wide and `long` is 64 bits
// wide, and 0 otherwise.  Code switching on NumPy type numbers adds explicit
// NPY_INT / NPY_UINT cases only when this is 0.
#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
#define NPY_INT32_IS_INT 1
#else
#define NPY_INT32_IS_INT 0
#endif

namespace arrow {
namespace py {

Expand Down
142 changes: 94 additions & 48 deletions cpp/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,38 @@ inline bool PyObject_is_integer(PyObject* obj) {
return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj);
}

/// Verify that `numpy_array` is one-dimensional and that the type number of
/// its dtype equals `np_type`.
///
/// \return Status::OK() when both checks pass, otherwise Status::Invalid with
///         a message describing the failed check
Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
  const bool is_flat = PyArray_NDIM(numpy_array) == 1;
  if (!is_flat) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  const int actual_type = PyArray_DESCR(numpy_array)->type_num;
  if (actual_type == np_type) {
    return Status::OK();
  }

  // Type mismatch: name both the expected and the received NumPy type
  std::stringstream message;
  message << "trying to convert NumPy type " << GetNumPyTypeName(np_type);
  message << " but got " << GetNumPyTypeName(actual_type);
  return Status::Invalid(message.str());
}

/// Allocate a zero-filled bitmap buffer with room for `length` bits.
///
/// \param pool memory pool backing the new buffer
/// \param length number of bits the bitmap must be able to hold
/// \param out receives the newly allocated buffer on success
/// \return the Status of the resize if it fails, Status::OK() otherwise
Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
                          std::shared_ptr<ResizableBuffer>* out) {
  const int64_t bitmap_nbytes = BitUtil::BytesForBits(length);

  std::shared_ptr<ResizableBuffer> bitmap = std::make_shared<PoolBuffer>(pool);
  RETURN_NOT_OK(bitmap->Resize(bitmap_nbytes));

  // Start with every bit cleared
  memset(bitmap->mutable_data(), 0, static_cast<size_t>(bitmap_nbytes));

  *out = bitmap;
  return Status::OK();
}

// ----------------------------------------------------------------------
// Conversion from NumPy-in-Pandas to Arrow null bitmap

template <int TYPE>
inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
typedef internal::npy_traits<TYPE> traits;
Expand All @@ -103,6 +135,55 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
return null_count;
}

// Computes the null bitmap and null count of a NumPy array from its values.
// The work is dispatched on the array's NumPy type number through
// VisitNumpyArrayInline, which calls back into Visit<TYPE>().
class NumPyNullsConverter {
public:
/// Convert the given array's null values to a null bitmap.
/// The null bitmap is only allocated if null values are ever possible.
///
/// \param pool memory pool used if a bitmap has to be allocated
/// \param arr the array to inspect
/// \param use_pandas_null_sentinels whether pandas-style sentinel values
///        count as nulls for types whose traits report supports_nulls
/// \param out_null_bitmap_ receives the bitmap; left as an empty shared_ptr
///        when no bitmap was allocated
/// \param out_null_count receives the null count (0 when no bitmap was allocated)
/// \return any non-OK Status from the type dispatch, Status::OK() otherwise
static Status Convert(MemoryPool* pool, PyArrayObject* arr,
bool use_pandas_null_sentinels,
std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
int64_t* out_null_count) {
NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels);
RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
*out_null_bitmap_ = converter.null_bitmap_;
*out_null_count = converter.null_count_;
return Status::OK();
}

// Visitor entry point invoked by VisitNumpyArrayInline with TYPE set to the
// NumPy type number of `arr`.  Allocates and fills null_bitmap_ (and sets
// null_count_) only when nulls are possible for TYPE; otherwise both members
// keep their constructor-time values.
template <int TYPE>
Status Visit(PyArrayObject* arr) {
typedef internal::npy_traits<TYPE> traits;

const bool null_sentinels_possible =
// Always treat Numpy's NaT as null
TYPE == NPY_DATETIME ||
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, I don't know what that is, but this is required to have the tests pass. Why do we always treat NaT as null but not floating-point NaN? @wesm

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIU There's no other way to interpret NaT other than NULL (unless there's a standard that defines it in a different way than "missing"). nan is part of the IEEE floating point specification (as I'm sure you know) and it has a different meaning than null.

// Observing pandas's null sentinels
(use_pandas_null_sentinels_ && traits::supports_nulls);

if (null_sentinels_possible) {
// The bitmap covers PyArray_SIZE(arr) entries; ValuesToBitmap returns the null count
RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
}
return Status::OK();
}

protected:
// Only reachable through Convert(); starts with no bitmap and a null count of 0.
NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr,
bool use_pandas_null_sentinels)
: pool_(pool),
arr_(arr),
use_pandas_null_sentinels_(use_pandas_null_sentinels),
null_bitmap_data_(nullptr),
null_count_(0) {}

// Memory pool passed to AllocateNullBitmap
MemoryPool* pool_;
// NOTE(review): set by the constructor, but Visit() works on its own `arr`
// argument; no method of this class reads this member
PyArrayObject* arr_;
bool use_pandas_null_sentinels_;
// Stays empty unless Visit() decided nulls are possible
std::shared_ptr<ResizableBuffer> null_bitmap_;
// NOTE(review): initialized to nullptr and not touched by any method of this class
uint8_t* null_bitmap_data_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At some point we may want to have an STL-compatible view class that makes interacting with iterators constructs in the STL much easier. We have a lot of code that is manually handling iteration using a size/count and a buffer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which iterators are you thinking about? Do you mean the ndarray 1d iterator?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's one, though I added begin()/end() for that in #1651.

int64_t null_count_;
};

// Returns null count
int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
int64_t null_count = 0;
Expand All @@ -119,22 +200,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
return null_count;
}

/// Verify that `numpy_array` is one-dimensional and that the type number of
/// its dtype equals `np_type`.
///
/// \return Status::OK() when both checks pass, otherwise Status::Invalid with
///         a message describing the failed check
Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
  const bool is_flat = PyArray_NDIM(numpy_array) == 1;
  if (!is_flat) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  const int actual_type = PyArray_DESCR(numpy_array)->type_num;
  if (actual_type == np_type) {
    return Status::OK();
  }

  // Type mismatch: name both the expected and the received NumPy type
  std::stringstream message;
  message << "trying to convert NumPy type " << GetNumPyTypeName(np_type);
  message << " but got " << GetNumPyTypeName(actual_type);
  return Status::Invalid(message.str());
}

} // namespace

/// Append as many string objects from NumPy arrays to a `StringBuilder` as we
Expand Down Expand Up @@ -301,7 +366,9 @@ class NumPyConverter {
dtype_(PyArray_DESCR(arr_)),
mask_(nullptr),
use_pandas_null_sentinels_(use_pandas_null_sentinels),
decimal_type_() {
decimal_type_(),
null_bitmap_data_(nullptr),
null_count_(0) {
if (mo != nullptr && mo != Py_None) {
mask_ = reinterpret_cast<PyArrayObject*>(mo);
}
Expand Down Expand Up @@ -356,14 +423,8 @@ class NumPyConverter {

protected:
/// Allocate a zero-filled null bitmap covering `length_` entries and cache a
/// raw pointer to its bytes in `null_bitmap_data_`.
///
/// \return the Status of the allocation if it fails, Status::OK() otherwise
Status InitNullBitmap() {
  // AllocateNullBitmap already sizes, allocates and zeroes the buffer; doing
  // the same by hand first would only build a buffer that is replaced at once.
  RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
  null_bitmap_data_ = null_bitmap_->mutable_data();
  return Status::OK();
}

Expand Down Expand Up @@ -414,32 +475,18 @@ class NumPyConverter {

/// Build an Arrow array of `ArrowType` from the NumPy array and hand it to
/// PushArray.
///
/// The null bitmap and `null_count_` come from the user-supplied mask when one
/// was given; otherwise NumPyNullsConverter derives them from the array values
/// (it only allocates a bitmap when nulls are possible for the dtype, so
/// `null_bitmap_` may stay empty).
///
/// \return the first non-OK Status from any step, else the result of PushArray
template <typename ArrowType>
Status VisitNative() {
  // Nulls are computed exactly once, and before ConvertData: the Date32
  // specialization of ConvertData reads null_bitmap_ / null_count_ when casting.
  if (mask_ != nullptr) {
    RETURN_NOT_OK(InitNullBitmap());
    null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
  } else {
    RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, use_pandas_null_sentinels_,
                                               &null_bitmap_, &null_count_));
  }

  std::shared_ptr<Buffer> data;
  RETURN_NOT_OK(ConvertData<ArrowType>(&data));

  auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
  return PushArray(arr_data);
}

Expand Down Expand Up @@ -493,6 +540,7 @@ class NumPyConverter {

std::shared_ptr<ResizableBuffer> null_bitmap_;
uint8_t* null_bitmap_data_;
int64_t null_count_;
};

Status NumPyConverter::Convert() {
Expand Down Expand Up @@ -659,12 +707,10 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
RETURN_NOT_OK(s);
} else {
// TODO(wesm): This is redundant, and recomputed in VisitNative()
const int64_t null_count = ValuesToBitmap<NPY_DATETIME>(arr_, null_bitmap_data_);

RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
if (!input_type->Equals(*type_)) {
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count,
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
type_, pool_, data));
}
}
Expand Down
Loading