apache · pitrou · Feb 28, 2018 · pitrou · Feb 28, 2018 · pitrou
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
@@ -68,6 +68,9 @@ class Ndarray1DIndexer {
   int64_t stride_;
 };
 
+// Handling of Numpy Types by their static numbers
+// (the NPY_TYPES enum and related defines)
+
 static inline std::string GetNumPyTypeName(int npy_type) {
 #define TYPE_CASE(TYPE, NAME) \
   case NPY_##TYPE:            \
@@ -79,14 +82,20 @@ static inline std::string GetNumPyTypeName(int npy_type) {
     TYPE_CASE(INT16, "int16")
     TYPE_CASE(INT32, "int32")
     TYPE_CASE(INT64, "int64")
-#if (NPY_INT64 != NPY_LONGLONG)
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(INT, "intc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
     TYPE_CASE(LONGLONG, "longlong")
 #endif
     TYPE_CASE(UINT8, "uint8")
     TYPE_CASE(UINT16, "uint16")
     TYPE_CASE(UINT32, "uint32")
     TYPE_CASE(UINT64, "uint64")
-#if (NPY_UINT64 != NPY_ULONGLONG)
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(UINT, "uintc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
     TYPE_CASE(ULONGLONG, "ulonglong")
 #endif
     TYPE_CASE(FLOAT16, "float16")
@@ -100,9 +109,48 @@ static inline std::string GetNumPyTypeName(int npy_type) {
   }
 
 #undef TYPE_CASE
-  return "unrecognized type in GetNumPyTypeName";
+  std::stringstream ss;
+  ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
+  return ss.str();
 }
 
+#define TYPE_VISIT_INLINE(TYPE) \
+  case NPY_##TYPE:              \
+    return visitor->template Visit<NPY_##TYPE>(arr);
+
+template <typename VISITOR>
+inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
+  switch (PyArray_TYPE(arr)) {
+    TYPE_VISIT_INLINE(BOOL);
+    TYPE_VISIT_INLINE(INT8);
+    TYPE_VISIT_INLINE(UINT8);
+    TYPE_VISIT_INLINE(INT16);
+    TYPE_VISIT_INLINE(UINT16);
+    TYPE_VISIT_INLINE(INT32);
+    TYPE_VISIT_INLINE(UINT32);
+    TYPE_VISIT_INLINE(INT64);
+    TYPE_VISIT_INLINE(UINT64);
+#if !NPY_INT32_IS_INT
+    TYPE_VISIT_INLINE(INT);
+    TYPE_VISIT_INLINE(UINT);
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_VISIT_INLINE(LONGLONG);
+    TYPE_VISIT_INLINE(ULONGLONG);
+#endif
+    TYPE_VISIT_INLINE(FLOAT16);
+    TYPE_VISIT_INLINE(FLOAT32);
+    TYPE_VISIT_INLINE(FLOAT64);
+    TYPE_VISIT_INLINE(DATETIME);
+    TYPE_VISIT_INLINE(OBJECT);
+  }
+  std::stringstream ss;
+  ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr));
+  return Status::NotImplemented(ss.str());
+}
+
+#undef TYPE_VISIT_INLINE
+
 }  // namespace py
 }  // namespace arrow
 

diff --git a/cpp/src/arrow/python/numpy_interop.h b/cpp/src/arrow/python/numpy_interop.h
@@ -43,6 +43,31 @@
 #include <numpy/arrayscalars.h>
 #include <numpy/ufuncobject.h>
 
+// A bit subtle. Numpy has 5 canonical integer types:
+// (or, rather, type pairs: signed and unsigned)
+//   NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
+// It also has 4 fixed-width integer aliases.
+// When mapping Arrow integer types to these 4 fixed-width aliases,
+// we always miss one of the canonical types (even though it may
+// have the same width as one of the aliases).
+// Which one depends on the platform...
+// On a LP64 system, NPY_INT64 maps to NPY_LONG and
+// NPY_LONGLONG needs to be handled separately.
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
+// NPY_INT needs to be handled separately.
+
+#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
+#define NPY_INT64_IS_LONG_LONG 1
+#else
+#define NPY_INT64_IS_LONG_LONG 0
+#endif
+
+#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
+#define NPY_INT32_IS_INT 1
+#else
+#define NPY_INT32_IS_INT 0
+#endif
+
 namespace arrow {
 namespace py {
 

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -84,6 +84,38 @@ inline bool PyObject_is_integer(PyObject* obj) {
   return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj);
 }
 
+Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
+  if (PyArray_NDIM(numpy_array) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  const int received_type = PyArray_DESCR(numpy_array)->type_num;
+  if (received_type != np_type) {
+    std::stringstream ss;
+    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
+       << GetNumPyTypeName(received_type);
+    return Status::Invalid(ss.str());
+  }
+
+  return Status::OK();
+}
+
+Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
+                          std::shared_ptr<ResizableBuffer>* out) {
+  int64_t null_bytes = BitUtil::BytesForBits(length);
+  std::shared_ptr<ResizableBuffer> null_bitmap;
+
+  null_bitmap = std::make_shared<PoolBuffer>(pool);
+  RETURN_NOT_OK(null_bitmap->Resize(null_bytes));
+
+  memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes));
+  *out = null_bitmap;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Conversion from NumPy-in-Pandas to Arrow null bitmap
+
 template <int TYPE>
 inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef internal::npy_traits<TYPE> traits;
@@ -103,6 +135,55 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   return null_count;
 }
 
+class NumPyNullsConverter {
+ public:
+  /// Convert the given array's null values to a null bitmap.
+  /// The null bitmap is only allocated if null values are ever possible.
+  static Status Convert(MemoryPool* pool, PyArrayObject* arr,
+                        bool use_pandas_null_sentinels,
+                        std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
+                        int64_t* out_null_count) {
+    NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out_null_bitmap_ = converter.null_bitmap_;
+    *out_null_count = converter.null_count_;
+    return Status::OK();
+  }
+
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    typedef internal::npy_traits<TYPE> traits;
+
+    const bool null_sentinels_possible =
+        // Always treat Numpy's NaT as null
+        TYPE == NPY_DATETIME ||
+        // Observing pandas's null sentinels
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    if (null_sentinels_possible) {
+      RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
+      null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
+    }
+    return Status::OK();
+  }
+
+ protected:
+  NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr,
+                      bool use_pandas_null_sentinels)
+      : pool_(pool),
+        arr_(arr),
+        use_pandas_null_sentinels_(use_pandas_null_sentinels),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {}
+
+  MemoryPool* pool_;
+  PyArrayObject* arr_;
+  bool use_pandas_null_sentinels_;
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+  int64_t null_count_;
+};
+
 // Returns null count
 int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
   int64_t null_count = 0;
@@ -119,22 +200,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
   return null_count;
 }
 
-Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
-  if (PyArray_NDIM(numpy_array) != 1) {
-    return Status::Invalid("only handle 1-dimensional arrays");
-  }
-
-  const int received_type = PyArray_DESCR(numpy_array)->type_num;
-  if (received_type != np_type) {
-    std::stringstream ss;
-    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
-       << GetNumPyTypeName(received_type);
-    return Status::Invalid(ss.str());
-  }
-
-  return Status::OK();
-}
-
 }  // namespace
 
 /// Append as many string objects from NumPy arrays to a `StringBuilder` as we
@@ -301,7 +366,9 @@ class NumPyConverter {
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         use_pandas_null_sentinels_(use_pandas_null_sentinels),
-        decimal_type_() {
+        decimal_type_(),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
       mask_ = reinterpret_cast<PyArrayObject*>(mo);
     }
@@ -356,14 +423,8 @@ class NumPyConverter {
 
  protected:
   Status InitNullBitmap() {
-    int64_t null_bytes = BitUtil::BytesForBits(length_);
-
-    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
-
+    RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
     null_bitmap_data_ = null_bitmap_->mutable_data();
-    memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
-
     return Status::OK();
   }
 
@@ -414,32 +475,18 @@ class NumPyConverter {
 
   template <typename ArrowType>
   Status VisitNative() {
-    using traits = internal::arrow_traits<ArrowType::type_id>;
-
-    const bool null_sentinels_possible =
-        // NumPy has a NaT type
-        (ArrowType::type_id == Type::TIMESTAMP || ArrowType::type_id == Type::DATE32) ||
-
-        // Observing pandas's null sentinels
-        ((use_pandas_null_sentinels_ && traits::supports_nulls));
-
-    if (mask_ != nullptr || null_sentinels_possible) {
+    if (mask_ != nullptr) {
       RETURN_NOT_OK(InitNullBitmap());
+      null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
+    } else {
+      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, use_pandas_null_sentinels_,
+                                                 &null_bitmap_, &null_count_));
     }
 
     std::shared_ptr<Buffer> data;
     RETURN_NOT_OK(ConvertData<ArrowType>(&data));
 
-    int64_t null_count = 0;
-    if (mask_ != nullptr) {
-      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
-    } else if (null_sentinels_possible) {
-      // TODO(wesm): this presumes the NumPy C type and arrow C type are the
-      // same
-      null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
-    }
-
-    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count, 0);
+    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
     return PushArray(arr_data);
   }
 
@@ -493,6 +540,7 @@ class NumPyConverter {
 
   std::shared_ptr<ResizableBuffer> null_bitmap_;
   uint8_t* null_bitmap_data_;
+  int64_t null_count_;
 };
 
 Status NumPyConverter::Convert() {
@@ -659,12 +707,10 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
       RETURN_NOT_OK(s);
     } else {
-      // TODO(wesm): This is redundant, and recomputed in VisitNative()
-      const int64_t null_count = ValuesToBitmap<NPY_DATETIME>(arr_, null_bitmap_data_);
-
       RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
       if (!input_type->Equals(*type_)) {
-        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count,
+        // The null bitmap was already computed in VisitNative()
+        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
                                  type_, pool_, data));
       }
     }