apache · cpcloud · Feb 14, 2018 · Feb 19, 2018 · Feb 21, 2018
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -639,11 +639,11 @@ static Status ConvertTimes(PandasOptions options, const ChunkedArray& data,
 static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data,
                               PyObject** out_values) {
   PyAcquireGIL lock;
-  OwnedRef decimal_ref;
-  OwnedRef Decimal_ref;
-  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_ref));
-  RETURN_NOT_OK(internal::ImportFromModule(decimal_ref, "Decimal", &Decimal_ref));
-  PyObject* Decimal = Decimal_ref.obj();
+  OwnedRef decimal;
+  OwnedRef Decimal;
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal));
+  RETURN_NOT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal));
+  PyObject* decimal_constructor = Decimal.obj();
 
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = static_cast<const arrow::Decimal128Array&>(*data.chunk(c));
@@ -653,7 +653,8 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data,
         Py_INCREF(Py_None);
         *out_values++ = Py_None;
       } else {
-        *out_values++ = internal::DecimalFromString(Decimal, arr.FormatValue(i));
+        *out_values++ =
+            internal::DecimalFromString(decimal_constructor, arr.FormatValue(i));
         RETURN_IF_PYERROR();
       }
     }

diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
@@ -76,7 +76,18 @@ class ScalarVisitor {
         timestamp_count_(0),
         float_count_(0),
         binary_count_(0),
-        unicode_count_(0) {}
+        unicode_count_(0),
+        decimal_count_(0),
+        max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
+                              std::numeric_limits<int32_t>::min()),
+        decimal_type_() {
+    OwnedRefNoGIL decimal_module;
+    Status status = ::arrow::py::internal::ImportModule("decimal", &decimal_module);
+    DCHECK(status.ok()) << "Unable to import decimal module";
+    status = ::arrow::py::internal::ImportFromModule(decimal_module, "Decimal",
+                                                     &decimal_type_);
+    DCHECK(status.ok()) << "Unable to import decimal.Decimal";
+  }
 
   Status Visit(PyObject* obj) {
     ++total_count_;
@@ -111,10 +122,16 @@ class ScalarVisitor {
         ss << type->ToString();
         return Status::Invalid(ss.str());
       }
+    } else if (PyObject_IsInstance(obj, decimal_type_.obj())) {
+      // Don't infer anything if we encounter a Decimal('nan')
+      if (!internal::PyDecimal_ISNAN(obj)) {
+        RETURN_NOT_OK(max_decimal_metadata_.Update(obj));
+      }
+      ++decimal_count_;
     } else {
       // TODO(wesm): accumulate error information somewhere
       static std::string supported_types =
-          "bool, float, integer, date, datetime, bytes, unicode";
+          "bool, float, integer, date, datetime, bytes, unicode, decimal";
       std::stringstream ss;
       ss << "Error inferring Arrow data type for collection of Python objects. ";
       RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss));
@@ -125,7 +142,9 @@ class ScalarVisitor {
 
   std::shared_ptr<DataType> GetType() {
     // TODO(wesm): handling mixed-type cases
-    if (float_count_) {
+    if (decimal_count_) {
+      return decimal(max_decimal_metadata_.precision(), max_decimal_metadata_.scale());
+    } else if (float_count_) {
       return float64();
     } else if (int_count_) {
       // TODO(wesm): tighter type later
@@ -157,8 +176,13 @@ class ScalarVisitor {
   int64_t float_count_;
   int64_t binary_count_;
   int64_t unicode_count_;
+  int64_t decimal_count_;
+
+  internal::DecimalMetadata max_decimal_metadata_;
+
   // Place to accumulate errors
   // std::vector<Status> errors_;
+  OwnedRefNoGIL decimal_type_;
 };
 
 static constexpr int MAX_NESTING_LEVELS = 32;
@@ -379,17 +403,14 @@ class TypedConverter : public SeqConverter {
   BuilderType* typed_builder_;
 };
 
-// We use the CRTP trick here to devirtualize the AppendItem() and AppendNull()
+// We use the CRTP trick here to devirtualize the AppendItem(), AppendNull(), and IsNull()
 // method calls.
 template <typename BuilderType, class Derived>
 class TypedConverterVisitor : public TypedConverter<BuilderType> {
  public:
   Status AppendSingle(PyObject* obj) override {
-    if (obj == Py_None) {
-      return static_cast<Derived*>(this)->AppendNull();
-    } else {
-      return static_cast<Derived*>(this)->AppendItem(obj);
-    }
+    auto self = static_cast<Derived*>(this);
+    return self->IsNull(obj) ? self->AppendNull() : self->AppendItem(obj);
   }
 
   Status AppendMultiple(PyObject* obj, int64_t size) override {
@@ -409,6 +430,7 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
 
   // Append a missing item (default implementation)
   Status AppendNull() { return this->typed_builder_->AppendNull(); }
+  bool IsNull(PyObject* obj) const { return obj == Py_None; }
 };
 
 class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> {
@@ -830,12 +852,16 @@ class DecimalConverter
  public:
   // Append a non-missing item
   Status AppendItem(PyObject* obj) {
-    /// TODO(phillipc): Check for nan?
     Decimal128 value;
     const auto& type = static_cast<const DecimalType&>(*typed_builder_->type());
     RETURN_NOT_OK(internal::DecimalFromPythonDecimal(obj, type, &value));
     return typed_builder_->Append(value);
   }
+
+  bool IsNull(PyObject* obj) const {
+    return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj) ||
+           (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
+  }
 };
 
 // Dynamic constructor for sequence converters

diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
@@ -61,6 +61,7 @@ namespace internal {
 Status ImportModule(const std::string& module_name, OwnedRef* ref) {
   PyObject* module = PyImport_ImportModule(module_name.c_str());
   RETURN_IF_PYERROR();
+  DCHECK_NE(module, nullptr) << "unable to import the " << module_name << " module";
   ref->reset(module);
   return Status::OK();
 }
@@ -71,6 +72,7 @@ Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRe
 
   PyObject* attr = PyObject_GetAttrString(module.obj(), name.c_str());
   RETURN_IF_PYERROR();
+  DCHECK_NE(attr, nullptr) << "unable to import the " << name << " object";
   ref->reset(attr);
   return Status::OK();
 }
@@ -93,8 +95,13 @@ Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
   return Status::OK();
 }
 
-Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
-                                     int32_t* scale) {
+// \brief Infer the precision and scale of a Python decimal.Decimal instance
+// \param python_decimal[in] An instance of decimal.Decimal
+// \param precision[out] The value of the inferred precision
+// \param scale[out] The value of the inferred scale
+// \return The status of the operation
+static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
+                                            int32_t* scale) {
   DCHECK_NE(python_decimal, NULLPTR);
   DCHECK_NE(precision, NULLPTR);
   DCHECK_NE(scale, NULLPTR);
@@ -193,6 +200,53 @@ Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
   return Status::OK();
 }
 
+bool PyFloat_isnan(PyObject* obj) {
+  return PyFloat_Check(obj) && std::isnan(PyFloat_AS_DOUBLE(obj));
+}
+
+bool PyDecimal_Check(PyObject* obj) {
+  // TODO(phillipc): Is this expensive?
+  OwnedRef Decimal;
+  OwnedRef decimal;
+  Status status = ImportModule("decimal", &decimal);
+  DCHECK(status.ok()) << "Error during import of the decimal module";
+  status = ImportFromModule(decimal, "Decimal", &Decimal);
+  DCHECK(status.ok())
+      << "Error during import of the Decimal object from the decimal module";
+  const int32_t result = PyObject_IsInstance(obj, Decimal.obj());
+  DCHECK_NE(result, -1) << " error during PyObject_IsInstance check";
+  return result == 1;
+}
+
+bool PyDecimal_ISNAN(PyObject* obj) {
+  DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
+  OwnedRef is_nan(PyObject_CallMethod(obj, "is_nan", ""));
+  return PyObject_IsTrue(is_nan.obj()) == 1;
+}
+
+DecimalMetadata::DecimalMetadata()
+    : precision_(std::numeric_limits<int32_t>::min()),
+      scale_(std::numeric_limits<int32_t>::min()) {}
+
+DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale)
+    : precision_(precision), scale_(scale) {}
+
+Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) {
+  precision_ = std::max(precision_, suggested_precision);
+  scale_ = std::max(scale_, suggested_scale);
+  return Status::OK();
+}
+
+Status DecimalMetadata::Update(PyObject* object) {
+  DCHECK(PyDecimal_Check(object)) << "Object is not a Python Decimal";
+  DCHECK(!PyDecimal_ISNAN(object))
+      << "Decimal object cannot be NAN when inferring precision and scale";
+  int32_t precision;
+  int32_t scale;
+  RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
+  return Update(precision, scale);
+}
+
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
@@ -36,29 +36,89 @@ namespace py {
 
 class OwnedRef;
 
-ARROW_EXPORT
-std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
+// \brief Get an arrow DataType instance from Arrow's Type::type enum
+// \param[in] type One of the values of Arrow's Type::type enum
+// \return A shared pointer to DataType
+ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
 
 namespace internal {
 
+// \brief Import a Python module
+// \param[in] module_name The name of the module
+// \param[out] ref The OwnedRef containing the module PyObject*
 Status ImportModule(const std::string& module_name, OwnedRef* ref);
-Status ImportFromModule(const OwnedRef& module, const std::string& module_name,
-                        OwnedRef* ref);
 
+// \brief Import an object from a Python module
+// \param[in] module A Python module
+// \param[in] name The name of the object to import
+// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
+// module
+Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
+
+// \brief Convert a Python Decimal object to a C++ string
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[out] The string representation of the Python Decimal instance
+// \return The status of the operation
 Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
 
-Status InferDecimalPrecisionAndScale(PyObject* python_decimal,
-                                     int32_t* precision = NULLPTR,
-                                     int32_t* scale = NULLPTR);
-
+// \brief Convert a C++ std::string to a Python Decimal instance
+// \param[in] decimal_constructor The decimal type object
+// \param[in] decimal_string A decimal string
+// \return An instance of decimal.Decimal
 PyObject* DecimalFromString(PyObject* decimal_constructor,
                             const std::string& decimal_string);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
 Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                 Decimal128* out);
+
+// \brief Check whether obj is an integer, independent of Python versions.
 bool IsPyInteger(PyObject* obj);
 
+// \brief Check whether obj is nan
+bool PyFloat_isnan(PyObject* obj);
+
+// \brief Check whether obj is an instance of Decimal
+bool PyDecimal_Check(PyObject* obj);
+
+// \brief Check whether obj is nan. This function will abort the program if the argument
+// is not a Decimal instance
+bool PyDecimal_ISNAN(PyObject* obj);
+
+// \brief Convert a Python integer into an unsigned 64-bit integer
+// \param[in] obj A Python integer
+// \param[out] out A pointer to a C uint64_t to hold the result of the conversion
+// \return The status of the operation
 Status UInt64FromPythonInt(PyObject* obj, uint64_t* out);
 
+// \brief Helper class to track and update the precision and scale of a decimal
+class DecimalMetadata {
+ public:
+  DecimalMetadata();
+  DecimalMetadata(int32_t precision, int32_t scale);
+
+  // \brief Adjust the precision and scale of a decimal type given a new precision and a
+  // new scale \param[in] suggested_precision A candidate precision \param[in]
+  // suggested_scale A candidate scale \return The status of the operation
+  Status Update(int32_t suggested_precision, int32_t suggested_scale);
+
+  // \brief A convenient interface for updating the precision and scale based on a Python
+  // Decimal object \param object A Python Decimal object \return The status of the
+  // operation
+  Status Update(PyObject* object);
+
+  int32_t precision() const { return precision_; }
+  int32_t scale() const { return scale_; }
+
+ private:
+  int32_t precision_;
+  int32_t scale_;
+};
+
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow

diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
@@ -54,6 +54,9 @@ class Ndarray1DIndexer {
 
   T* data() const { return data_; }
 
+  T* begin() const { return data(); }
+  T* end() const { return begin() + size() * stride_; }
+
   bool is_strided() const { return stride_ == 1; }
 
   T& operator[](size_type index) { return data_[index * stride_]; }