diff --git a/.travis.yml b/.travis.yml index a4c74657e52..b1241e793bc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -174,7 +174,7 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_c_glib.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh - # [OS X] C++ & glib w/ XCode 8.3 & autotools + # [OS X] C++ & glib w/ XCode 8.3 & autotools & homebrew - compiler: clang osx_image: xcode8.3 os: osx @@ -185,7 +185,8 @@ matrix: - BUILD_SYSTEM=autotools before_script: - if [ $ARROW_CI_C_GLIB_AFFECTED != "1" ]; then exit; fi - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library + - $TRAVIS_BUILD_DIR/ci/travis_install_osx.sh + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library --homebrew - $TRAVIS_BUILD_DIR/ci/travis_before_script_c_glib.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_c_glib.sh diff --git a/c_glib/Brewfile b/c_glib/Brewfile index 9fe5c3b6163..955072e1ea7 100644 --- a/c_glib/Brewfile +++ b/c_glib/Brewfile @@ -16,7 +16,7 @@ # under the License. brew "autoconf-archive" -brew "boost" +brew "boost", args: ["1.65.0"] brew "ccache" brew "cmake" brew "git" diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index 27d1e86fd95..033fbd7c6ce 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -21,9 +21,7 @@ set -ex source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh -if [ $TRAVIS_OS_NAME = "osx" ]; then - brew update && brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile -else # Linux +if [ $TRAVIS_OS_NAME = "linux" ]; then sudo apt-get install -y -q gtk-doc-tools autoconf-archive libgirepository1.0-dev fi diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 17b5deb36b5..b9afbee7863 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -22,10 +22,22 @@ set -ex source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh -if [ "$1" == "--only-library" ]; then - only_library_mode=yes -else - only_library_mode=no +only_library_mode=no +using_homebrew=no + +while true; do + case "$1" in + --only-library) + only_library_mode=yes + shift ;; + --homebrew) + using_homebrew=yes + shift ;; + *) break ;; + esac +done + +if [ "$only_library_mode" == "no" ]; then source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh fi @@ -78,6 +90,10 @@ if [ $TRAVIS_OS_NAME == "linux" ]; then -DBUILD_WARNING_LEVEL=$ARROW_BUILD_WARNING_LEVEL \ $ARROW_CPP_DIR else + if [ "$using_homebrew" = "yes" ]; then + # build against homebrew's boost if we're using it + export BOOST_ROOT=/usr/local/opt/boost + fi cmake $CMAKE_COMMON_FLAGS \ $CMAKE_OSX_FLAGS \ -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ diff --git a/ci/travis_build_parquet_cpp.sh b/ci/travis_build_parquet_cpp.sh index 7d2e3ab7364..f64a85d621d 100755 --- a/ci/travis_build_parquet_cpp.sh +++ b/ci/travis_build_parquet_cpp.sh @@ -38,7 +38,7 @@ cmake \ -GNinja \ -DCMAKE_BUILD_TYPE=debug \ -DCMAKE_INSTALL_PREFIX=$ARROW_PYTHON_PARQUET_HOME \ - -DPARQUET_BOOST_USE_SHARED=off \ + -DPARQUET_BOOST_USE_SHARED=on \ -DPARQUET_BUILD_BENCHMARKS=off \ -DPARQUET_BUILD_EXECUTABLES=off \ -DPARQUET_BUILD_TESTS=off \ diff --git a/ci/travis_install_linux.sh b/ci/travis_install_linux.sh index acee9ebcb2a..74fde277413 100755 --- a/ci/travis_install_linux.sh +++ b/ci/travis_install_linux.sh @@ -19,7 +19,7 @@ sudo apt-get install -y -q \ gdb ccache libboost-dev libboost-filesystem-dev \ - libboost-system-dev libjemalloc-dev + libboost-system-dev libboost-regex-dev libjemalloc-dev if [ "$ARROW_TRAVIS_VALGRIND" == "1" ]; then sudo apt-get install -y -q valgrind diff --git a/ci/travis_install_osx.sh b/ci/travis_install_osx.sh new file mode 100755 index 00000000000..b03a5f16a31 --- /dev/null +++ b/ci/travis_install_osx.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if [ "$ARROW_CI_C_GLIB_AFFECTED" = "1" ]; then + brew update + brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile +fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8c0e95634fc..47692a831fa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -574,11 +574,13 @@ set(ARROW_LINK_LIBS set(ARROW_SHARED_PRIVATE_LINK_LIBS ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY}) + ${BOOST_FILESYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY}) set(ARROW_STATIC_PRIVATE_LINK_LIBS ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY}) + ${BOOST_FILESYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY}) if (NOT MSVC) set(ARROW_LINK_LIBS diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 944ca1d3bb1..4103af41b53 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -157,8 +157,11 @@ if (ARROW_BOOST_VENDORED) "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BOOST_STATIC_FILESYSTEM_LIBRARY "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BOOST_STATIC_REGEX_LIBRARY + "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}") set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}") + set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}") if (ARROW_BOOST_HEADER_ONLY) set(BOOST_BUILD_PRODUCTS) set(BOOST_CONFIGURE_COMMAND "") @@ -166,7 +169,8 @@ if (ARROW_BOOST_VENDORED) else() set(BOOST_BUILD_PRODUCTS ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY}) + ${BOOST_FILESYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY}) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh" "--prefix=${BOOST_PREFIX}" @@ -210,16 +214,19 @@ else() if (ARROW_BOOST_HEADER_ONLY) find_package(Boost REQUIRED) else() - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS system filesystem regex REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_shared) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + set(BOOST_REGEX_LIBRARY boost_regex_shared) endif() else() # Find static boost headers and libs @@ -228,16 +235,19 @@ else() if (ARROW_BOOST_HEADER_ONLY) find_package(Boost REQUIRED) else() - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS system filesystem regex REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) + set(BOOST_REGEX_LIBRARY boost_regex_static) endif() endif() endif() @@ -254,7 +264,11 @@ if (NOT ARROW_BOOST_HEADER_ONLY) STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}" SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}") - SET(ARROW_BOOST_LIBS boost_system boost_filesystem) + ADD_THIRDPARTY_LIB(boost_regex + STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}" + SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}") + + SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex) endif() include_directories(SYSTEM ${Boost_INCLUDE_DIR}) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index aefd4d76dfa..17b87bf4eca 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -640,11 +640,11 @@ static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; - OwnedRef decimal_ref; - OwnedRef Decimal_ref; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_ref)); - RETURN_NOT_OK(internal::ImportFromModule(decimal_ref, "Decimal", &Decimal_ref)); - PyObject* Decimal = Decimal_ref.obj(); + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); + RETURN_NOT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); + PyObject* decimal_constructor = Decimal.obj(); for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = static_cast(*data.chunk(c)); @@ -654,7 +654,8 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, Py_INCREF(Py_None); *out_values++ = Py_None; } else { - *out_values++ = internal::DecimalFromString(Decimal, arr.FormatValue(i)); + *out_values++ = + internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); RETURN_IF_PYERROR(); } } diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index a286c6bd5e9..d2f900f6ae8 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -76,7 +76,15 @@ class ScalarVisitor { timestamp_count_(0), float_count_(0), binary_count_(0), - unicode_count_(0) {} + unicode_count_(0), + decimal_count_(0), + max_decimal_metadata_(std::numeric_limits::min(), + std::numeric_limits::min()), + decimal_type_() { + PyAcquireGIL lock; + Status status = internal::ImportDecimalType(&decimal_type_); + DCHECK_OK(status); + } Status Visit(PyObject* obj) { ++total_count_; @@ -111,10 +119,13 @@ class ScalarVisitor { ss << type->ToString(); return Status::Invalid(ss.str()); } + } else if (PyObject_IsInstance(obj, decimal_type_.obj())) { + RETURN_NOT_OK(max_decimal_metadata_.Update(obj)); + ++decimal_count_; } else { // TODO(wesm): accumulate error information somewhere static std::string supported_types = - "bool, float, integer, date, datetime, bytes, unicode"; + "bool, float, integer, date, datetime, bytes, unicode, decimal"; std::stringstream ss; ss << "Error inferring Arrow data type for collection of Python objects. "; RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss)); @@ -125,7 +136,9 @@ class ScalarVisitor { std::shared_ptr GetType() { // TODO(wesm): handling mixed-type cases - if (float_count_) { + if (decimal_count_) { + return decimal(max_decimal_metadata_.precision(), max_decimal_metadata_.scale()); + } else if (float_count_) { return float64(); } else if (int_count_) { // TODO(wesm): tighter type later @@ -157,8 +170,13 @@ class ScalarVisitor { int64_t float_count_; int64_t binary_count_; int64_t unicode_count_; + int64_t decimal_count_; + + internal::DecimalMetadata max_decimal_metadata_; + // Place to accumulate errors // std::vector errors_; + OwnedRefNoGIL decimal_type_; }; static constexpr int MAX_NESTING_LEVELS = 32; @@ -379,17 +397,14 @@ class TypedConverter : public SeqConverter { BuilderType* typed_builder_; }; -// We use the CRTP trick here to devirtualize the AppendItem() and AppendNull() +// We use the CRTP trick here to devirtualize the AppendItem(), AppendNull(), and IsNull() // method calls. template class TypedConverterVisitor : public TypedConverter { public: Status AppendSingle(PyObject* obj) override { - if (obj == Py_None) { - return static_cast(this)->AppendNull(); - } else { - return static_cast(this)->AppendItem(obj); - } + auto self = static_cast(this); + return self->IsNull(obj) ? self->AppendNull() : self->AppendItem(obj); } Status AppendMultiple(PyObject* obj, int64_t size) override { @@ -409,6 +424,7 @@ class TypedConverterVisitor : public TypedConverter { // Append a missing item (default implementation) Status AppendNull() { return this->typed_builder_->AppendNull(); } + bool IsNull(PyObject* obj) const { return obj == Py_None; } }; class NullConverter : public TypedConverterVisitor { @@ -830,12 +846,16 @@ class DecimalConverter public: // Append a non-missing item Status AppendItem(PyObject* obj) { - /// TODO(phillipc): Check for nan? Decimal128 value; const auto& type = static_cast(*typed_builder_->type()); RETURN_NOT_OK(internal::DecimalFromPythonDecimal(obj, type, &value)); return typed_builder_->Append(value); } + + bool IsNull(PyObject* obj) const { + return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj) || + (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj)); + } }; // Dynamic constructor for sequence converters diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index df1db99911b..429068dd1ac 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include +#include #include #include "arrow/python/common.h" @@ -61,6 +63,7 @@ namespace internal { Status ImportModule(const std::string& module_name, OwnedRef* ref) { PyObject* module = PyImport_ImportModule(module_name.c_str()); RETURN_IF_PYERROR(); + DCHECK_NE(module, nullptr) << "unable to import the " << module_name << " module"; ref->reset(module); return Status::OK(); } @@ -71,10 +74,18 @@ Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRe PyObject* attr = PyObject_GetAttrString(module.obj(), name.c_str()); RETURN_IF_PYERROR(); + DCHECK_NE(attr, nullptr) << "unable to import the " << name << " object"; ref->reset(attr); return Status::OK(); } +Status ImportDecimalType(OwnedRef* decimal_type) { + OwnedRef decimal_module; + RETURN_NOT_OK(ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK(ImportFromModule(decimal_module, "Decimal", decimal_type)); + return Status::OK(); +} + Status PythonDecimalToString(PyObject* python_decimal, std::string* out) { // Call Python's str(decimal_object) OwnedRef str_obj(PyObject_Str(python_decimal)); @@ -93,13 +104,19 @@ Status PythonDecimalToString(PyObject* python_decimal, std::string* out) { return Status::OK(); } -Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision, - int32_t* scale) { +// \brief Infer the precision and scale of a Python decimal.Decimal instance +// \param python_decimal[in] An instance of decimal.Decimal +// \param precision[out] The value of the inferred precision +// \param scale[out] The value of the inferred scale +// \return The status of the operation +static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision, + int32_t* scale) { DCHECK_NE(python_decimal, NULLPTR); DCHECK_NE(precision, NULLPTR); DCHECK_NE(scale, NULLPTR); - OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", "()")); + // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a DCHECK + OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", "")); RETURN_IF_PYERROR(); DCHECK(PyTuple_Check(as_tuple.obj())); @@ -117,8 +134,23 @@ Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precisio const auto exponent = static_cast(PyLong_AsLong(py_exponent.obj())); RETURN_IF_PYERROR(); - *precision = num_digits; - *scale = -exponent; + const int32_t abs_exponent = std::abs(exponent); + + int32_t num_additional_zeros; + + if (num_digits <= abs_exponent) { + DCHECK_NE(exponent, 0) << "exponent should never be zero here"; + + // we have leading/trailing zeros, leading if exponent is negative + num_additional_zeros = exponent < 0 ? abs_exponent - num_digits : exponent; + *scale = static_cast(exponent < 0) * -exponent; + } else { + // we can use the number of digits as the precision + num_additional_zeros = 0; + *scale = -exponent; + } + + *precision = num_digits + num_additional_zeros; return Status::OK(); } @@ -193,6 +225,62 @@ Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) { return Status::OK(); } +bool PyFloat_isnan(PyObject* obj) { + return PyFloat_Check(obj) && std::isnan(PyFloat_AS_DOUBLE(obj)); +} + +bool PyDecimal_Check(PyObject* obj) { + // TODO(phillipc): Is this expensive? + OwnedRef Decimal; + Status status = ImportDecimalType(&Decimal); + DCHECK_OK(status); + const int32_t result = PyObject_IsInstance(obj, Decimal.obj()); + DCHECK_NE(result, -1) << " error during PyObject_IsInstance check"; + return result == 1; +} + +bool PyDecimal_ISNAN(PyObject* obj) { + DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal"; + OwnedRef is_nan(PyObject_CallMethod(obj, "is_nan", "")); + return PyObject_IsTrue(is_nan.obj()) == 1; +} + +DecimalMetadata::DecimalMetadata() + : DecimalMetadata(std::numeric_limits::min(), + std::numeric_limits::min()) {} + +DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale) + : precision_(precision), scale_(scale) {} + +Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) { + const int32_t current_precision = precision_; + precision_ = std::max(current_precision, suggested_precision); + + const int32_t current_scale = scale_; + scale_ = std::max(current_scale, suggested_scale); + + // if our suggested scale is zero and we don't yet have enough precision then we need to + // add whatever the current scale is to the precision + if (suggested_scale == 0 && suggested_precision > current_precision) { + precision_ += scale_; + } + + return Status::OK(); +} + +Status DecimalMetadata::Update(PyObject* object) { + DCHECK(PyDecimal_Check(object)) << "Object is not a Python Decimal"; + + if (ARROW_PREDICT_FALSE(PyDecimal_ISNAN(object))) { + return Status::OK(); + } + + int32_t precision; + int32_t scale; + RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); + return Update(precision, scale); +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index c0171aa2f5a..6be0e49b18a 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -36,29 +36,92 @@ namespace py { class OwnedRef; -ARROW_EXPORT -std::shared_ptr GetPrimitiveType(Type::type type); +// \brief Get an arrow DataType instance from Arrow's Type::type enum +// \param[in] type One of the values of Arrow's Type::type enum +// \return A shared pointer to DataType +ARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); namespace internal { +// \brief Import a Python module +// \param[in] module_name The name of the module +// \param[out] ref The OwnedRef containing the module PyObject* Status ImportModule(const std::string& module_name, OwnedRef* ref); -Status ImportFromModule(const OwnedRef& module, const std::string& module_name, - OwnedRef* ref); -Status PythonDecimalToString(PyObject* python_decimal, std::string* out); +// \brief Import an object from a Python module +// \param[in] module A Python module +// \param[in] name The name of the object to import +// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c +// module +Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref); + +// \brief Import +Status ImportDecimalType(OwnedRef* decimal_type); -Status InferDecimalPrecisionAndScale(PyObject* python_decimal, - int32_t* precision = NULLPTR, - int32_t* scale = NULLPTR); +// \brief Convert a Python Decimal object to a C++ string +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[out] The string representation of the Python Decimal instance +// \return The status of the operation +Status PythonDecimalToString(PyObject* python_decimal, std::string* out); +// \brief Convert a C++ std::string to a Python Decimal instance +// \param[in] decimal_constructor The decimal type object +// \param[in] decimal_string A decimal string +// \return An instance of decimal.Decimal PyObject* DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string); + +// \brief Convert a Python decimal to an Arrow Decimal128 object +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal128 +// \return The status of the operation Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, Decimal128* out); + +// \brief Check whether obj is an integer, independent of Python versions. bool IsPyInteger(PyObject* obj); +// \brief Check whether obj is nan +bool PyFloat_isnan(PyObject* obj); + +// \brief Check whether obj is an instance of Decimal +bool PyDecimal_Check(PyObject* obj); + +// \brief Check whether obj is nan. This function will abort the program if the argument +// is not a Decimal instance +bool PyDecimal_ISNAN(PyObject* obj); + +// \brief Convert a Python integer into an unsigned 64-bit integer +// \param[in] obj A Python integer +// \param[out] out A pointer to a C uint64_t to hold the result of the conversion +// \return The status of the operation Status UInt64FromPythonInt(PyObject* obj, uint64_t* out); +// \brief Helper class to track and update the precision and scale of a decimal +class DecimalMetadata { + public: + DecimalMetadata(); + DecimalMetadata(int32_t precision, int32_t scale); + + // \brief Adjust the precision and scale of a decimal type given a new precision and a + // new scale \param[in] suggested_precision A candidate precision \param[in] + // suggested_scale A candidate scale \return The status of the operation + Status Update(int32_t suggested_precision, int32_t suggested_scale); + + // \brief A convenient interface for updating the precision and scale based on a Python + // Decimal object \param object A Python Decimal object \return The status of the + // operation + Status Update(PyObject* object); + + int32_t precision() const { return precision_; } + int32_t scale() const { return scale_; } + + private: + int32_t precision_; + int32_t scale_; +}; + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 6c9c871a100..8d4308065c2 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -54,6 +54,9 @@ class Ndarray1DIndexer { T* data() const { return data_; } + T* begin() const { return data(); } + T* end() const { return begin() + size() * stride_; } + bool is_strided() const { return stride_ == 1; } T& operator[](size_type index) { return data_[index * stride_]; } diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 23418ad920c..04a71c1f640 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -67,17 +67,9 @@ constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max(); namespace { -inline bool PyFloat_isnan(PyObject* obj) { - if (PyFloat_Check(obj)) { - double val = PyFloat_AS_DOUBLE(obj); - return val != val; - } else { - return false; - } -} - inline bool PandasObjectIsNull(PyObject* obj) { - return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj); + return obj == Py_None || obj == numpy_nan || internal::PyFloat_isnan(obj) || + (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj)); } inline bool PyObject_is_string(PyObject* obj) { @@ -88,10 +80,8 @@ inline bool PyObject_is_string(PyObject* obj) { #endif } -inline bool PyObject_is_float(PyObject* obj) { return PyFloat_Check(obj); } - inline bool PyObject_is_integer(PyObject* obj) { - return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); + return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj); } template @@ -310,13 +300,18 @@ class NumPyConverter { arr_(reinterpret_cast(ao)), dtype_(PyArray_DESCR(arr_)), mask_(nullptr), - use_pandas_null_sentinels_(use_pandas_null_sentinels) { + use_pandas_null_sentinels_(use_pandas_null_sentinels), + decimal_type_() { if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast(mo); } length_ = static_cast(PyArray_SIZE(arr_)); itemsize_ = static_cast(PyArray_DESCR(arr_)->elsize); stride_ = static_cast(PyArray_STRIDES(arr_)[0]); + + PyAcquireGIL lock; + Status status = internal::ImportDecimalType(&decimal_type_); + DCHECK_OK(status); } bool is_strided() const { return itemsize_ != stride_; } @@ -491,6 +486,8 @@ class NumPyConverter { bool use_pandas_null_sentinels_; + OwnedRefNoGIL decimal_type_; + // Used in visitor pattern std::vector> out_arrays_; @@ -743,58 +740,42 @@ Status NumPyConverter::ConvertDates() { Status NumPyConverter::ConvertDecimals() { PyAcquireGIL lock; - // Import the decimal module and Decimal class - OwnedRef decimal; - OwnedRef Decimal; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); - RETURN_NOT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); - + internal::DecimalMetadata max_decimal_metadata; Ndarray1DIndexer objects(arr_); - PyObject* object = objects[0]; if (type_ == NULLPTR) { - int32_t precision; - int32_t desired_scale; - - int32_t tmp_precision; - int32_t tmp_scale; - - RETURN_NOT_OK( - internal::InferDecimalPrecisionAndScale(objects[0], &precision, &desired_scale)); - - for (int64_t i = 1; i < length_; ++i) { - RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(objects[i], &tmp_precision, - &tmp_scale)); - precision = std::max(precision, tmp_precision); - - if (std::abs(desired_scale) < std::abs(tmp_scale)) { - desired_scale = tmp_scale; - } + for (PyObject* object : objects) { + RETURN_NOT_OK(max_decimal_metadata.Update(object)); } - type_ = ::arrow::decimal(precision, desired_scale); + type_ = + ::arrow::decimal(max_decimal_metadata.precision(), max_decimal_metadata.scale()); } Decimal128Builder builder(type_, pool_); RETURN_NOT_OK(builder.Resize(length_)); const auto& decimal_type = static_cast(*type_); - PyObject* Decimal_type_object = Decimal.obj(); - for (int64_t i = 0; i < length_; ++i) { - object = objects[i]; + for (PyObject* object : objects) { + const int is_decimal = PyObject_IsInstance(object, decimal_type_.obj()); - if (PyObject_IsInstance(object, Decimal_type_object)) { - Decimal128 value; - RETURN_NOT_OK(internal::DecimalFromPythonDecimal(object, decimal_type, &value)); - RETURN_NOT_OK(builder.Append(value)); - } else if (PandasObjectIsNull(object)) { - RETURN_NOT_OK(builder.AppendNull()); - } else { + if (ARROW_PREDICT_FALSE(is_decimal == 0)) { std::stringstream ss; ss << "Error converting from Python objects to Decimal: "; RETURN_NOT_OK(InvalidConversion(object, "decimal.Decimal", &ss)); return Status::Invalid(ss.str()); + } else if (ARROW_PREDICT_FALSE(is_decimal == -1)) { + DCHECK_NE(PyErr_Occurred(), nullptr); + RETURN_IF_PYERROR(); + } + + if (PandasObjectIsNull(object)) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + Decimal128 value; + RETURN_NOT_OK(internal::DecimalFromPythonDecimal(object, decimal_type, &value)); + RETURN_NOT_OK(builder.Append(value)); } } return PushBuilderResult(&builder); @@ -1045,18 +1026,13 @@ Status NumPyConverter::ConvertObjectsInfer() { objects.Init(arr_); PyDateTime_IMPORT; - OwnedRef decimal; - OwnedRef Decimal; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); - RETURN_NOT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); - for (int64_t i = 0; i < length_; ++i) { PyObject* obj = objects[i]; if (PandasObjectIsNull(obj)) { continue; } else if (PyObject_is_string(obj)) { return ConvertObjectStrings(); - } else if (PyObject_is_float(obj)) { + } else if (PyFloat_Check(obj)) { return ConvertObjectFloats(); } else if (PyBool_Check(obj)) { return ConvertBooleans(); @@ -1069,7 +1045,7 @@ Status NumPyConverter::ConvertObjectsInfer() { return ConvertDateTimes(); } else if (PyTime_Check(obj)) { return ConvertTimes(); - } else if (PyObject_IsInstance(const_cast(obj), Decimal.obj())) { + } else if (PyObject_IsInstance(obj, decimal_type_.obj()) == 1) { return ConvertDecimals(); } else if (PyList_Check(obj)) { std::shared_ptr inferred_type; diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index b76caaecee6..16ac1e332b1 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -78,15 +78,14 @@ TEST(OwnedRefNoGIL, TestMoves) { class DecimalTest : public ::testing::Test { public: - DecimalTest() : lock_(), decimal_module_(), decimal_constructor_() { - auto s = internal::ImportModule("decimal", &decimal_module_); - DCHECK(s.ok()) << s.message(); - DCHECK_NE(decimal_module_.obj(), NULLPTR); + DecimalTest() : lock_(), decimal_constructor_() { + OwnedRef decimal_module; - s = internal::ImportFromModule(decimal_module_, "Decimal", &decimal_constructor_); - DCHECK(s.ok()) << s.message(); + Status status = internal::ImportModule("decimal", &decimal_module); + DCHECK_OK(status); - DCHECK_NE(decimal_constructor_.obj(), NULLPTR); + status = internal::ImportFromModule(decimal_module, "Decimal", &decimal_constructor_); + DCHECK_OK(status); } OwnedRef CreatePythonDecimal(const std::string& string_value) { @@ -94,16 +93,17 @@ class DecimalTest : public ::testing::Test { return ref; } + PyObject* decimal_constructor() const { return decimal_constructor_.obj(); } + private: PyAcquireGIL lock_; - OwnedRef decimal_module_; OwnedRef decimal_constructor_; }; TEST_F(DecimalTest, TestPythonDecimalToString) { std::string decimal_string("-39402950693754869342983"); - OwnedRef python_object = this->CreatePythonDecimal(decimal_string); + OwnedRef python_object(this->CreatePythonDecimal(decimal_string)); ASSERT_NE(python_object.obj(), nullptr); std::string string_result; @@ -114,35 +114,57 @@ TEST_F(DecimalTest, TestInferPrecisionAndScale) { std::string decimal_string("-394029506937548693.42983"); OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); - int32_t precision; - int32_t scale; - - ASSERT_OK( - internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal.obj())); const auto expected_precision = static_cast(decimal_string.size() - 2); // 1 for -, 1 for . const int32_t expected_scale = 5; - ASSERT_EQ(expected_precision, precision); - ASSERT_EQ(expected_scale, scale); + ASSERT_EQ(expected_precision, metadata.precision()); + ASSERT_EQ(expected_scale, metadata.scale()); } TEST_F(DecimalTest, TestInferPrecisionAndNegativeScale) { std::string decimal_string("-3.94042983E+10"); OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); - int32_t precision; - int32_t scale; - - ASSERT_OK( - internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal.obj())); const auto expected_precision = 9; const int32_t expected_scale = -2; - ASSERT_EQ(expected_precision, precision); - ASSERT_EQ(expected_scale, scale); + ASSERT_EQ(expected_precision, metadata.precision()); + ASSERT_EQ(expected_scale, metadata.scale()); +} + +TEST_F(DecimalTest, TestInferAllLeadingZeros) { + std::string decimal_string("0.001"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal.obj())); + ASSERT_EQ(3, metadata.precision()); + ASSERT_EQ(3, metadata.scale()); +} + +TEST_F(DecimalTest, TestInferAllLeadingZerosExponentialNotationPositive) { + std::string decimal_string("0.01E5"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal.obj())); + ASSERT_EQ(4, metadata.precision()); + ASSERT_EQ(0, metadata.scale()); +} + +TEST_F(DecimalTest, TestInferAllLeadingZerosExponentialNotationNegative) { + std::string decimal_string("0.01E3"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal.obj())); + ASSERT_EQ(2, metadata.precision()); + ASSERT_EQ(0, metadata.scale()); } TEST(PandasConversionTest, TestObjectBlockWriteFails) { @@ -226,14 +248,12 @@ TEST_F(DecimalTest, FromPythonDecimalRescaleTruncateable) { TEST_F(DecimalTest, TestOverflowFails) { Decimal128 value; - int32_t precision; - int32_t scale; OwnedRef python_decimal( this->CreatePythonDecimal("9999999999999999999999999999999999999.9")); - ASSERT_OK( - internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); - ASSERT_EQ(38, precision); - ASSERT_EQ(1, scale); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal.obj())); + ASSERT_EQ(38, metadata.precision()); + ASSERT_EQ(1, metadata.scale()); auto type = ::arrow::decimal(38, 38); const auto& decimal_type = static_cast(*type); @@ -241,5 +261,111 @@ TEST_F(DecimalTest, TestOverflowFails) { decimal_type, &value)); } +TEST_F(DecimalTest, TestNoneAndNaN) { + OwnedRef list_ref(PyList_New(4)); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + PyObject* constructor = this->decimal_constructor(); + PyObject* decimal_value = internal::DecimalFromString(constructor, "1.234"); + ASSERT_NE(decimal_value, nullptr); + + Py_INCREF(Py_None); + PyObject* missing_value1 = Py_None; + ASSERT_NE(missing_value1, nullptr); + + PyObject* missing_value2 = PyFloat_FromDouble(NPY_NAN); + ASSERT_NE(missing_value2, nullptr); + + PyObject* missing_value3 = internal::DecimalFromString(constructor, "nan"); + ASSERT_NE(missing_value3, nullptr); + + // This steals a reference to each object, so we don't need to decref them later, + // just the list + ASSERT_EQ(0, PyList_SetItem(list, 0, decimal_value)); + ASSERT_EQ(0, PyList_SetItem(list, 1, missing_value1)); + ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2)); + ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3)); + + MemoryPool* pool = default_memory_pool(); + std::shared_ptr arr; + ASSERT_OK(ConvertPySequence(list, pool, &arr)); + ASSERT_TRUE(arr->IsValid(0)); + ASSERT_TRUE(arr->IsNull(1)); + ASSERT_TRUE(arr->IsNull(2)); + ASSERT_TRUE(arr->IsNull(3)); +} + +TEST_F(DecimalTest, TestMixedPrecisionAndScale) { + std::vector strings{{"0.001", "1.01E5", "1.01E5"}}; + + OwnedRef list_ref(PyList_New(static_cast(strings.size()))); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + // PyList_SetItem steals a reference to the item so we don't decref it later + PyObject* decimal_constructor = this->decimal_constructor(); + for (Py_ssize_t i = 0; i < static_cast(strings.size()); ++i) { + const int result = PyList_SetItem( + list, i, internal::DecimalFromString(decimal_constructor, strings.at(i))); + ASSERT_EQ(0, result); + } + + MemoryPool* pool = default_memory_pool(); + std::shared_ptr arr; + ASSERT_OK(ConvertPySequence(list, pool, &arr)); + const auto& type = static_cast(*arr->type()); + + int32_t expected_precision = 9; + int32_t expected_scale = 3; + ASSERT_EQ(expected_precision, type.precision()); + ASSERT_EQ(expected_scale, type.scale()); +} + +TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) { + PyAcquireGIL lock; + MemoryPool* pool = default_memory_pool(); + std::shared_ptr arr; + + PyObject* value1 = this->CreatePythonDecimal("0.01").detach(); + ASSERT_NE(value1, nullptr); + + PyObject* value2 = this->CreatePythonDecimal("0.001").detach(); + ASSERT_NE(value2, nullptr); + + OwnedRef list_ref(PyList_New(2)); + PyObject* list = list_ref.obj(); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, value1), 0); + ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); + + ASSERT_OK(ConvertPySequence(list, pool, &arr)); + + const auto& type = static_cast(*arr->type()); + ASSERT_EQ(3, type.precision()); + ASSERT_EQ(3, type.scale()); +} + +TEST_F(DecimalTest, SimpleInference) { + OwnedRef value(this->CreatePythonDecimal("0.01")); + ASSERT_NE(value.obj(), nullptr); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(value.obj())); + ASSERT_EQ(2, metadata.precision()); + ASSERT_EQ(2, metadata.scale()); +} + +TEST_F(DecimalTest, UpdateWithNaN) { + internal::DecimalMetadata metadata; + OwnedRef nan_value(this->CreatePythonDecimal("nan")); + ASSERT_OK(metadata.Update(nan_value.obj())); + ASSERT_EQ(std::numeric_limits::min(), metadata.precision()); + ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index e4406747d55..6db46d48512 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -14,7 +14,6 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -// #include #include @@ -37,7 +36,7 @@ class DecimalTestFixture : public ::testing::Test { TEST_F(DecimalTestFixture, TestToString) { Decimal128 decimal(this->integer_value_); - int scale = 5; + int32_t scale = 5; std::string result = decimal.ToString(scale); ASSERT_EQ(result, this->string_value_); } @@ -45,7 +44,7 @@ TEST_F(DecimalTestFixture, TestToString) { TEST_F(DecimalTestFixture, TestFromString) { Decimal128 expected(this->integer_value_); Decimal128 result; - int precision, scale; + int32_t precision, scale; ASSERT_OK(Decimal128::FromString(this->string_value_, &result, &precision, &scale)); ASSERT_EQ(result, expected); ASSERT_EQ(precision, 8); @@ -55,8 +54,8 @@ TEST_F(DecimalTestFixture, TestFromString) { TEST_F(DecimalTestFixture, TestStringStartingWithPlus) { std::string plus_value("+234.234"); Decimal128 out; - int scale; - int precision; + int32_t scale; + int32_t precision; ASSERT_OK(Decimal128::FromString(plus_value, &out, &precision, &scale)); ASSERT_EQ(234234, out); ASSERT_EQ(6, precision); @@ -67,8 +66,8 @@ TEST_F(DecimalTestFixture, TestStringStartingWithPlus128) { std::string plus_value("+2342394230592.232349023094"); Decimal128 expected_value("2342394230592232349023094"); Decimal128 out; - int scale; - int precision; + int32_t scale; + int32_t precision; ASSERT_OK(Decimal128::FromString(plus_value, &out, &precision, &scale)); ASSERT_EQ(expected_value, out); ASSERT_EQ(25, precision); @@ -90,9 +89,7 @@ TEST(DecimalTest, TestFromDecimalString128) { Decimal128 result; ASSERT_OK(Decimal128::FromString(string_value, &result)); Decimal128 expected(static_cast(-230492239423435324)); - expected *= 100; - expected -= 12; - ASSERT_EQ(result, expected); + ASSERT_EQ(result, expected * 100 - 12); // Sanity check that our number is actually using more than 64 bits ASSERT_NE(result.high_bits(), 0); @@ -194,36 +191,36 @@ TEST(DecimalTest, TestInvalidInputWithLeadingZeros) { TEST(DecimalZerosTest, LeadingZerosNoDecimalPoint) { std::string string_value("0000000"); Decimal128 d; - int precision; - int scale; + int32_t precision; + int32_t scale; ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale)); - ASSERT_EQ(precision, 7); - ASSERT_EQ(scale, 0); - ASSERT_EQ(d, 0); + ASSERT_EQ(0, precision); + ASSERT_EQ(0, scale); + ASSERT_EQ(0, d); } TEST(DecimalZerosTest, LeadingZerosDecimalPoint) { std::string string_value("000.0000"); Decimal128 d; - int precision; - int scale; + int32_t precision; + int32_t scale; ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale)); // We explicitly do not support this for now, otherwise this would be ASSERT_EQ - ASSERT_NE(precision, 7); + ASSERT_EQ(4, precision); - ASSERT_EQ(scale, 4); - ASSERT_EQ(d, 0); + ASSERT_EQ(4, scale); + ASSERT_EQ(0, d); } TEST(DecimalZerosTest, NoLeadingZerosDecimalPoint) { std::string string_value(".00000"); Decimal128 d; - int precision; - int scale; + int32_t precision; + int32_t scale; ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale)); - ASSERT_EQ(precision, 5); - ASSERT_EQ(scale, 5); - ASSERT_EQ(d, 0); + ASSERT_EQ(5, precision); + ASSERT_EQ(5, scale); + ASSERT_EQ(0, d); } template @@ -335,16 +332,16 @@ INSTANTIATE_TEST_CASE_P(Decimal128ParsingTest, Decimal128ParsingTest, std::make_tuple("0.00123", 123ULL, 5), std::make_tuple("1.23E-8", 123ULL, 10), std::make_tuple("-1.23E-8", -123LL, 10), - std::make_tuple("1.23E+3", 123ULL, -1), - std::make_tuple("-1.23E+3", -123LL, -1), - std::make_tuple("1.23E+5", 123ULL, -3), - std::make_tuple("1.2345E+7", 12345ULL, -3), + std::make_tuple("1.23E+3", 1230ULL, 0), + std::make_tuple("-1.23E+3", -1230LL, 0), + std::make_tuple("1.23E+5", 123000ULL, 0), + std::make_tuple("1.2345E+7", 12345000ULL, 0), std::make_tuple("1.23e-8", 123ULL, 10), std::make_tuple("-1.23e-8", -123LL, 10), - std::make_tuple("1.23e+3", 123ULL, -1), - std::make_tuple("-1.23e+3", -123LL, -1), - std::make_tuple("1.23e+5", 123ULL, -3), - std::make_tuple("1.2345e+7", 12345ULL, -3))); + std::make_tuple("1.23e+3", 1230ULL, 0), + std::make_tuple("-1.23e+3", -1230LL, 0), + std::make_tuple("1.23e+5", 123000ULL, 0), + std::make_tuple("1.2345e+7", 12345000ULL, 0))); class Decimal128ParsingTestInvalid : public ::testing::TestWithParam {}; @@ -375,4 +372,14 @@ TEST(Decimal128Test, TestSmallNumberFormat) { ASSERT_EQ(expected, result); } +TEST(Decimal128Test, TestNoDecimalPointExponential) { + Decimal128 value; + int32_t precision; + int32_t scale; + ASSERT_OK(Decimal128::FromString("1E1", &value, &precision, &scale)); + ASSERT_EQ(10, value.low_bits()); + ASSERT_EQ(2, precision); + ASSERT_EQ(0, scale); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index a3c8cda7654..48380a9c980 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -23,12 +23,55 @@ #include #include +#include + #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" namespace arrow { +static const Decimal128 ScaleMultipliers[] = { + Decimal128(0LL), + Decimal128(10LL), + Decimal128(100LL), + Decimal128(1000LL), + Decimal128(10000LL), + Decimal128(100000LL), + Decimal128(1000000LL), + Decimal128(10000000LL), + Decimal128(100000000LL), + Decimal128(1000000000LL), + Decimal128(10000000000LL), + Decimal128(100000000000LL), + Decimal128(1000000000000LL), + Decimal128(10000000000000LL), + Decimal128(100000000000000LL), + Decimal128(1000000000000000LL), + Decimal128(10000000000000000LL), + Decimal128(100000000000000000LL), + Decimal128(1000000000000000000LL), + Decimal128(0LL, 10000000000000000000ULL), + Decimal128(5LL, 7766279631452241920ULL), + Decimal128(54LL, 3875820019684212736ULL), + Decimal128(542LL, 1864712049423024128ULL), + Decimal128(5421LL, 200376420520689664ULL), + Decimal128(54210LL, 2003764205206896640ULL), + Decimal128(542101LL, 1590897978359414784ULL), + Decimal128(5421010LL, 15908979783594147840ULL), + Decimal128(54210108LL, 11515845246265065472ULL), + Decimal128(542101086LL, 4477988020393345024ULL), + Decimal128(5421010862LL, 7886392056514347008ULL), + Decimal128(54210108624LL, 5076944270305263616ULL), + Decimal128(542101086242LL, 13875954555633532928ULL), + Decimal128(5421010862427LL, 9632337040368467968ULL), + Decimal128(54210108624275LL, 4089650035136921600ULL), + Decimal128(542101086242752LL, 4003012203950112768ULL), + Decimal128(5421010862427522LL, 3136633892082024448ULL), + Decimal128(54210108624275221LL, 12919594847110692864ULL), + Decimal128(542101086242752217LL, 68739955140067328ULL), + Decimal128(5421010862427522170LL, 687399551400673280ULL)}; + static constexpr uint64_t kIntMask = 0xFFFFFFFF; static constexpr auto kCarryBit = static_cast(1) << static_cast(32); @@ -49,7 +92,7 @@ std::array Decimal128::ToBytes() const { } void Decimal128::ToBytes(uint8_t* out) const { - DCHECK_NE(out, NULLPTR); + DCHECK_NE(out, nullptr); reinterpret_cast(out)[0] = BitUtil::ToLittleEndian(low_bits_); reinterpret_cast(out)[1] = BitUtil::ToLittleEndian(high_bits_); } @@ -187,12 +230,10 @@ static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 1] = {1LL, 100000000000000000LL, 1000000000000000000LL}; -static inline bool isdigit(char value) { return std::isdigit(value) != 0; } - static void StringToInteger(const std::string& str, Decimal128* out) { using std::size_t; - DCHECK_NE(out, NULLPTR) << "Decimal128 output variable cannot be NULLPTR"; + DCHECK_NE(out, nullptr) << "Decimal128 output variable cannot be nullptr"; DCHECK_EQ(*out, 0) << "When converting a string to Decimal128 the initial output must be 0"; @@ -212,160 +253,124 @@ static void StringToInteger(const std::string& str, Decimal128* out) { } } -Status Decimal128::FromString(const std::string& s, Decimal128* out, int* precision, - int* scale) { - // Implements this regex: "(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?((E|e)(\\+|-)?\\d+)?"; - if (s.empty()) { - return Status::Invalid("Empty string cannot be converted to decimal"); - } +static const boost::regex DECIMAL_REGEX( + // sign of the number + "(?[-+]?)" - std::string::const_iterator charp = s.cbegin(); - std::string::const_iterator end = s.cend(); + // digits around the decimal point + "(((?\\d+)\\.(?\\d*)|\\.(?\\d+)" + ")" - char first_char = *charp; - bool is_negative = false; - if (first_char == '+' || first_char == '-') { - is_negative = first_char == '-'; - ++charp; - } + // optional exponent + "([eE](?[-+]?\\d+))?" - if (charp == end) { - std::stringstream ss; - ss << "Single character: '" << first_char << "' is not a valid decimal value"; - return Status::Invalid(ss.str()); - } + // otherwise + "|" - std::string::const_iterator numeric_string_start = charp; + // we're just an integer + "(?\\d+)" - DCHECK_LT(charp, end); + // or an integer with an exponent + "(?:[eE](?[-+]?\\d+))?)"); - // skip leading zeros - charp = std::find_if_not(charp, end, [](char value) { return value == '0'; }); +static inline bool is_zero_character(char c) { return c == '0'; } - // all zeros and no decimal point - if (charp == end) { - if (out != NULLPTR) { - *out = 0; - } +Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, + int32_t* scale) { + if (s.empty()) { + return Status::Invalid("Empty string cannot be converted to decimal"); + } - // Not sure what other libraries assign precision to for this case (this case of - // a string consisting only of one or more zeros) - if (precision != NULLPTR) { - *precision = static_cast(charp - numeric_string_start); + // case of all zeros + if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) { + if (precision != nullptr) { + *precision = 0; } - if (scale != NULLPTR) { + if (scale != nullptr) { *scale = 0; } + *out = 0; return Status::OK(); } - std::string::const_iterator whole_part_start = charp; + boost::smatch results; + const bool matches = boost::regex_match(s, results, DECIMAL_REGEX); - charp = std::find_if_not(charp, end, isdigit); + if (!matches) { + std::stringstream ss; + ss << "The string " << s << " is not a valid decimal number"; + return Status::Invalid(ss.str()); + } - std::string::const_iterator whole_part_end = charp; - std::string whole_part(whole_part_start, whole_part_end); + const std::string sign = results["SIGN"]; + const std::string integer = results["INTEGER"]; - if (charp != end && *charp == '.') { - ++charp; + const std::string left_digits = results["LEFT_DIGITS"]; + const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"]; - if (charp == end) { - return Status::Invalid( - "Decimal point must be followed by at least one base ten digit. Reached the " - "end of the string."); - } + const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"]; - if (std::isdigit(*charp) == 0) { - std::stringstream ss; - ss << "Decimal point must be followed by a base ten digit. Found '" << *charp - << "'"; - return Status::Invalid(ss.str()); - } - } else { - if (charp != end) { - std::stringstream ss; - ss << "Expected base ten digit or decimal point but found '" << *charp - << "' instead."; - return Status::Invalid(ss.str()); - } - } + const std::string first_exp_value = results["FIRST_EXP_VALUE"]; + const std::string second_exp_value = results["SECOND_EXP_VALUE"]; - std::string::const_iterator fractional_part_start = charp; + std::string whole_part; + std::string fractional_part; + std::string exponent_value; - // The rest must be digits or an exponent - if (charp != end) { - charp = std::find_if_not(charp, end, isdigit); - - // The while loop has ended before the end of the string which means we've hit a - // character that isn't a base ten digit or "E" for exponent - if (charp != end && *charp != 'E' && *charp != 'e') { - std::stringstream ss; - ss << "Found non base ten digit character '" << *charp - << "' before the end of the string"; - return Status::Invalid(ss.str()); - } + if (!integer.empty()) { + whole_part = integer; + } else if (!left_digits.empty()) { + DCHECK(second_right_digits.empty()) << s << " " << second_right_digits; + whole_part = left_digits; + fractional_part = first_right_digits; + } else { + DCHECK(first_right_digits.empty()) << s << " " << first_right_digits; + fractional_part = second_right_digits; } - std::string::const_iterator fractional_part_end = charp; - std::string fractional_part(fractional_part_start, fractional_part_end); + // skip leading zeros before the decimal point + std::string::const_iterator without_leading_zeros = + std::find_if_not(whole_part.cbegin(), whole_part.cend(), is_zero_character); + whole_part = std::string(without_leading_zeros, whole_part.cend()); - if (precision != NULLPTR) { - *precision = static_cast(whole_part.size() + fractional_part.size()); + if (!first_exp_value.empty()) { + exponent_value = first_exp_value; + } else { + exponent_value = second_exp_value; } - if (charp != end) { - // we must have an exponent, if this aborts then we have somehow not caught this and - // raised a proper error - DCHECK(*charp == 'E' || *charp == 'e'); - - ++charp; - - const char value = *charp; - const bool starts_with_plus_or_minus = value == '+' || value == '-'; - - // we use this to construct the adjusted exponent integer later - std::string::const_iterator digit_start = charp; - - // skip plus or minus - charp += starts_with_plus_or_minus; - - // confirm that the rest of the characters are digits - charp = std::find_if_not(charp, end, isdigit); - - if (charp != end) { - // we have something other than digits here - std::stringstream ss; - ss << "Found non decimal digit exponent value '" << *charp << "'"; - return Status::Invalid(ss.str()); - } - - if (scale != NULLPTR) { - // compute the scale from the adjusted exponent - std::string adjusted_exponent_string(digit_start, end); - DCHECK(std::all_of(adjusted_exponent_string.cbegin() + starts_with_plus_or_minus, - adjusted_exponent_string.cend(), isdigit)) - << "Non decimal digit character found in " << adjusted_exponent_string; - const auto adjusted_exponent = - static_cast(std::stol(adjusted_exponent_string)); - const auto len = static_cast(whole_part.size() + fractional_part.size()); + if (precision != nullptr) { + *precision = static_cast(whole_part.size() + fractional_part.size()); + } + if (scale != nullptr) { + if (!exponent_value.empty()) { + auto adjusted_exponent = static_cast(std::stol(exponent_value)); + auto len = static_cast(whole_part.size() + fractional_part.size()); *scale = -adjusted_exponent + len - 1; - } - } else { - if (scale != NULLPTR) { - *scale = static_cast(fractional_part.size()); + } else { + *scale = static_cast(fractional_part.size()); } } - if (out != NULLPTR) { - // zero out in case we've passed in a previously used value + if (out != nullptr) { *out = 0; StringToInteger(whole_part + fractional_part, out); - if (is_negative) { + if (sign == "-") { out->Negate(); } + + if (scale != nullptr && *scale < 0) { + const int32_t abs_scale = std::abs(*scale); + *out *= ScaleMultipliers[abs_scale]; + + if (precision != nullptr) { + *precision += abs_scale; + } + *scale = 0; + } } return Status::OK(); @@ -813,47 +818,6 @@ Decimal128 operator%(const Decimal128& left, const Decimal128& right) { return remainder; } -static const Decimal128 ScaleMultipliers[] = { - Decimal128(1), - Decimal128(10), - Decimal128(100), - Decimal128(1000), - Decimal128(10000), - Decimal128(100000), - Decimal128(1000000), - Decimal128(10000000), - Decimal128(100000000), - Decimal128(1000000000), - Decimal128(10000000000), - Decimal128(100000000000), - Decimal128(1000000000000), - Decimal128(10000000000000), - Decimal128(100000000000000), - Decimal128(1000000000000000), - Decimal128(10000000000000000), - Decimal128(100000000000000000), - Decimal128(1000000000000000000), - Decimal128("10000000000000000000"), - Decimal128("100000000000000000000"), - Decimal128("1000000000000000000000"), - Decimal128("10000000000000000000000"), - Decimal128("100000000000000000000000"), - Decimal128("1000000000000000000000000"), - Decimal128("10000000000000000000000000"), - Decimal128("100000000000000000000000000"), - Decimal128("1000000000000000000000000000"), - Decimal128("10000000000000000000000000000"), - Decimal128("100000000000000000000000000000"), - Decimal128("1000000000000000000000000000000"), - Decimal128("10000000000000000000000000000000"), - Decimal128("100000000000000000000000000000000"), - Decimal128("1000000000000000000000000000000000"), - Decimal128("10000000000000000000000000000000000"), - Decimal128("100000000000000000000000000000000000"), - Decimal128("1000000000000000000000000000000000000"), - Decimal128("10000000000000000000000000000000000000"), - Decimal128("100000000000000000000000000000000000000")}; - static bool RescaleWouldCauseDataLoss(const Decimal128& value, int32_t delta_scale, int32_t abs_delta_scale, Decimal128* result) { Decimal128 multiplier(ScaleMultipliers[abs_delta_scale]); @@ -872,7 +836,7 @@ static bool RescaleWouldCauseDataLoss(const Decimal128& value, int32_t delta_sca Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const { - DCHECK_NE(out, NULLPTR) << "out is nullptr"; + DCHECK_NE(out, nullptr) << "out is nullptr"; DCHECK_NE(original_scale, new_scale) << "original_scale != new_scale"; const int32_t delta_scale = new_scale - original_scale; diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 1594090a0d3..79a99ba6aba 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -124,7 +124,7 @@ class ARROW_EXPORT Decimal128 { /// \brief Convert a decimal string to an Decimal128 value, optionally including /// precision and scale if they're passed in and not null. static Status FromString(const std::string& s, Decimal128* out, - int* precision = NULLPTR, int* scale = NULLPTR); + int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); /// \brief Convert Decimal128 from one scale to another Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const; diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 4ca4d22527f..c823f06bdf6 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -30,7 +30,7 @@ namespace arrow { // // Add more as needed. -// Log levels. LOG ignores them, so their values are abitrary. +// Log levels. LOG ignores them, so their values are arbitrary. #define ARROW_DEBUG (-1) #define ARROW_INFO 0 @@ -53,6 +53,9 @@ namespace arrow { #define DCHECK(condition) \ ARROW_IGNORE_EXPR(condition) \ while (false) ::arrow::internal::NullLog() +#define DCHECK_OK(status) \ + ARROW_IGNORE_EXPR(status) \ + while (false) ::arrow::internal::NullLog() #define DCHECK_EQ(val1, val2) \ ARROW_IGNORE_EXPR(val1) \ while (false) ::arrow::internal::NullLog() @@ -76,6 +79,7 @@ namespace arrow { #define ARROW_DFATAL ARROW_FATAL #define DCHECK(condition) ARROW_CHECK(condition) +#define DCHECK_OK(status) (ARROW_CHECK((status).ok()) << (status).message()) #define DCHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2)) #define DCHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2)) #define DCHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2)) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 8423ff00b67..19b59a49b6d 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -639,3 +639,13 @@ def test_structarray_from_arrays_coerce(): pa.StructArray.from_arrays(arrays) assert result.equals(expected) + + +def test_decimal_array_with_none_and_nan(): + values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')] + array = pa.array(values) + assert array.type == pa.decimal128(4, 3) + assert array.to_pylist() == values[:2] + [None, None] + + array = pa.array(values, type=pa.decimal128(10, 4)) + assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None] diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 986aeffcacc..813fbdf40bb 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1161,6 +1161,20 @@ def test_decimal_fails_with_truncation(self): with pytest.raises(pa.ArrowException): pa.array(data2, type=type2) + def test_decimal_with_different_precisions(self): + data = [ + decimal.Decimal('0.01'), + decimal.Decimal('0.001'), + ] + series = pd.Series(data) + array = pa.array(series) + assert array.to_pylist() == data + assert array.type == pa.decimal128(3, 3) + + array = pa.array(data, type=pa.decimal128(12, 5)) + expected = [decimal.Decimal('0.01000'), decimal.Decimal('0.00100')] + assert array.to_pylist() == expected + class TestListTypes(object): """