From 6de6f0fc6feec214b8d573120cda2f3f3cfdce99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 1 Sep 2020 01:49:10 +0200 Subject: [PATCH 01/80] refactor python to arrow conversions --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/builder_base.h | 2 + cpp/src/arrow/python/python_to_arrow.cc | 1287 ++++++----------------- cpp/src/arrow/python/python_to_arrow.h | 1 + cpp/src/arrow/type_traits.h | 27 + cpp/src/arrow/util/converter.cc | 1 + cpp/src/arrow/util/converter.h | 248 +++++ 7 files changed, 620 insertions(+), 947 deletions(-) create mode 100644 cpp/src/arrow/util/converter.cc create mode 100644 cpp/src/arrow/util/converter.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f40fa3798b4..a0eb5dc686f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -182,6 +182,7 @@ set(ARROW_SRCS util/bitmap_ops.cc util/bpacking.cc util/compression.cc + util/converter.cc util/cpu_info.cc util/decimal.cc util/delimiting.cc diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 33b1f9b3e66..19cbc0a82c1 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -56,6 +56,8 @@ class ARROW_EXPORT ArrayBuilder { /// skip shared pointers and just return a raw pointer ArrayBuilder* child(int i) { return children_[i].get(); } + std::shared_ptr child_builder(int i) const { return children_[i]; } + int num_children() const { return static_cast(children_.size()); } virtual int64_t length() const { return length_; } diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 849c474ded3..a1df54857d1 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -35,6 +35,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/converter.h" #include "arrow/util/decimal.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" @@ -46,6 +47,7 @@ #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" +#include "arrow/visitor_inline.h" namespace arrow { @@ -54,40 +56,36 @@ using internal::checked_pointer_cast; namespace py { -// ---------------------------------------------------------------------- -// NullCoding - -enum class NullCoding : char { NONE_ONLY, PANDAS_SENTINELS }; - -template -struct NullChecker {}; +class PyValue { + public: + using I = PyObject*; + using O = PyConversionOptions; -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return obj == Py_None; } -}; + static bool IsNull(const DataType&, const O& options, I obj) { + if (options.from_pandas) { + return internal::PandasObjectIsNull(obj); + } else { + return obj == Py_None; + } + } -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } -}; + static bool IsNaT(const TimestampType&, int64_t value) { + return internal::npy_traits::isnull(value); + } -// ---------------------------------------------------------------------- -// ValueConverters -// -// Typed conversion logic for single python objects are encapsulated in -// ValueConverter structs using SFINAE for specialization. -// -// The FromPython medthod is responsible to convert the python object to the -// C++ value counterpart which can be directly appended to the ArrayBuilder or -// Scalar can be constructed from. 
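The PyValue::Convert overloads being added here replace the per-type ValueConverter specializations removed above: instead of SFINAE-selected templates, ordinary overload resolution on the concrete DataType argument picks the conversion. A minimal standalone sketch of that overload-per-type pattern, using hypothetical Int32Tag/StringTag stand-ins rather than Arrow's DataType classes so it compiles without the library:

#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical stand-ins for concrete DataType subclasses.
struct Int32Tag {};
struct StringTag {};

struct Value {
  // One static overload per logical type; the compiler picks the conversion
  // from the first argument, so callers just write Value::Convert(tag, input).
  static int32_t Convert(const Int32Tag&, const std::string& in) {
    return static_cast<int32_t>(std::stol(in));
  }
  static std::string Convert(const StringTag&, const std::string& in) { return in; }
};

int main() {
  std::cout << Value::Convert(Int32Tag{}, "42") + 1 << "\n";  // prints 43
  std::cout << Value::Convert(StringTag{}, "abc") << "\n";    // prints abc
  return 0;
}

In the patch the same idea is combined with templated overloads (e.g. the integer Convert guarded by enable_if_integer) where several types share one conversion path, and errors are reported through Result/Status rather than exceptions.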
+ static bool IsNaT(const DurationType&, int64_t value) { + return internal::npy_traits::isnull(value); + } -template -struct ValueConverter {}; + static Result Convert(const NullType&, const O&, I obj) { + if (obj == Py_None) { + return nullptr; + } else { + return Status::Invalid("Invalid null value"); + } + } -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const BooleanType&, const O&, I obj) { if (obj == Py_True) { return true; } else if (obj == Py_False) { @@ -98,38 +96,32 @@ struct ValueConverter { return internal::InvalidValue(obj, "tried to convert to boolean"); } } -}; template struct ValueConverter> { using ValueType = typename Type::c_type; - static inline Result FromPython(PyObject* obj) { - ValueType value; - arrow::Status s_ = internal::CIntFromPython(obj, &value); - if (!s_.ok() && !internal::PyIntScalar_Check(obj)) { + template + static enable_if_integer> Convert(const T&, const O&, + I obj) { + typename T::c_type value; + auto status = internal::CIntFromPython(obj, &value); + if (status.ok()) { + return value; + } else if (!internal::PyIntScalar_Check(obj)) { return internal::InvalidValue(obj, "tried to convert to int"); } else { - RETURN_NOT_OK(s_); + return status; } - return value; } -}; - -template <> -struct ValueConverter { - using ValueType = typename HalfFloatType::c_type; - static inline Result FromPython(PyObject* obj) { - ValueType value; + static Result Convert(const HalfFloatType&, const O&, I obj) { + uint16_t value; RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const FloatType&, const O&, I obj) { float value; if (internal::PyFloatScalar_Check(obj)) { value = static_cast(PyFloat_AsDouble(obj)); @@ -141,11 +133,8 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const DoubleType&, const O&, I obj) { double value; if (PyFloat_Check(obj)) { value = PyFloat_AS_DOUBLE(obj); @@ -160,11 +149,14 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const Decimal128Type& type, const O&, I obj) { + Decimal128 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, type, &value)); + return value; + } + + static Result Convert(const Date32Type&, const O&, I obj) { int32_t value; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -175,16 +167,14 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const Date64Type&, const O&, I obj) { int64_t value; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); value = internal::PyDateTime_to_ms(pydate); // Truncate any intraday milliseconds + // TODO: introduce an option for this value -= value % 86400000LL; } else if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -195,16 +185,12 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool /*ignore_timezone*/) { + static Result Convert(const Time32Type& type, const O&, I obj) { int32_t value; if (PyTime_Check(obj)) { // TODO(kszucs): consider to raise if a timezone aware time object is encountered - switch (unit) { + 
switch (type.unit()) { case TimeUnit::SECOND: value = static_cast(internal::PyTime_to_s(obj)); break; @@ -220,16 +206,12 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool /*ignore_timezone=*/) { + static Result Convert(const Time64Type& type, const O&, I obj) { int64_t value; if (PyTime_Check(obj)) { // TODO(kszucs): consider to raise if a timezone aware time object is encountered - switch (unit) { + switch (type.unit()) { case TimeUnit::MICRO: value = internal::PyTime_to_us(obj); break; @@ -245,20 +227,16 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool ignore_timezone) { + static Result Convert(const TimestampType& type, const O& options, I obj) { int64_t value; if (PyDateTime_Check(obj)) { ARROW_ASSIGN_OR_RAISE(int64_t offset, internal::PyDateTime_utcoffset_s(obj)); - if (ignore_timezone) { + if (options.ignore_timezone) { offset = 0; } auto dt = reinterpret_cast(obj); - switch (unit) { + switch (type.unit()) { case TimeUnit::SECOND: value = internal::PyDateTime_to_s(dt) - offset; break; @@ -282,38 +260,28 @@ struct ValueConverter { default: return Status::UnknownError("Invalid time unit"); } + } else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(type)) { + // TODO(kszucs): the message should highlight the received numpy dtype + // TODO(kszucs): it also validates the unit, so add the unit to the error message + return Status::Invalid("Expected np.datetime64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result FromNumpy(PyObject* obj, TimeUnit::type unit) { - // validate that the numpy scalar has np.datetime64 dtype - std::shared_ptr type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != TimestampType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.datetime64 but got: ", type->ToString()); - } - // validate that the time units are matching - if (unit != checked_cast(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.datetime64 objects with differing unit"); - } - // convert the numpy value - return reinterpret_cast(obj)->obval; - } -}; - -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool /*ignore_timezone*/) { + static Result Convert(const DurationType& type, const O&, I obj) { int64_t value; if (PyDelta_Check(obj)) { auto dt = reinterpret_cast(obj); - switch (unit) { + switch (type.unit()) { case TimeUnit::SECOND: value = internal::PyDelta_to_s(dt); break; @@ -329,186 +297,80 @@ struct ValueConverter { default: return Status::UnknownError("Invalid time unit"); } + } else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(type)) { + // TODO(kszucs): the message should highlight the received numpy dtype + // TODO(kszucs): it also validates the unit, so add the 
unit to the error message + return Status::Invalid("Expected np.timedelta64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result FromNumpy(PyObject* obj, TimeUnit::type unit) { - // validate that the numpy scalar has np.timedelta64 dtype - std::shared_ptr type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != DurationType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); - } - // validate that the time units are matching - if (unit != checked_cast(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.timedelta64 objects with differing unit"); - } - // convert the numpy value - return reinterpret_cast(obj)->obval; + static Result Convert(const BaseBinaryType&, const O&, I obj) { + PyBytesView view; + RETURN_NOT_OK(view.FromString(obj)); + return util::string_view(view.bytes, view.size); } -}; -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const FixedSizeBinaryType& type, const O&, + I obj) { PyBytesView view; RETURN_NOT_OK(view.FromString(obj)); - return std::move(view); + if (ARROW_PREDICT_TRUE(view.size == type.byte_width())) { + return util::string_view(view.bytes, view.size); + } else { + std::stringstream ss; + ss << "expected to be length " << type.byte_width() << " was " << view.size; + return internal::InvalidValue(obj, ss.str()); + } } -}; -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const StringType& type, const O& options, + I obj) { // strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 bool is_utf8 = false; PyBytesView view; - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); + // TODO(kszucs): pass strict conversion in options if (!is_utf8) { return internal::InvalidValue(obj, "was not a utf8 string"); } - return std::move(view); + return util::string_view(view.bytes, view.size); } - static inline Result FromPython(PyObject* obj, bool* is_utf8) { - PyBytesView view; - - // Non-strict conversion; keep track of whether values are unicode or bytes - if (PyUnicode_Check(obj)) { - *is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); - } else { - // If not unicode or bytes, FromBinary will error - *is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); - } - return std::move(view); + static Result Convert(const DataType&, const O&, I obj) { + return Status::NotImplemented(""); } }; -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj, int32_t byte_width) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - if (ARROW_PREDICT_FALSE(view.size != byte_width)) { - std::stringstream ss; - ss << "expected to be length " << byte_width << " was " << view.size; - return internal::InvalidValue(obj, ss.str()); - } else { - return std::move(view); - } - } -}; - -// ---------------------------------------------------------------------- -// Sequence converter base and CRTP "middle" subclasses - -class SeqConverter; - -// Forward-declare converter factory -Status GetConverter(const std::shared_ptr& type, bool from_pandas, - bool strict_conversions, bool ignore_timezone, - std::unique_ptr* out); - -// Marshal Python sequence (list, tuple, etc.) 
to Arrow array -class SeqConverter { +class PyArrayConverter : public ArrayConverter { public: - virtual ~SeqConverter() = default; - - // Initialize the sequence converter with an ArrayBuilder created - // externally. The reason for this interface is that we have - // arrow::MakeBuilder which also creates child builders for nested types, so - // we have to pass in the child builders to child SeqConverter in the case of - // converting Python objects to Arrow nested types - virtual Status Init(ArrayBuilder* builder) = 0; - - // Append a single null value to the builder - virtual Status AppendNull() = 0; - - // Append a valid value - virtual Status AppendValue(PyObject* seq) = 0; - - // Append a single python object handling Null values - virtual Status Append(PyObject* seq) = 0; - - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status Extend(PyObject* seq, int64_t size) = 0; - - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; - - virtual Status Close() { - if (chunks_.size() == 0 || builder_->length() > 0) { - std::shared_ptr last_chunk; - RETURN_NOT_OK(builder_->Finish(&last_chunk)); - chunks_.emplace_back(std::move(last_chunk)); - } - return Status::OK(); - } - - virtual Status GetResult(std::shared_ptr* out) { - // Still some accumulated data in the builder. If there are no chunks, we - // always call Finish to deal with the edge case where a size-0 sequence - // was converted with a specific output type, like array([], type=t) - RETURN_NOT_OK(Close()); - *out = std::make_shared(this->chunks_, builder_->type()); - return Status::OK(); - } - - ArrayBuilder* builder() const { return builder_; } - - int num_chunks() const { return static_cast(chunks_.size()); } + using ArrayConverter::ArrayConverter; - protected: - ArrayBuilder* builder_; - bool unfinished_builder_; - std::vector> chunks_; -}; - -template -class TypedConverter : public SeqConverter { - public: - using BuilderType = typename TypeTraits::BuilderType; - - Status Init(ArrayBuilder* builder) override { - builder_ = builder; - DCHECK_NE(builder_, nullptr); - typed_builder_ = checked_cast(builder); - return Status::OK(); - } - - // Append a missing item (default implementation) - Status AppendNull() override { return this->typed_builder_->AppendNull(); } - - // Append null if the obj is None or pandas null otherwise the valid value - Status Append(PyObject* obj) override { - return NullChecker::Check(obj) ? 
AppendNull() : AppendValue(obj); - } - - Status Extend(PyObject* obj, int64_t size) override { + Status Extend(PyObject* values, int64_t size) override { /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(this->Reserve(size)); // Iterate over the items adding each one - return internal::VisitSequence( - obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); + return internal::VisitSequence(values, [this](PyObject* item, bool* /* unused */) { + return this->Append(item); + }); } - Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { + Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size) { /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(this->Reserve(size)); // Iterate over the items adding each one return internal::VisitSequenceMasked( - obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { + values, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { if (is_masked) { return this->AppendNull(); } else { @@ -518,796 +380,332 @@ class TypedConverter : public SeqConverter { } }); } - - protected: - BuilderType* typed_builder_; }; -// ---------------------------------------------------------------------- -// Sequence converter for null type - -template -class NullConverter : public TypedConverter { +template +class PyPrimitiveArrayConverter : public TypedArrayConverter { public: - Status AppendValue(PyObject* obj) override { - return internal::InvalidValue(obj, "converting to null type"); - } -}; - -// ---------------------------------------------------------------------- -// Sequence converter template for primitive (integer and floating point bool) types - -template -class PrimitiveConverter : public TypedConverter { - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); - return this->typed_builder_->Append(value); - } -}; - -// ---------------------------------------------------------------------- -// Sequence converters for temporal types + using TypedArrayConverter::TypedArrayConverter; -template -class TimeConverter : public TypedConverter { - public: - explicit TimeConverter(TimeUnit::type unit, bool ignore_timezone) - : unit_(unit), ignore_timezone_(ignore_timezone) {} - - // TODO(kszucs): support numpy values for date and time converters - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, - ValueConverter::FromPython(obj, unit_, ignore_timezone_)); - return this->typed_builder_->Append(value); - } - - protected: - TimeUnit::type unit_; - bool ignore_timezone_; -}; - -// TODO(kszucs): move it to the type_traits -template -struct NumpyType {}; - -template <> -struct NumpyType { - static inline bool isnull(int64_t v) { - return internal::npy_traits::isnull(v); - } -}; - -template <> -struct NumpyType { - static inline bool isnull(int64_t v) { - return internal::npy_traits::isnull(v); - } -}; - -template -class TemporalConverter : public TimeConverter { - public: - using TimeConverter::TimeConverter; - - Status AppendValue(PyObject* obj) override { - int64_t value; - if (PyArray_CheckAnyScalarExact(obj)) { - // convert np.datetime64 / np.timedelta64 depending on Type - ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromNumpy(obj, this->unit_)); - if (NumpyType::isnull(value)) { - // checks numpy NaT sentinel after conversion - return this->typed_builder_->AppendNull(); - } + Status 
Append(PyObject* value) override { + if (PyValue::IsNull(this->type_, this->options_, value)) { + return this->builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE( - value, - ValueConverter::FromPython( - obj, this->unit_, TimeConverter::ignore_timezone_)); + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->type_, this->options_, value)); + return this->builder_->Append(converted); } - return this->typed_builder_->Append(value); } }; -// ---------------------------------------------------------------------- -// Sequence converters for Binary, FixedSizeBinary, String - -template -class BinaryLikeConverter : public TypedConverter { - public: - using BuilderType = typename TypeTraits::BuilderType; - - inline Status AutoChunk(Py_ssize_t size) { - // did we reach the builder size limit? - if (ARROW_PREDICT_FALSE(this->typed_builder_->value_data_length() + size > - BuilderType::memory_limit())) { - // builder would be full, so need to add a new chunk - std::shared_ptr chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); - } - return Status::OK(); - } - - Status AppendString(const PyBytesView& view) { - // check that the value fits in the datatype - if (view.size > BuilderType::memory_limit()) { - return Status::Invalid("string too large for datatype"); - } - DCHECK_GE(view.size, 0); - - // create a new chunk if the value would overflow the builder - RETURN_NOT_OK(AutoChunk(view.size)); - - // now we can safely append the value to the builder - RETURN_NOT_OK( - this->typed_builder_->Append(::arrow::util::string_view(view.bytes, view.size))); - - return Status::OK(); - } - - protected: - // Create a single instance of PyBytesView here to prevent unnecessary object - // creation/destruction - PyBytesView string_view_; -}; - -template -class BinaryConverter : public BinaryLikeConverter { - public: - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto view, ValueConverter::FromPython(obj)); - return this->AppendString(view); - } -}; - -template -class FixedSizeBinaryConverter - : public BinaryLikeConverter { - public: - explicit FixedSizeBinaryConverter(int32_t byte_width) : byte_width_(byte_width) {} - - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE( - this->string_view_, - ValueConverter::FromPython(obj, byte_width_)); - return this->AppendString(this->string_view_); - } - - protected: - int32_t byte_width_; -}; - -// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, -// otherwise we allow but return results as BinaryArray -template -class StringConverter : public BinaryLikeConverter { +template +class PyPrimitiveArrayConverter< + T, enable_if_t::value || is_duration_type::value>> + : public TypedArrayConverter { public: - StringConverter() : binary_count_(0) {} + using TypedArrayConverter::TypedArrayConverter; - Status AppendValue(PyObject* obj) override { - if (Strict) { - // raise if the object is not unicode or not an utf-8 encoded bytes - ARROW_ASSIGN_OR_RAISE(this->string_view_, ValueConverter::FromPython(obj)); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->type_, this->options_, value)) { + return this->builder_->AppendNull(); } else { - // keep track of whether values are unicode or bytes; if any bytes are - // observe, the result will be bytes - bool is_utf8; - ARROW_ASSIGN_OR_RAISE(this->string_view_, - ValueConverter::FromPython(obj, &is_utf8)); - if (!is_utf8) { - ++binary_count_; + ARROW_ASSIGN_OR_RAISE(auto converted, 
+ PyValue::Convert(this->type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->type_, converted)) { + return this->builder_->AppendNull(); + } else { + return this->builder_->Append(converted); } } - return this->AppendString(this->string_view_); - } - - Status GetResult(std::shared_ptr* out) override { - RETURN_NOT_OK(SeqConverter::GetResult(out)); - - // If we saw any non-unicode, cast results to BinaryArray - if (binary_count_) { - // We should have bailed out earlier - DCHECK(!Strict); - auto binary_type = TypeTraits::type_singleton(); - return (*out)->View(binary_type).Value(out); - } - return Status::OK(); } - - protected: - int64_t binary_count_; }; -// ---------------------------------------------------------------------- -// Convert lists (NumPy arrays containing lists or ndarrays as values) - // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ - case Type::TYPE: { \ - if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \ - return value_converter_->Extend(obj, value_length); \ - } \ - return AppendNdarrayTypedItem(arr); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return this->value_converter_->Extend(value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ } // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE) \ - case Type::TYPE: { \ - return value_converter_->Extend(obj, value_length); \ +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: { \ + return this->value_converter_->Extend(value, size); \ } -// Base class for ListConverter and FixedSizeListConverter (to have both work with CRTP) -template -class BaseListConverter : public TypedConverter { +template +class PyListArrayConverter : public ListArrayConverter { public: - using BuilderType = typename TypeTraits::BuilderType; - - explicit BaseListConverter(bool from_pandas, bool strict_conversions, - bool ignore_timezone) - : from_pandas_(from_pandas), - strict_conversions_(strict_conversions), - ignore_timezone_(ignore_timezone) {} - - Status Init(ArrayBuilder* builder) override { - this->builder_ = builder; - this->typed_builder_ = checked_cast(builder); - - this->value_type_ = checked_cast(*builder->type()).value_type(); - RETURN_NOT_OK(GetConverter(value_type_, from_pandas_, strict_conversions_, - ignore_timezone_, &value_converter_)); - return this->value_converter_->Init(this->typed_builder_->value_builder()); + using ListArrayConverter::ListArrayConverter; + + Status ValidateSize(const FixedSizeListType& type, int64_t size) { + // TODO(kszucs): perhaps this should be handled somewhere else + if (type.list_size() != size) { + return Status::Invalid("Length of item not correct: expected ", type.list_size(), + " but got array of size ", size); + } else { + return Status::OK(); + } } - template - Status AppendNdarrayTypedItem(PyArrayObject* arr) { - using traits = internal::npy_traits; - using T = typename traits::value_type; - using ValueBuilderType = typename TypeTraits::BuilderType; + Status ValidateBuilder(const MapType&) { + // TODO(kszucs): perhaps this should be handled somewhere else + if (this->builder_->key_builder()->null_count() > 0) { + return Status::Invalid("Invalid Map: key field can not contain null values"); + } else { + return Status::OK(); + } + } - const bool null_sentinels_possible = - // 
Always treat Numpy's NaT as null - NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || - // Observing pandas's null sentinels - (from_pandas_ && traits::supports_nulls); + Status ValidateBuilder(const DataType&) { return Status::OK(); } + Status ValidateSize(const BaseListType&, int64_t size) { return Status::OK(); } - auto child_builder = checked_cast(value_converter_->builder()); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->type_, this->options_, value)) { + return this->builder_->AppendNull(); + } - // TODO(wesm): Vector append when not strided - Ndarray1DIndexer values(arr); - if (null_sentinels_possible) { - for (int64_t i = 0; i < values.size(); ++i) { - if (traits::isnull(values[i])) { - RETURN_NOT_OK(child_builder->AppendNull()); - } else { - RETURN_NOT_OK(child_builder->Append(values[i])); - } - } + RETURN_NOT_OK(this->builder_->Append()); + if (PyArray_Check(value)) { + RETURN_NOT_OK(AppendNdarray(value)); + } else if (PySequence_Check(value)) { + RETURN_NOT_OK(AppendSequence(value)); } else { - for (int64_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(child_builder->Append(values[i])); - } + return internal::InvalidType( + value, "was not a sequence or recognized null for conversion to list type"); } - return Status::OK(); + return ValidateBuilder(this->type_); } - Status AppendNdarrayItem(PyObject* obj) { - PyArrayObject* arr = reinterpret_cast(obj); + Status AppendSequence(PyObject* value) { + int64_t size = static_cast(PySequence_Size(value)); + RETURN_NOT_OK(this->ValidateSize(this->type_, size)); + return this->value_converter_->Extend(value, size); + } - if (PyArray_NDIM(arr) != 1) { + Status AppendNdarray(PyObject* value) { + PyArrayObject* ndarray = reinterpret_cast(value); + if (PyArray_NDIM(ndarray) != 1) { return Status::Invalid("Can only convert 1-dimensional array values"); } + const int64_t size = PyArray_SIZE(ndarray); + RETURN_NOT_OK(this->ValidateSize(this->type_, size)); - const int64_t value_length = PyArray_SIZE(arr); - - switch (value_type_->id()) { + const auto value_type = this->value_converter_->builder()->type(); + switch (value_type->id()) { LIST_SLOW_CASE(NA) - LIST_FAST_CASE(UINT8, NPY_UINT8, UInt8Type) - LIST_FAST_CASE(INT8, NPY_INT8, Int8Type) - LIST_FAST_CASE(UINT16, NPY_UINT16, UInt16Type) - LIST_FAST_CASE(INT16, NPY_INT16, Int16Type) - LIST_FAST_CASE(UINT32, NPY_UINT32, UInt32Type) - LIST_FAST_CASE(INT32, NPY_INT32, Int32Type) - LIST_FAST_CASE(UINT64, NPY_UINT64, UInt64Type) - LIST_FAST_CASE(INT64, NPY_INT64, Int64Type) + LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8) + LIST_FAST_CASE(INT8, Int8Type, NPY_INT8) + LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16) + LIST_FAST_CASE(INT16, Int16Type, NPY_INT16) + LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32) + LIST_FAST_CASE(INT32, Int32Type, NPY_INT32) + LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64) + LIST_FAST_CASE(INT64, Int64Type, NPY_INT64) + LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16) + LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT) + LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE) + LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME) + LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA) LIST_SLOW_CASE(DATE32) LIST_SLOW_CASE(DATE64) LIST_SLOW_CASE(TIME32) LIST_SLOW_CASE(TIME64) - LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType) - LIST_FAST_CASE(DURATION, NPY_TIMEDELTA, DurationType) - LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType) - LIST_FAST_CASE(FLOAT, NPY_FLOAT, FloatType) - LIST_FAST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) 
LIST_SLOW_CASE(BINARY) LIST_SLOW_CASE(FIXED_SIZE_BINARY) LIST_SLOW_CASE(STRING) case Type::LIST: { - if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) { + if (PyArray_DESCR(ndarray)->type_num != NPY_OBJECT) { return Status::Invalid( - "Can only convert list types from NumPy object " - "array input"); + "Can only convert list types from NumPy object array input"); } - return internal::VisitSequence(obj, [this](PyObject* item, bool*) { - return value_converter_->Append(item); + return internal::VisitSequence(value, [this](PyObject* item, bool*) { + return this->value_converter_->Append(item); }); } default: { - return Status::TypeError("Unknown list item type: ", value_type_->ToString()); + return Status::TypeError("Unknown list item type: ", value_type->ToString()); } } } - Status AppendValue(PyObject* obj) override { - RETURN_NOT_OK(this->typed_builder_->Append()); - if (PyArray_Check(obj)) { - return AppendNdarrayItem(obj); - } - if (!PySequence_Check(obj)) { - return internal::InvalidType(obj, - "was not a sequence or recognized null" - " for conversion to list type"); - } - int64_t list_size = static_cast(PySequence_Size(obj)); - return value_converter_->Extend(obj, list_size); - } - - Status GetResult(std::shared_ptr* out) override { - // TODO: Improved handling of chunked children - if (value_converter_->num_chunks() > 0) { - return Status::Invalid("List child type ", - value_converter_->builder()->type()->ToString(), - " overflowed the capacity of a single chunk"); - } - return SeqConverter::GetResult(out); - } - - protected: - std::shared_ptr value_type_; - std::unique_ptr value_converter_; - const bool from_pandas_; - const bool strict_conversions_; - const bool ignore_timezone_; -}; - -template -class ListConverter : public BaseListConverter { - public: - using BASE = BaseListConverter; - using BASE::BASE; -}; + template + Status AppendNdarrayTyped(PyArrayObject* ndarray) { + // no need to go through the conversion + using NumpyTrait = internal::npy_traits; + using NumpyType = typename NumpyTrait::value_type; + using ValueBuilderType = typename TypeTraits::BuilderType; -template -class FixedSizeListConverter : public BaseListConverter { - public: - using BASE = BaseListConverter; - using BASE::BASE; + const bool null_sentinels_possible = + // Always treat Numpy's NaT as null + NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || + // Observing pandas's null sentinels + (this->options_.from_pandas && NumpyTrait::supports_nulls); - Status Init(ArrayBuilder* builder) override { - RETURN_NOT_OK(BASE::Init(builder)); - list_size_ = checked_pointer_cast(builder->type())->list_size(); - return Status::OK(); - } + auto value_builder = + checked_cast(this->value_converter_->builder().get()); - Status AppendValue(PyObject* obj) override { - // the same as BaseListConverter but with additional length checks - RETURN_NOT_OK(this->typed_builder_->Append()); - if (PyArray_Check(obj)) { - int64_t list_size = static_cast(PyArray_Size(obj)); - if (list_size != list_size_) { - return Status::Invalid("Length of item not correct: expected ", list_size_, - " but got array of size ", list_size); + // TODO(wesm): Vector append when not strided + Ndarray1DIndexer values(ndarray); + if (null_sentinels_possible) { + for (int64_t i = 0; i < values.size(); ++i) { + if (NumpyTrait::isnull(values[i])) { + RETURN_NOT_OK(value_builder->AppendNull()); + } else { + RETURN_NOT_OK(value_builder->Append(values[i])); + } + } + } else { + for (int64_t i = 0; i < values.size(); ++i) { + 
RETURN_NOT_OK(value_builder->Append(values[i])); } - return this->AppendNdarrayItem(obj); - } - if (!PySequence_Check(obj)) { - return internal::InvalidType(obj, - "was not a sequence or recognized null" - " for conversion to list type"); - } - int64_t list_size = static_cast(PySequence_Size(obj)); - if (list_size != list_size_) { - return Status::Invalid("Length of item not correct: expected ", list_size_, - " but got list of size ", list_size); - } - return this->value_converter_->Extend(obj, list_size); - } - - protected: - int64_t list_size_; -}; - -// ---------------------------------------------------------------------- -// Convert maps - -// Define a MapConverter as a ListConverter that uses MapBuilder.value_builder -// to append struct of key/value pairs -template -class MapConverter : public BaseListConverter { - public: - using BASE = BaseListConverter; - - explicit MapConverter(bool from_pandas, bool strict_conversions, bool ignore_timezone) - : BASE(from_pandas, strict_conversions, ignore_timezone), key_builder_(nullptr) {} - - Status Append(PyObject* obj) override { - RETURN_NOT_OK(BASE::Append(obj)); - return VerifyLastStructAppended(); - } - - Status Extend(PyObject* seq, int64_t size) override { - RETURN_NOT_OK(BASE::Extend(seq, size)); - return VerifyLastStructAppended(); - } - - Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) override { - RETURN_NOT_OK(BASE::ExtendMasked(seq, mask, size)); - return VerifyLastStructAppended(); - } - - protected: - Status VerifyLastStructAppended() { - // The struct_builder may not have field_builders initialized in constructor, so - // assign key_builder lazily - if (key_builder_ == nullptr) { - auto struct_builder = - checked_cast(BASE::value_converter_->builder()); - key_builder_ = struct_builder->field_builder(0); - } - if (key_builder_->null_count() > 0) { - return Status::Invalid("Invalid Map: key field can not contain null values"); } return Status::OK(); } - - private: - ArrayBuilder* key_builder_; }; -// ---------------------------------------------------------------------- -// Convert structs - -template -class StructConverter : public TypedConverter { +template +class PyStructArrayConverter : public StructArrayConverter { public: - explicit StructConverter(bool from_pandas, bool strict_conversions, - bool ignore_timezone) - : from_pandas_(from_pandas), - strict_conversions_(strict_conversions), - ignore_timezone_(ignore_timezone) {} - - Status Init(ArrayBuilder* builder) override { - this->builder_ = builder; - this->typed_builder_ = checked_cast(builder); - auto struct_type = checked_pointer_cast(builder->type()); - - num_fields_ = this->typed_builder_->num_fields(); - DCHECK_EQ(num_fields_, struct_type->num_fields()); - - field_name_bytes_list_.reset(PyList_New(num_fields_)); - field_name_unicode_list_.reset(PyList_New(num_fields_)); + using StructArrayConverter::StructArrayConverter; + + Status Init() override { + // Store the field names as a PyObjects for dict matching + num_fields_ = this->type_.num_fields(); + bytes_field_names_.reset(PyList_New(num_fields_)); + unicode_field_names_.reset(PyList_New(num_fields_)); RETURN_IF_PYERROR(); - // Initialize the child converters and field names for (int i = 0; i < num_fields_; i++) { - const std::string& field_name(struct_type->field(i)->name()); - std::shared_ptr field_type(struct_type->field(i)->type()); - - std::unique_ptr value_converter; - RETURN_NOT_OK(GetConverter(field_type, from_pandas_, strict_conversions_, - ignore_timezone_, &value_converter)); - 
RETURN_NOT_OK(value_converter->Init(this->typed_builder_->field_builder(i))); - value_converters_.push_back(std::move(value_converter)); - - // Store the field name as a PyObject, for dict matching - PyObject* bytesobj = - PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); - PyObject* unicodeobj = + const auto& field_name = this->type_.field(i)->name(); + PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); + PyObject* unicode = PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size()); RETURN_IF_PYERROR(); - PyList_SET_ITEM(field_name_bytes_list_.obj(), i, bytesobj); - PyList_SET_ITEM(field_name_unicode_list_.obj(), i, unicodeobj); + PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes); + PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode); } - return Status::OK(); } - Status AppendValue(PyObject* obj) override { - RETURN_NOT_OK(this->typed_builder_->Append()); - // Note heterogeneous sequences are not allowed - if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) { - if (PyDict_Check(obj)) { - source_kind_ = SourceKind::DICTS; - } else if (PyTuple_Check(obj)) { - source_kind_ = SourceKind::TUPLES; + Status InferInputKind(PyObject* value) { + // Infer input object's type, note that heterogeneous sequences are not allowed + if (ARROW_PREDICT_FALSE(input_kind_ == InputKind::UNKNOWN)) { + if (PyDict_Check(value)) { + input_kind_ = InputKind::DICTS; + } else if (PyTuple_Check(value)) { + input_kind_ = InputKind::TUPLES; + } else { + return internal::InvalidType(value, + "was not a dict, tuple, or recognized null value " + "for conversion to struct type"); } } - if (PyDict_Check(obj) && source_kind_ == SourceKind::DICTS) { - return AppendDictItem(obj); - } else if (PyTuple_Check(obj) && source_kind_ == SourceKind::TUPLES) { - return AppendTupleItem(obj); - } else { - return internal::InvalidType(obj, - "was not a dict, tuple, or recognized null value" - " for conversion to struct type"); - } + return Status::OK(); } - // Append a missing item - Status AppendNull() override { return this->typed_builder_->AppendNull(); } - - protected: - Status AppendDictItem(PyObject* obj) { - if (dict_key_kind_ == DictKeyKind::UNICODE) { - return AppendDictItemWithUnicodeKeys(obj); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->type_, this->options_, value)) { + return this->builder_->AppendNull(); } - if (dict_key_kind_ == DictKeyKind::BYTES) { - return AppendDictItemWithBytesKeys(obj); + RETURN_NOT_OK(InferInputKind(value)); + RETURN_NOT_OK(this->builder_->Append()); + return (input_kind_ == InputKind::DICTS) ? AppendDict(value) : AppendTuple(value); + } + + Status AppendTuple(PyObject* tuple) { + if (!PyTuple_Check(tuple)) { + return internal::InvalidType(tuple, "was expecting a tuple"); } - for (int i = 0; i < num_fields_; i++) { - PyObject* nameobj = PyList_GET_ITEM(field_name_unicode_list_.obj(), i); - PyObject* valueobj = PyDict_GetItem(obj, nameobj); - if (valueobj != NULL) { - dict_key_kind_ = DictKeyKind::UNICODE; - return AppendDictItemWithUnicodeKeys(obj); - } - RETURN_IF_PYERROR(); - // Unicode key not present, perhaps bytes key is? 
- nameobj = PyList_GET_ITEM(field_name_bytes_list_.obj(), i); - valueobj = PyDict_GetItem(obj, nameobj); - if (valueobj != NULL) { - dict_key_kind_ = DictKeyKind::BYTES; - return AppendDictItemWithBytesKeys(obj); - } - RETURN_IF_PYERROR(); + if (PyTuple_GET_SIZE(tuple) != num_fields_) { + return Status::Invalid("Tuple size must be equal to number of struct fields"); } - // If we come here, it means all keys are absent for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(value_converters_[i]->Append(Py_None)); + PyObject* value = PyTuple_GET_ITEM(tuple, i); + RETURN_NOT_OK(this->child_converters_[i]->Append(value)); } return Status::OK(); } - Status AppendDictItemWithBytesKeys(PyObject* obj) { - return AppendDictItem(obj, field_name_bytes_list_.obj()); - } - - Status AppendDictItemWithUnicodeKeys(PyObject* obj) { - return AppendDictItem(obj, field_name_unicode_list_.obj()); - } - - Status AppendDictItem(PyObject* obj, PyObject* field_name_list) { - // NOTE we're ignoring any extraneous dict items - for (int i = 0; i < num_fields_; i++) { - PyObject* nameobj = PyList_GET_ITEM(field_name_list, i); // borrowed - PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed - if (valueobj == NULL) { + Status InferDictKeyKind(PyObject* dict) { + if (ARROW_PREDICT_FALSE(dict_key_kind_ == DictKeyKind::UNKNOWN)) { + for (int i = 0; i < num_fields_; i++) { + PyObject* name = PyList_GET_ITEM(unicode_field_names_.obj(), i); + PyObject* value = PyDict_GetItem(dict, name); + if (value != NULL) { + dict_key_kind_ = DictKeyKind::UNICODE; + return Status::OK(); + } + RETURN_IF_PYERROR(); + // Unicode key not present, perhaps bytes key is? + name = PyList_GET_ITEM(bytes_field_names_.obj(), i); + value = PyDict_GetItem(dict, name); + if (value != NULL) { + dict_key_kind_ = DictKeyKind::BYTES; + return Status::OK(); + } RETURN_IF_PYERROR(); } - RETURN_NOT_OK(value_converters_[i]->Append(valueobj ? valueobj : Py_None)); } return Status::OK(); } - Status AppendTupleItem(PyObject* obj) { - if (PyTuple_GET_SIZE(obj) != num_fields_) { - return Status::Invalid("Tuple size must be equal to number of struct fields"); + Status AppendDict(PyObject* dict) { + if (!PyDict_Check(dict)) { + return internal::InvalidType(dict, "was expecting a dict"); } + RETURN_NOT_OK(InferDictKeyKind(dict)); + + if (dict_key_kind_ == DictKeyKind::UNICODE) { + return AppendDict(dict, unicode_field_names_.obj()); + } else if (dict_key_kind_ == DictKeyKind::BYTES) { + return AppendDict(dict, bytes_field_names_.obj()); + } else { + // If we come here, it means all keys are absent + for (int i = 0; i < num_fields_; i++) { + RETURN_NOT_OK(this->child_converters_[i]->Append(Py_None)); + } + return Status::OK(); + } + } + + Status AppendDict(PyObject* dict, PyObject* field_names) { + // NOTE we're ignoring any extraneous dict items for (int i = 0; i < num_fields_; i++) { - PyObject* valueobj = PyTuple_GET_ITEM(obj, i); - RETURN_NOT_OK(value_converters_[i]->Append(valueobj)); + PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed + PyObject* value = PyDict_GetItem(dict, name); // borrowed + if (value == NULL) { + RETURN_IF_PYERROR(); + } + RETURN_NOT_OK(this->child_converters_[i]->Append(value ? 
value : Py_None)); } return Status::OK(); } - std::vector> value_converters_; - OwnedRef field_name_unicode_list_; - OwnedRef field_name_bytes_list_; - int num_fields_; + protected: // Whether we're converting from a sequence of dicts or tuples - enum class SourceKind { UNKNOWN, DICTS, TUPLES } source_kind_ = SourceKind::UNKNOWN; + enum class InputKind { UNKNOWN, DICTS, TUPLES } input_kind_ = InputKind::UNKNOWN; + // Whether the input dictionary keys' type is python bytes or unicode enum class DictKeyKind { UNKNOWN, BYTES, UNICODE } dict_key_kind_ = DictKeyKind::UNKNOWN; - bool from_pandas_; - bool strict_conversions_; - bool ignore_timezone_; -}; - -template -class DecimalConverter : public TypedConverter { - public: - using BASE = TypedConverter; - - Status Init(ArrayBuilder* builder) override { - RETURN_NOT_OK(BASE::Init(builder)); - decimal_type_ = checked_pointer_cast(this->typed_builder_->type()); - return Status::OK(); - } - - Status AppendValue(PyObject* obj) override { - Decimal128 value; - RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *decimal_type_, &value)); - return this->typed_builder_->Append(value); - } - - private: - std::shared_ptr decimal_type_; + // Store the field names as a PyObjects for dict matching + OwnedRef bytes_field_names_; + OwnedRef unicode_field_names_; + // Store the number of fields for later reuse + int num_fields_; }; -#define PRIMITIVE(TYPE_ENUM, TYPE) \ - case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new PrimitiveConverter); \ - break; - -#define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new TYPE_CLASS); \ - break; - -// Dynamic constructor for sequence converters -template -Status GetConverterFlat(const std::shared_ptr& type, bool strict_conversions, - bool ignore_timezone, std::unique_ptr* out) { - switch (type->id()) { - SIMPLE_CONVERTER_CASE(NA, NullConverter); - PRIMITIVE(BOOL, BooleanType); - PRIMITIVE(INT8, Int8Type); - PRIMITIVE(INT16, Int16Type); - PRIMITIVE(INT32, Int32Type); - PRIMITIVE(INT64, Int64Type); - PRIMITIVE(UINT8, UInt8Type); - PRIMITIVE(UINT16, UInt16Type); - PRIMITIVE(UINT32, UInt32Type); - PRIMITIVE(UINT64, UInt64Type); - PRIMITIVE(HALF_FLOAT, HalfFloatType); - PRIMITIVE(FLOAT, FloatType); - PRIMITIVE(DOUBLE, DoubleType); - PRIMITIVE(DATE32, Date32Type); - PRIMITIVE(DATE64, Date64Type); - SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); - case Type::BINARY: - *out = - std::unique_ptr(new BinaryConverter()); - break; - case Type::LARGE_BINARY: - *out = std::unique_ptr( - new BinaryConverter()); - break; - case Type::FIXED_SIZE_BINARY: - *out = std::unique_ptr(new FixedSizeBinaryConverter( - checked_cast(*type).byte_width())); - break; - case Type::STRING: - if (strict_conversions) { - *out = std::unique_ptr( - new StringConverter()); - } else { - *out = std::unique_ptr( - new StringConverter()); - } - break; - case Type::LARGE_STRING: - if (strict_conversions) { - *out = std::unique_ptr( - new StringConverter()); - } else { - *out = std::unique_ptr( - new StringConverter()); - } - break; - case Type::TIME32: { - auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TimeConverter(unit, ignore_timezone)); - break; - } - case Type::TIME64: { - auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TimeConverter(unit, ignore_timezone)); - break; - } - case Type::TIMESTAMP: { - auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TemporalConverter(unit, ignore_timezone)); - break; - } - case Type::DURATION: { - auto 
unit = checked_cast(*type).unit(); - *out = - std::unique_ptr(new TemporalConverter( - unit, /*ignore_timezone=*/false)); - break; - } - default: - return Status::NotImplemented("Sequence converter for type ", type->ToString(), - " not implemented"); - } - return Status::OK(); -} - -Status GetConverter(const std::shared_ptr& type, bool from_pandas, - bool strict_conversions, bool ignore_timezone, - std::unique_ptr* out) { - if (from_pandas) { - // ARROW-842: If pandas is not installed then null checks will be less - // comprehensive, but that is okay. - internal::InitPandasStaticData(); - } - - switch (type->id()) { - case Type::LIST: - if (from_pandas) { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::LARGE_LIST: - if (from_pandas) { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::MAP: - if (from_pandas) { - *out = - std::unique_ptr(new MapConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr(new MapConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::FIXED_SIZE_LIST: - if (from_pandas) { - *out = std::unique_ptr( - new FixedSizeListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr( - new FixedSizeListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::STRUCT: - if (from_pandas) { - *out = std::unique_ptr( - new StructConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr(new StructConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - default: - break; - } - - if (from_pandas) { - RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, - ignore_timezone, out)); - } else { - RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, - ignore_timezone, out)); - } - return Status::OK(); -} - -// ---------------------------------------------------------------------- +// TODO(kszucs): find a better name +using PyArrayConverterBuilder = + ArrayConverterBuilder; // Convert *obj* to a sequence if necessary // Fill *size* to its length. 
If >= 0 on entry, *size* is an upper size @@ -1373,28 +771,19 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, if (options.type == nullptr) { RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); - if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { - const auto& ts_type = checked_cast(*real_type); - real_type = timestamp(ts_type.unit()); - } + // TODO(kszucs): remove this + // if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { + // const auto& ts_type = checked_cast(*real_type); + // real_type = timestamp(ts_type.unit()); + // } } else { real_type = options.type; strict_conversions = true; } DCHECK_GE(size, 0); - // Create the sequence converter, initialize with the builder - std::unique_ptr converter; - RETURN_NOT_OK(GetConverter(real_type, options.from_pandas, strict_conversions, - options.ignore_timezone, &converter)); - - // Create ArrayBuilder for type, then pass into the SeqConverter - // instance. The reason this is created here rather than in GetConverter is - // because of nested types (child SeqConverter objects need the child - // builders created by MakeBuilder) - std::unique_ptr type_builder; - RETURN_NOT_OK(MakeBuilder(options.pool, real_type, &type_builder)); - RETURN_NOT_OK(converter->Init(type_builder.get())); + ARROW_ASSIGN_OR_RAISE(auto converter, + PyArrayConverterBuilder::Make(real_type, options.pool, options)); // Convert values if (mask != nullptr && mask != Py_None) { @@ -1404,7 +793,11 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, } // Retrieve result. Conversion may yield one or more array values - return converter->GetResult(out); + // return converter->GetResult(out); + ARROW_ASSIGN_OR_RAISE(auto result, converter->Finish()); + ArrayVector chunks{result}; + *out = std::make_shared(chunks, real_type); + return Status::OK(); } Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 5108e752e8f..ac40b4f35e5 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -81,4 +81,5 @@ Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, std::shared_ptr* out); } // namespace py + } // namespace arrow diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index f95edb0c896..36719ffa278 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -544,6 +544,9 @@ using is_string_like_type = template using enable_if_string_like = enable_if_t::value, R>; +template +using enable_if_same = enable_if_t::value, R>; + // Note that this also includes DecimalType template using is_fixed_size_binary_type = std::is_base_of; @@ -574,6 +577,9 @@ using is_nested_type = std::is_base_of; template using enable_if_nested = enable_if_t::value, R>; +template +using enable_if_not_nested = enable_if_t::value, R>; + template using is_var_length_list_type = std::integral_constant::value || @@ -596,6 +602,15 @@ using is_fixed_size_list_type = std::is_same; template using enable_if_fixed_size_list = enable_if_t::value, R>; +template +using is_list_type = + std::integral_constant::value || + std::is_same::value || + std::is_same::valuae>; + +template +using enable_if_list_type = enable_if_t::value, R>; + template using is_list_like_type = std::integral_constant::value || @@ -654,6 +669,18 @@ using is_interval_type = std::is_base_of; template using enable_if_interval = 
enable_if_t::value, R>; +template +using is_dictionary_type = std::is_base_of; + +template +using enable_if_dictionary = enable_if_t::value, R>; + +template +using is_extension_type = std::is_base_of; + +template +using enable_if_extension = enable_if_t::value, R>; + // Attribute differentiation template diff --git a/cpp/src/arrow/util/converter.cc b/cpp/src/arrow/util/converter.cc new file mode 100644 index 00000000000..ef53ba9e6e0 --- /dev/null +++ b/cpp/src/arrow/util/converter.cc @@ -0,0 +1 @@ +// Move the implementation here diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h new file mode 100644 index 00000000000..9e2fa909af9 --- /dev/null +++ b/cpp/src/arrow/util/converter.h @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/int_util_internal.h" +#include "arrow/util/logging.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template +class ARROW_EXPORT ArrayConverter { + public: + using InputType = I; + using OptionsType = O; + + ArrayConverter(const std::shared_ptr& type, + std::shared_ptr builder, O options) + : sp_type_(type), sp_builder_(builder), options_(options) {} + + virtual ~ArrayConverter() = default; + std::shared_ptr builder() { return sp_builder_; } + std::shared_ptr type() { return sp_type_; } + O options() { return options_; } + + virtual Status Init() { return Status::OK(); }; + virtual Status Reserve(int64_t additional_capacity) = 0; + + virtual Status Append(I value) = 0; + virtual Status AppendNull() = 0; + + virtual Status Extend(I seq, int64_t size) = 0; + + virtual Result> Finish() = 0; + + // virtual Result> ToArray(I value); + // virtual Result> ToChunkedArray(I value); + + protected: + const std::shared_ptr sp_type_; + std::shared_ptr sp_builder_; + O options_; +}; + +template +class ARROW_EXPORT TypedArrayConverter : public AC { + public: + using ArrayConverter = AC; + using BuilderType = typename TypeTraits::BuilderType; + + TypedArrayConverter(const std::shared_ptr& type, + std::shared_ptr builder, + typename AC::OptionsType options) + : AC(type, builder, options), + type_(checked_cast(*type)), + builder_(checked_cast(builder.get())) {} + + Status Reserve(int64_t additional_capacity) override { + return this->builder_->Reserve(additional_capacity); + } + + Status AppendNull() override { return this->builder_->AppendNull(); } + + 
Result> Finish() override { + std::shared_ptr out; + RETURN_NOT_OK(builder_->Finish(&out)); + return out; + } + + protected: + const T& type_; + BuilderType* builder_; +}; + +template +class ARROW_EXPORT ListArrayConverter : public TypedArrayConverter { + public: + ListArrayConverter(const std::shared_ptr& type, + std::shared_ptr builder, + std::shared_ptr value_converter, + typename AC::OptionsType options) + : TypedArrayConverter(type, builder, options), + value_converter_(std::move(value_converter)) {} + + protected: + std::shared_ptr value_converter_; +}; + +template +class ARROW_EXPORT StructArrayConverter : public TypedArrayConverter { + public: + StructArrayConverter(const std::shared_ptr& type, + std::shared_ptr builder, + std::vector> child_converters, + typename AC::OptionsType options) + : TypedArrayConverter(type, builder, options), + child_converters_(std::move(child_converters)) {} + + protected: + std::vector> child_converters_; +}; + +template class PAC, + template class LAC, template class SAC> +struct ArrayConverterBuilder { + using Self = ArrayConverterBuilder; + + Status Visit(const NullType& t) { + // TODO: merge with the primitive c_type variant below, requires a NullType ctor which + // accepts a type instance + using T = NullType; + using BuilderType = typename TypeTraits::BuilderType; + using PrimitiveConverter = PAC; + static_assert(std::is_same::value, + ""); + + auto builder = std::make_shared(pool); + out->reset(new PrimitiveConverter(type, std::move(builder), options)); + return Status::OK(); + } + + template + enable_if_t::value && !is_interval_type::value && + !is_dictionary_type::value && !is_extension_type::value, + Status> + Visit(const T& t) { + using BuilderType = typename TypeTraits::BuilderType; + using PrimitiveConverter = PAC; + static_assert(std::is_same::value, + ""); + + auto builder = std::make_shared(type, pool); + out->reset(new PrimitiveConverter(type, std::move(builder), options)); + return Status::OK(); + } + + template + enable_if_t::value && !std::is_same::value, Status> + Visit(const T& t) { + using BuilderType = typename TypeTraits::BuilderType; + using ListConverter = LAC; + static_assert(std::is_same::value, ""); + + ARROW_ASSIGN_OR_RAISE(auto child_converter, + (Self::Make(t.value_type(), pool, options))); + auto builder = std::make_shared(pool, child_converter->builder(), type); + out->reset( + new ListConverter(type, std::move(builder), std::move(child_converter), options)); + return Status::OK(); + } + + Status Visit(const MapType& t) { + using T = MapType; + using ListConverter = LAC; + static_assert(std::is_same::value, ""); + + // TODO(kszucs): seems like builders not respect field nullability + std::vector> struct_fields{t.key_field(), t.item_field()}; + auto struct_type = std::make_shared(struct_fields); + ARROW_ASSIGN_OR_RAISE(auto struct_converter, Self::Make(struct_type, pool, options)); + + auto struct_builder = struct_converter->builder(); + auto key_builder = struct_builder->child_builder(0); + auto item_builder = struct_builder->child_builder(1); + auto builder = std::make_shared(pool, key_builder, item_builder, type); + + out->reset(new ListConverter(type, std::move(builder), std::move(struct_converter), + options)); + return Status::OK(); + } + + Status Visit(const StructType& t) { + using T = StructType; + using StructConverter = SAC; + static_assert(std::is_same::value, ""); + + std::shared_ptr child_converter; + std::vector> child_converters; + std::vector> child_builders; + + for (const auto& field : t.fields()) 
{ + ARROW_ASSIGN_OR_RAISE(child_converter, Self::Make(field->type(), pool, options)); + + // TODO: use move + child_converters.emplace_back(child_converter); + child_builders.emplace_back(child_converter->builder()); + } + + auto builder = std::make_shared(type, pool, child_builders); + out->reset(new StructConverter(type, std::move(builder), std::move(child_converters), + options)); + return Status::OK(); + } + + Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } + + static Result> Make(std::shared_ptr type, + MemoryPool* pool, O options) { + std::shared_ptr out; + Self visitor = {type, pool, options, &out}; + RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); + RETURN_NOT_OK(out->Init()); + return out; + } + + const std::shared_ptr& type; + MemoryPool* pool; + O options; + std::shared_ptr* out; +}; + +} // namespace arrow From 19c07f16712833fcca07744ea71432d4a9bb31a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 1 Sep 2020 17:49:48 +0200 Subject: [PATCH 02/80] fallback to bytes in case of mixed unicode and bytes data --- cpp/src/arrow/python/python_test.cc | 10 +-- cpp/src/arrow/python/python_to_arrow.cc | 84 +++++++++++++++++++------ cpp/src/arrow/python/python_to_arrow.h | 6 +- 3 files changed, 72 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index b21c16af50a..e0b22915af0 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -330,7 +330,7 @@ TEST(BuiltinConversionTest, TestMixedTypeFails) { ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); std::shared_ptr arr; - ASSERT_RAISES(TypeError, ConvertPySequence(list, {}, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {}, &arr)); } TEST_F(DecimalTest, FromPythonDecimalRescaleNotTruncateable) { @@ -424,10 +424,10 @@ TEST_F(DecimalTest, TestNoneAndNaN) { std::shared_ptr arr, arr_from_pandas; PyConversionOptions options; - ASSERT_RAISES(TypeError, ConvertPySequence(list, options, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options, &arr)); options.from_pandas = true; - ASSERT_OK(ConvertPySequence(list, options, &arr_from_pandas)); + ASSERT_OK(ConvertPySequence(list, nullptr, options, &arr_from_pandas)); auto c0 = arr_from_pandas->chunk(0); ASSERT_TRUE(c0->IsValid(0)); ASSERT_TRUE(c0->IsNull(1)); @@ -452,7 +452,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScale) { } std::shared_ptr arr; - ASSERT_OK(ConvertPySequence(list, {}, &arr)); + ASSERT_OK(ConvertPySequence(list, nullptr, {}, &arr)); const auto& type = checked_cast(*arr->type()); int32_t expected_precision = 9; @@ -477,7 +477,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) { ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); std::shared_ptr arr; - ASSERT_OK(ConvertPySequence(list, {}, &arr)); + ASSERT_OK(ConvertPySequence(list, nullptr, {}, &arr)); const auto& type = checked_cast(*arr->type()); ASSERT_EQ(3, type.precision()); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a1df54857d1..377c8f89341 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -333,18 +333,30 @@ struct ValueConverter> { } } - static Result Convert(const StringType& type, const O& options, - I obj) { - // strict conversion, force output to be unicode / utf8 and validate that - // any binary values are utf8 + template + static enable_if_string_like>> Convert( + const T& type, const O& options, I 
obj) { bool is_utf8 = false; PyBytesView view; - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); - // TODO(kszucs): pass strict conversion in options - if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); + if (options.strict) { + // Strict conversion, force output to be unicode / utf8 and validate that + // any binary values are utf8 + RETURN_NOT_OK(view.FromString(obj, &is_utf8)); + if (!is_utf8) { + return internal::InvalidValue(obj, "was not a utf8 string"); + } + } else { + // Non-strict conversion; keep track of whether values are unicode or bytes + if (PyUnicode_Check(obj)) { + is_utf8 = true; + RETURN_NOT_OK(view.FromUnicode(obj)); + } else { + // If not unicode or bytes, FromBinary will error + is_utf8 = false; + RETURN_NOT_OK(view.FromBinary(obj)); + } } - return util::string_view(view.bytes, view.size); + return std::make_pair(util::string_view(view.bytes, view.size), is_utf8); } static Result Convert(const DataType&, const O&, I obj) { @@ -420,6 +432,44 @@ class PyPrimitiveArrayConverter< } }; +// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, otherwise we +// allow but return results as BinaryArray +template +class PyPrimitiveArrayConverter> + : public TypedArrayConverter { + public: + using TypedArrayConverter::TypedArrayConverter; + + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->type_, this->options_, value)) { + return this->builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto pair, + PyValue::Convert(this->type_, this->options_, value)); + if (!pair.second) { + // observed binary value + observed_binary_ = true; + } + return this->builder_->Append(pair.first); + } + } + + Result> Finish() override { + ARROW_ASSIGN_OR_RAISE(auto array, + (TypedArrayConverter::Finish())); + if (observed_binary_) { + // If we saw any non-unicode, cast results to BinaryArray + auto binary_type = TypeTraits::type_singleton(); + return array->View(binary_type); + } else { + return array; + } + } + + protected: + bool observed_binary_ = false; +}; + // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path #define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ @@ -750,24 +800,25 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } -Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, - const PyConversionOptions& options, +Status ConvertPySequence(PyObject* obj, PyObject* mask, const PyConversionOptions& opts, std::shared_ptr* out) { PyAcquireGIL lock; PyObject* seq; OwnedRef tmp_seq_nanny; + PyConversionOptions options = opts; // copy options struct since we modify it below std::shared_ptr real_type; int64_t size = options.size; - RETURN_NOT_OK(ConvertToSequenceAndInferSize(sequence_source, &seq, &size)); + RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); tmp_seq_nanny.reset(seq); // In some cases, type inference may be "loose", like strings. If the user // passed pa.string(), then we will error if we encounter any non-UTF8 // value. 
If not, then we will allow the result to be a BinaryArray - bool strict_conversions = false; + auto copied_options = options; + options.strict = false; if (options.type == nullptr) { RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); @@ -778,7 +829,7 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, // } } else { real_type = options.type; - strict_conversions = true; + options.strict = true; } DCHECK_GE(size, 0); @@ -800,10 +851,5 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, return Status::OK(); } -Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, - std::shared_ptr* out) { - return ConvertPySequence(obj, nullptr, options, out); -} - } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index ac40b4f35e5..83abee3b2b7 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -60,6 +60,8 @@ struct PyConversionOptions { /// timezone bugs (see ARROW-9528). Should be removed /// after Arrow 2.0 release. bool ignore_timezone = false; + + bool strict = false; }; /// \brief Convert sequence (list, generator, NumPy array with dtype object) of @@ -76,10 +78,6 @@ Status ConvertPySequence(PyObject* obj, PyObject* mask, const PyConversionOptions& options, std::shared_ptr* out); -ARROW_PYTHON_EXPORT -Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, - std::shared_ptr* out); - } // namespace py } // namespace arrow From ccf7b9ae8eac63e0a1b22dcc2086a54f58a1e823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 1 Sep 2020 17:53:42 +0200 Subject: [PATCH 03/80] fix error type --- cpp/src/arrow/python/python_to_arrow.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 377c8f89341..d180d52a6da 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -267,8 +267,8 @@ struct ValueConverter> { if (!numpy_type->Equals(type)) { // TODO(kszucs): the message should highlight the received numpy dtype // TODO(kszucs): it also validates the unit, so add the unit to the error message - return Status::Invalid("Expected np.datetime64 but got: ", - numpy_type->ToString()); + return Status::NotImplemented("Expected np.datetime64 but got: ", + numpy_type->ToString()); } return reinterpret_cast(obj)->obval; } else { @@ -304,8 +304,8 @@ struct ValueConverter> { if (!numpy_type->Equals(type)) { // TODO(kszucs): the message should highlight the received numpy dtype // TODO(kszucs): it also validates the unit, so add the unit to the error message - return Status::Invalid("Expected np.timedelta64 but got: ", - numpy_type->ToString()); + return Status::NotImplemented("Expected np.timedelta64 but got: ", + numpy_type->ToString()); } return reinterpret_cast(obj)->obval; } else { From 7e86eba7cec880fca0c07751d3cae4bcc4c6ee53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 2 Sep 2020 12:11:34 +0200 Subject: [PATCH 04/80] decrypt --- cpp/src/arrow/array/builder_base.h | 2 +- cpp/src/arrow/util/converter.h | 118 +++++++++++++++-------------- 2 files changed, 64 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 19cbc0a82c1..22dea2ee858 100644 --- a/cpp/src/arrow/array/builder_base.h +++ 
b/cpp/src/arrow/array/builder_base.h @@ -56,7 +56,7 @@ class ARROW_EXPORT ArrayBuilder { /// skip shared pointers and just return a raw pointer ArrayBuilder* child(int i) { return children_[i].get(); } - std::shared_ptr child_builder(int i) const { return children_[i]; } + const std::shared_ptr& child_builder(int i) const { return children_[i]; } int num_children() const { return static_cast(children_.size()); } diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 9e2fa909af9..0842b294b32 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -44,28 +44,28 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -template +template class ARROW_EXPORT ArrayConverter { public: - using InputType = I; - using OptionsType = O; + using InputType = Input; + using OptionsType = Options; ArrayConverter(const std::shared_ptr& type, - std::shared_ptr builder, O options) + std::shared_ptr builder, Options options) : sp_type_(type), sp_builder_(builder), options_(options) {} virtual ~ArrayConverter() = default; - std::shared_ptr builder() { return sp_builder_; } - std::shared_ptr type() { return sp_type_; } - O options() { return options_; } + const std::shared_ptr& builder() const { return sp_builder_; } + const std::shared_ptr& type() const { return sp_type_; } + Options options() const { return options_; } virtual Status Init() { return Status::OK(); }; virtual Status Reserve(int64_t additional_capacity) = 0; - virtual Status Append(I value) = 0; + virtual Status Append(Input value) = 0; virtual Status AppendNull() = 0; - virtual Status Extend(I seq, int64_t size) = 0; + virtual Status Extend(Input seq, int64_t size) = 0; virtual Result> Finish() = 0; @@ -75,19 +75,19 @@ class ARROW_EXPORT ArrayConverter { protected: const std::shared_ptr sp_type_; std::shared_ptr sp_builder_; - O options_; + Options options_; }; -template -class ARROW_EXPORT TypedArrayConverter : public AC { +template +class ARROW_EXPORT TypedArrayConverter : public ArrayConverter { public: - using ArrayConverter = AC; + using ArrayConverterType = ArrayConverter; using BuilderType = typename TypeTraits::BuilderType; TypedArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - typename AC::OptionsType options) - : AC(type, builder, options), + typename ArrayConverter::OptionsType options) + : ArrayConverter(type, builder, options), type_(checked_cast(*type)), builder_(checked_cast(builder.get())) {} @@ -108,50 +108,53 @@ class ARROW_EXPORT TypedArrayConverter : public AC { BuilderType* builder_; }; -template -class ARROW_EXPORT ListArrayConverter : public TypedArrayConverter { +template +class ARROW_EXPORT ListArrayConverter : public TypedArrayConverter { public: ListArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - std::shared_ptr value_converter, - typename AC::OptionsType options) - : TypedArrayConverter(type, builder, options), + std::shared_ptr value_converter, + typename ArrayConverter::OptionsType options) + : TypedArrayConverter(type, builder, options), value_converter_(std::move(value_converter)) {} protected: - std::shared_ptr value_converter_; + std::shared_ptr value_converter_; }; -template -class ARROW_EXPORT StructArrayConverter : public TypedArrayConverter { +template +class ARROW_EXPORT StructArrayConverter : public TypedArrayConverter { public: StructArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - std::vector> child_converters, - typename AC::OptionsType options) - : 
TypedArrayConverter(type, builder, options), + std::vector> child_converters, + typename ArrayConverter::OptionsType options) + : TypedArrayConverter(type, builder, options), child_converters_(std::move(child_converters)) {} protected: - std::vector> child_converters_; + std::vector> child_converters_; }; -template class PAC, - template class LAC, template class SAC> +template class PrimitiveArrayConverter, + template class ListArrayConverter, + template class StructArrayConverter> struct ArrayConverterBuilder { - using Self = ArrayConverterBuilder; + using Self = ArrayConverterBuilder; Status Visit(const NullType& t) { // TODO: merge with the primitive c_type variant below, requires a NullType ctor which // accepts a type instance - using T = NullType; - using BuilderType = typename TypeTraits::BuilderType; - using PrimitiveConverter = PAC; - static_assert(std::is_same::value, - ""); + using BuilderType = typename TypeTraits::BuilderType; + using NullConverter = PrimitiveArrayConverter; + static_assert( + std::is_same::value, + ""); auto builder = std::make_shared(pool); - out->reset(new PrimitiveConverter(type, std::move(builder), options)); + out->reset(new NullConverter(type, std::move(builder), options)); return Status::OK(); } @@ -161,8 +164,9 @@ struct ArrayConverterBuilder { Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using PrimitiveConverter = PAC; - static_assert(std::is_same::value, + using PrimitiveConverter = PrimitiveArrayConverter; + static_assert(std::is_same::value, ""); auto builder = std::make_shared(type, pool); @@ -174,8 +178,10 @@ struct ArrayConverterBuilder { enable_if_t::value && !std::is_same::value, Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ListConverter = LAC; - static_assert(std::is_same::value, ""); + using ListConverter = ListArrayConverter; + static_assert( + std::is_same::value, + ""); ARROW_ASSIGN_OR_RAISE(auto child_converter, (Self::Make(t.value_type(), pool, options))); @@ -186,9 +192,10 @@ struct ArrayConverterBuilder { } Status Visit(const MapType& t) { - using T = MapType; - using ListConverter = LAC; - static_assert(std::is_same::value, ""); + using MapConverter = ListArrayConverter; + static_assert( + std::is_same::value, + ""); // TODO(kszucs): seems like builders not respect field nullability std::vector> struct_fields{t.key_field(), t.item_field()}; @@ -200,18 +207,19 @@ struct ArrayConverterBuilder { auto item_builder = struct_builder->child_builder(1); auto builder = std::make_shared(pool, key_builder, item_builder, type); - out->reset(new ListConverter(type, std::move(builder), std::move(struct_converter), - options)); + out->reset( + new MapConverter(type, std::move(builder), std::move(struct_converter), options)); return Status::OK(); } Status Visit(const StructType& t) { - using T = StructType; - using StructConverter = SAC; - static_assert(std::is_same::value, ""); + using StructConverter = StructArrayConverter; + static_assert( + std::is_same::value, + ""); - std::shared_ptr child_converter; - std::vector> child_converters; + std::shared_ptr child_converter; + std::vector> child_converters; std::vector> child_builders; for (const auto& field : t.fields()) { @@ -230,9 +238,9 @@ struct ArrayConverterBuilder { Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } - static Result> Make(std::shared_ptr type, - MemoryPool* pool, O options) { - std::shared_ptr out; + static Result> Make(std::shared_ptr type, + MemoryPool* pool, Options 
options) { + std::shared_ptr out; Self visitor = {type, pool, options, &out}; RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); RETURN_NOT_OK(out->Init()); @@ -241,8 +249,8 @@ struct ArrayConverterBuilder { const std::shared_ptr& type; MemoryPool* pool; - O options; - std::shared_ptr* out; + Options options; + std::shared_ptr* out; }; } // namespace arrow From eb8a7859be22b2e0238bf2e422d17ee66d8c7f36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 2 Sep 2020 12:12:55 +0200 Subject: [PATCH 05/80] remove converter.cc --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/util/converter.cc | 1 - 2 files changed, 2 deletions(-) delete mode 100644 cpp/src/arrow/util/converter.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a0eb5dc686f..f40fa3798b4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -182,7 +182,6 @@ set(ARROW_SRCS util/bitmap_ops.cc util/bpacking.cc util/compression.cc - util/converter.cc util/cpu_info.cc util/decimal.cc util/delimiting.cc diff --git a/cpp/src/arrow/util/converter.cc b/cpp/src/arrow/util/converter.cc deleted file mode 100644 index ef53ba9e6e0..00000000000 --- a/cpp/src/arrow/util/converter.cc +++ /dev/null @@ -1 +0,0 @@ -// Move the implementation here From f2b519e434c226a75d2e9ebb49610e528b5ffbb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 2 Sep 2020 14:49:28 +0200 Subject: [PATCH 06/80] remove unused headers --- cpp/src/arrow/array/builder_base.cc | 6 ++++++ cpp/src/arrow/array/builder_base.h | 7 +++++++ cpp/src/arrow/util/converter.h | 28 +++++++--------------------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 6f015dda3e1..b92cc285894 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -99,6 +99,12 @@ Status ArrayBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } +Result> ArrayBuilder::Finish() { + std::shared_ptr out; + RETURN_NOT_OK(Finish(&out)); + return out; +} + void ArrayBuilder::Reset() { capacity_ = length_ = null_count_ = 0; null_bitmap_builder_.Reset(); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 22dea2ee858..d73681756ba 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -120,6 +120,13 @@ class ARROW_EXPORT ArrayBuilder { /// \return Status Status Finish(std::shared_ptr* out); + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \return The finalized Array object + Result> Finish(); + /// \brief Return the type of the built Array virtual std::shared_ptr type() const = 0; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 0842b294b32..b804a947ef7 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -15,13 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-#include - -#include -#include -#include -#include -#include #include #include #include @@ -33,9 +26,6 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/decimal.h" -#include "arrow/util/int_util_internal.h" -#include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -45,7 +35,7 @@ using internal::checked_cast; using internal::checked_pointer_cast; template -class ARROW_EXPORT ArrayConverter { +class ArrayConverter { public: using InputType = Input; using OptionsType = Options; @@ -79,7 +69,7 @@ class ARROW_EXPORT ArrayConverter { }; template -class ARROW_EXPORT TypedArrayConverter : public ArrayConverter { +class TypedArrayConverter : public ArrayConverter { public: using ArrayConverterType = ArrayConverter; using BuilderType = typename TypeTraits::BuilderType; @@ -97,11 +87,7 @@ class ARROW_EXPORT TypedArrayConverter : public ArrayConverter { Status AppendNull() override { return this->builder_->AppendNull(); } - Result> Finish() override { - std::shared_ptr out; - RETURN_NOT_OK(builder_->Finish(&out)); - return out; - } + Result> Finish() override { return builder_->Finish(); }; protected: const T& type_; @@ -109,7 +95,7 @@ class ARROW_EXPORT TypedArrayConverter : public ArrayConverter { }; template -class ARROW_EXPORT ListArrayConverter : public TypedArrayConverter { +class ListArrayConverter : public TypedArrayConverter { public: ListArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, @@ -123,7 +109,7 @@ class ARROW_EXPORT ListArrayConverter : public TypedArrayConverter -class ARROW_EXPORT StructArrayConverter : public TypedArrayConverter { +class StructArrayConverter : public TypedArrayConverter { public: StructArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, @@ -242,8 +228,8 @@ struct ArrayConverterBuilder { MemoryPool* pool, Options options) { std::shared_ptr out; Self visitor = {type, pool, options, &out}; - RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); - RETURN_NOT_OK(out->Init()); + ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); + ARROW_RETURN_NOT_OK(out->Init()); return out; } From 75bf699f8289588ea7708f99d87b83a0b0ef8600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 2 Sep 2020 17:07:52 +0200 Subject: [PATCH 07/80] dictionary support --- cpp/src/arrow/python/python_to_arrow.cc | 60 +++++++++++--- cpp/src/arrow/util/converter.h | 74 +++++++++++++++--- python/pyarrow/tests/test_convert_builtin.py | 82 ++++++++++++++++++++ 3 files changed, 196 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index d180d52a6da..e2b1b4bd597 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -359,8 +359,8 @@ struct ValueConverter> { return std::make_pair(util::string_view(view.bytes, view.size), is_utf8); } - static Result Convert(const DataType&, const O&, I obj) { - return Status::NotImplemented(""); + static Result Convert(const DataType& type, const O&, I obj) { + return Status::NotImplemented("PyValue::Convert is not implemented for type ", type); } }; @@ -395,9 +395,9 @@ class PyArrayConverter : public ArrayConverter { }; template -class PyPrimitiveArrayConverter : public TypedArrayConverter { +class PyPrimitiveArrayConverter : public PrimitiveArrayConverter { public: - using TypedArrayConverter::TypedArrayConverter; + using PrimitiveArrayConverter::PrimitiveArrayConverter; Status Append(PyObject* value) 
override { if (PyValue::IsNull(this->type_, this->options_, value)) { @@ -413,9 +413,9 @@ class PyPrimitiveArrayConverter : public TypedArrayConverter class PyPrimitiveArrayConverter< T, enable_if_t::value || is_duration_type::value>> - : public TypedArrayConverter { + : public PrimitiveArrayConverter { public: - using TypedArrayConverter::TypedArrayConverter; + using PrimitiveArrayConverter::PrimitiveArrayConverter; Status Append(PyObject* value) override { if (PyValue::IsNull(this->type_, this->options_, value)) { @@ -436,9 +436,9 @@ class PyPrimitiveArrayConverter< // allow but return results as BinaryArray template class PyPrimitiveArrayConverter> - : public TypedArrayConverter { + : public PrimitiveArrayConverter { public: - using TypedArrayConverter::TypedArrayConverter; + using PrimitiveArrayConverter::PrimitiveArrayConverter; Status Append(PyObject* value) override { if (PyValue::IsNull(this->type_, this->options_, value)) { @@ -470,6 +470,46 @@ class PyPrimitiveArrayConverter> bool observed_binary_ = false; }; +template +class PyDictionaryArrayConverter : public DictionaryArrayConverter { + public: + using DictionaryArrayConverter::DictionaryArrayConverter; + + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->type_, this->options_, value)) { + return this->builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->builder_->Append(converted); + } + } +}; + +template +class PyDictionaryArrayConverter> + : public DictionaryArrayConverter { + public: + using DictionaryArrayConverter::DictionaryArrayConverter; + + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->value_type_, this->options_, value)) { + return this->builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto pair, + PyValue::Convert(this->value_type_, this->options_, value)); + if (!pair.second) { + // observed binary value + observed_binary_ = true; + } + return this->builder_->Append(pair.first); + } + } + + protected: + bool observed_binary_ = false; +}; + // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path #define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ @@ -754,8 +794,8 @@ class PyStructArrayConverter : public StructArrayConverter // TODO(kszucs): find a better name using PyArrayConverterBuilder = ArrayConverterBuilder; + PyPrimitiveArrayConverter, PyDictionaryArrayConverter, + PyListArrayConverter, PyStructArrayConverter>; // Convert *obj* to a sequence if necessary // Fill *size* to its length. 
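For orientation, a minimal sketch of the intended call pattern for the converter stack defined above (illustrative only; it assumes a resolved real_type, a MemoryPool* pool, conversion options, and a Python sequence seq of known size, roughly the pieces ConvertPySequence wires together):

    ARROW_ASSIGN_OR_RAISE(auto converter,
                          PyArrayConverterBuilder::Make(real_type, pool, options));
    RETURN_NOT_OK(converter->Reserve(size));
    RETURN_NOT_OK(converter->Extend(seq, size));
    ARROW_ASSIGN_OR_RAISE(auto array, converter->Finish());
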
If >= 0 on entry, *size* is an upper size diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index b804a947ef7..ff6749e6857 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -49,30 +49,24 @@ class ArrayConverter { const std::shared_ptr& type() const { return sp_type_; } Options options() const { return options_; } - virtual Status Init() { return Status::OK(); }; + virtual Status Init() { return Status::OK(); } virtual Status Reserve(int64_t additional_capacity) = 0; - virtual Status Append(Input value) = 0; virtual Status AppendNull() = 0; - virtual Status Extend(Input seq, int64_t size) = 0; - virtual Result> Finish() = 0; - // virtual Result> ToArray(I value); - // virtual Result> ToChunkedArray(I value); - protected: const std::shared_ptr sp_type_; std::shared_ptr sp_builder_; Options options_; }; -template +template ::BuilderType> class TypedArrayConverter : public ArrayConverter { public: using ArrayConverterType = ArrayConverter; - using BuilderType = typename TypeTraits::BuilderType; TypedArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, @@ -94,6 +88,29 @@ class TypedArrayConverter : public ArrayConverter { BuilderType* builder_; }; +// mostly for convenience +template +class PrimitiveArrayConverter : public TypedArrayConverter { + public: + using TypedArrayConverter::TypedArrayConverter; +}; + +template +class DictionaryArrayConverter + : public TypedArrayConverter> { + public: + DictionaryArrayConverter(const std::shared_ptr& type, + std::shared_ptr builder, + typename ArrayConverter::OptionsType options) + : TypedArrayConverter>( + type, builder, options), + value_type_(checked_cast( + *checked_cast(*type).value_type())) {} + + protected: + const T& value_type_; +}; + template class ListArrayConverter : public TypedArrayConverter { public: @@ -122,13 +139,21 @@ class StructArrayConverter : public TypedArrayConverter { std::vector> child_converters_; }; +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + out->reset( \ + new DictionaryArrayConverter(type, std::move(builder), options)); \ + break; + template class PrimitiveArrayConverter, + template class DictionaryArrayConverter, template class ListArrayConverter, template class StructArrayConverter> struct ArrayConverterBuilder { using Self = ArrayConverterBuilder; + DictionaryArrayConverter, ListArrayConverter, + StructArrayConverter>; Status Visit(const NullType& t) { // TODO: merge with the primitive c_type variant below, requires a NullType ctor which @@ -198,6 +223,35 @@ struct ArrayConverterBuilder { return Status::OK(); } + Status Visit(const DictionaryType& t) { + std::unique_ptr builder; + ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, nullptr, &builder)); + + switch (t.value_type()->id()) { + DICTIONARY_CASE(BOOL, BooleanType); + DICTIONARY_CASE(INT8, Int8Type); + DICTIONARY_CASE(INT16, Int16Type); + DICTIONARY_CASE(INT32, Int32Type); + DICTIONARY_CASE(INT64, Int64Type); + DICTIONARY_CASE(UINT8, UInt8Type); + DICTIONARY_CASE(UINT16, UInt16Type); + DICTIONARY_CASE(UINT32, UInt32Type); + DICTIONARY_CASE(UINT64, UInt64Type); + DICTIONARY_CASE(HALF_FLOAT, HalfFloatType); + DICTIONARY_CASE(FLOAT, FloatType); + DICTIONARY_CASE(DOUBLE, DoubleType); + DICTIONARY_CASE(DATE32, Date32Type); + DICTIONARY_CASE(DATE64, Date64Type); + DICTIONARY_CASE(BINARY, BinaryType); + DICTIONARY_CASE(STRING, StringType); + DICTIONARY_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType); + default: + return 
Status::NotImplemented("DictionaryArray converter for type ", t.ToString(), + " not implemented"); + } + return Status::OK(); + } + Status Visit(const StructType& t) { using StructConverter = StructArrayConverter; static_assert( diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index b8050f96468..20391b82047 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1664,3 +1664,85 @@ def test_map_from_tuples(): for entry in [[(5,)], [()], [('5', 'foo', True)]]: with pytest.raises(ValueError, match="(?i)tuple size"): pa.array([entry], type=pa.map_('i4', 'i4')) + + +def test_dictionary_from_boolean(): + typ = pa.dictionary(pa.int8(), value_type=pa.bool_()) + a = pa.array([False, False, True, False, True], type=typ) + assert isinstance(a.type, pa.DictionaryType) + assert a.type.equals(typ) + + expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8()) + expected_dictionary = pa.array([False, True], type=pa.bool_()) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + +@pytest.mark.parametrize('value_type', [ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.float32(), + pa.float64(), + pa.date32(), + pa.date64(), +]) +def test_dictionary_from_integers(value_type): + typ = pa.dictionary(pa.int8(), value_type=value_type) + a = pa.array([1, 2, 1, 1, 2, 3], type=typ) + assert isinstance(a.type, pa.DictionaryType) + assert a.type.equals(typ) + + expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8()) + expected_dictionary = pa.array([1, 2, 3], type=value_type) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + +# @pytest.mark.parametrize('input_index_type', [ +# pa.int8(), +# pa.int16(), +# pa.int32(), +# pa.int64() +# ]) +# def test_dictionary_is_always_adaptive(input_index_type): +# # dictionary array is constructed using adaptive index type builder, +# # meaning that the input index type is ignored since the output index +# # type depends on the input data +# typ = pa.dictionary(input_index_type, value_type=pa.int64()) + +# a = pa.array(range(2**7), type=typ) +# expected = pa.dictionary(pa.int8(), pa.int64()) +# assert a.type.equals(expected) + +# a = pa.array(range(2**7 + 1), type=typ) +# expected = pa.dictionary(pa.int16(), pa.int64()) +# assert a.type.equals(expected) + + +def test_dictionary_from_strings(): + for value_type in [pa.binary(), pa.string()]: + typ = pa.dictionary(pa.int8(), value_type) + a = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=typ) + + assert isinstance(a.type, pa.DictionaryType) + + expected_indices = pa.array([0, 1, 2, 1, 2, 3], type=pa.int8()) + expected_dictionary = pa.array(["", "a", "bb", "ccc"], type=value_type) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + # fixed size binary type + typ = pa.dictionary(pa.int8(), pa.binary(3)) + a = pa.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=typ) + assert isinstance(a.type, pa.DictionaryType) + + expected_indices = pa.array([0, 0, 1, 2, 1], type=pa.int8()) + expected_dictionary = pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3)) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) From 0d73f02d05d52b11d8fac86e2e2f33f7abe3632e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Sep 2020 11:25:49 +0200 Subject: [PATCH 
08/80] fix dict conversion --- cpp/src/arrow/python/python_to_arrow.cc | 16 +++++++++++++++- cpp/src/arrow/util/converter.h | 3 +-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index e2b1b4bd597..beb38d02742 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -476,7 +476,7 @@ class PyDictionaryArrayConverter : public DictionaryArrayConverter::DictionaryArrayConverter; Status Append(PyObject* value) override { - if (PyValue::IsNull(this->type_, this->options_, value)) { + if (PyValue::IsNull(this->value_type_, this->options_, value)) { return this->builder_->AppendNull(); } else { ARROW_ASSIGN_OR_RAISE(auto converted, @@ -506,6 +506,20 @@ class PyDictionaryArrayConverter> } } + Result> Finish() override { + ARROW_ASSIGN_OR_RAISE(auto array, + (DictionaryArrayConverter::Finish())); + if (observed_binary_) { + // If we saw any non-unicode, cast results to a dictionary with binary value type + const auto& current_type = checked_cast(*array->type()); + auto binary_type = TypeTraits::type_singleton(); + auto new_type = dictionary(current_type.index_type(), binary_type, current_type.ordered()); + return array->View(new_type); + } else { + return array; + } + } + protected: bool observed_binary_ = false; }; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index ff6749e6857..31fe9766c8d 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -51,7 +51,7 @@ class ArrayConverter { virtual Status Init() { return Status::OK(); } virtual Status Reserve(int64_t additional_capacity) = 0; - virtual Status Append(Input value) = 0; + virtual Status Append(InputType value) = 0; virtual Status AppendNull() = 0; virtual Status Extend(Input seq, int64_t size) = 0; virtual Result> Finish() = 0; @@ -88,7 +88,6 @@ class TypedArrayConverter : public ArrayConverter { BuilderType* builder_; }; -// mostly for convenience template class PrimitiveArrayConverter : public TypedArrayConverter { public: From aa6364f59d1be0511c895ba88895b9e38d7b9457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Sep 2020 16:20:49 +0200 Subject: [PATCH 09/80] refactor PyBytesView --- cpp/src/arrow/python/common.h | 85 +++++++++---------------- cpp/src/arrow/python/python_to_arrow.cc | 30 ++++----- cpp/src/arrow/python/serialize.cc | 4 +- 3 files changed, 45 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 52a3f334d4e..6a0f3da0120 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -23,6 +23,7 @@ #include "arrow/buffer.h" #include "arrow/python/pyarrow.h" #include "arrow/python/visibility.h" +#include "arrow/result.h" #include "arrow/util/macros.h" namespace arrow { @@ -188,84 +189,60 @@ class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { struct PyBytesView { const char* bytes; Py_ssize_t size; - - PyBytesView() : bytes(NULLPTR), size(0), ref(NULLPTR) {} - - // View the given Python object as binary-like, i.e. 
bytes - Status FromBinary(PyObject* obj) { return FromBinary(obj, "a bytes object"); } - - Status FromString(PyObject* obj) { - bool ignored = false; - return FromString(obj, false, &ignored); - } - - Status FromString(PyObject* obj, bool* is_utf8) { - return FromString(obj, true, is_utf8); - } - - Status FromUnicode(PyObject* obj) { - Py_ssize_t size; - // The utf-8 representation is cached on the unicode object - const char* data = PyUnicode_AsUTF8AndSize(obj, &size); - RETURN_IF_PYERROR(); - this->bytes = data; - this->size = size; - this->ref.reset(); - return Status::OK(); - } - - protected: - PyBytesView(const char* b, Py_ssize_t s, PyObject* obj = NULLPTR) - : bytes(b), size(s), ref(obj) {} + bool is_utf8; // View the given Python object as string-like, i.e. str or (utf8) bytes - Status FromString(PyObject* obj, bool check_utf8, bool* is_utf8) { + static Result FromString(PyObject* obj, bool check_utf8 = false) { if (PyUnicode_Check(obj)) { - *is_utf8 = true; return FromUnicode(obj); } else { - ARROW_RETURN_NOT_OK(FromBinary(obj, "a string or bytes object")); + ARROW_ASSIGN_OR_RAISE(auto result, FromBinary(obj)); if (check_utf8) { // Check the bytes are utf8 utf-8 - OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size)); + OwnedRef decoded(PyUnicode_FromStringAndSize(result.bytes, result.size)); if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - *is_utf8 = true; + result.is_utf8 = true; } else { - *is_utf8 = false; PyErr_Clear(); + result.is_utf8 = false; } - } else { - *is_utf8 = false; } - return Status::OK(); + return result; } } - Status FromBinary(PyObject* obj, const char* expected_msg) { + // View the given Python object as unicode string + static Result FromUnicode(PyObject* obj) { + Py_ssize_t size; + // The utf-8 representation is cached on the unicode object + const char* data = PyUnicode_AsUTF8AndSize(obj, &size); + RETURN_IF_PYERROR(); + return PyBytesView(data, size, true); + } + + // View the given Python object as binary-like, i.e. 
bytes + static Result FromBinary(PyObject* obj) { if (PyBytes_Check(obj)) { - this->bytes = PyBytes_AS_STRING(obj); - this->size = PyBytes_GET_SIZE(obj); - this->ref.reset(); - return Status::OK(); + return PyBytesView(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), false); } else if (PyByteArray_Check(obj)) { - this->bytes = PyByteArray_AS_STRING(obj); - this->size = PyByteArray_GET_SIZE(obj); - this->ref.reset(); - return Status::OK(); + return PyBytesView(PyByteArray_AS_STRING(obj), PyByteArray_GET_SIZE(obj), false); } else if (PyMemoryView_Check(obj)) { PyObject* contig_view = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); RETURN_IF_PYERROR(); - this->ref.reset(contig_view); - Py_buffer* buf = PyMemoryView_GET_BUFFER(contig_view); - this->bytes = reinterpret_cast(buf->buf); - this->size = buf->len; - return Status::OK(); + Py_buffer* buffer = PyMemoryView_GET_BUFFER(contig_view); + return PyBytesView(reinterpret_cast(buffer->buf), buffer->len, false, + contig_view); } else { - return Status::TypeError("Expected ", expected_msg, ", got a '", - Py_TYPE(obj)->tp_name, "' object"); + return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name, + "' object"); } } + protected: + PyBytesView(const char* bytes, Py_ssize_t size, bool is_utf8 = false, + PyObject* obj = nullptr) + : bytes(bytes), size(size), is_utf8(is_utf8), ref(obj) {} + OwnedRef ref; }; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index beb38d02742..d6a0cae4269 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -49,6 +49,9 @@ #include "arrow/python/type_traits.h" #include "arrow/visitor_inline.h" +// store PyBytesView.is_utf8 +// use util::optional for post conversion null sentinel checking + namespace arrow { using internal::checked_cast; @@ -315,15 +318,13 @@ struct ValueConverter> { } static Result Convert(const BaseBinaryType&, const O&, I obj) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); return util::string_view(view.bytes, view.size); } static Result Convert(const FixedSizeBinaryType& type, const O&, I obj) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); if (ARROW_PREDICT_TRUE(view.size == type.byte_width())) { return util::string_view(view.bytes, view.size); } else { @@ -336,27 +337,19 @@ struct ValueConverter> { template static enable_if_string_like>> Convert( const T& type, const O& options, I obj) { - bool is_utf8 = false; - PyBytesView view; if (options.strict) { // Strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); - if (!is_utf8) { + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true)); + if (!view.is_utf8) { return internal::InvalidValue(obj, "was not a utf8 string"); } + return std::make_pair(util::string_view(view.bytes, view.size), view.is_utf8); } else { // Non-strict conversion; keep track of whether values are unicode or bytes - if (PyUnicode_Check(obj)) { - is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); - } else { - // If not unicode or bytes, FromBinary will error - is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); - } + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); + return std::make_pair(util::string_view(view.bytes, view.size), view.is_utf8); } - return 
std::make_pair(util::string_view(view.bytes, view.size), is_utf8); } static Result Convert(const DataType& type, const O&, I obj) { @@ -513,7 +506,8 @@ class PyDictionaryArrayConverter> // If we saw any non-unicode, cast results to a dictionary with binary value type const auto& current_type = checked_cast(*array->type()); auto binary_type = TypeTraits::type_singleton(); - auto new_type = dictionary(current_type.index_type(), binary_type, current_type.ordered()); + auto new_type = + dictionary(current_type.index_type(), binary_type, current_type.ordered()); return array->View(new_type); } else { return array; diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index cefa97abeea..7b91c24f63c 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -37,6 +37,7 @@ #include "arrow/ipc/writer.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/result.h" #include "arrow/tensor.h" #include "arrow/util/logging.h" @@ -482,8 +483,7 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size)); RETURN_NOT_OK(builder->AppendBytes(data, size)); } else if (PyUnicode_Check(elem)) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(elem)); + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromUnicode(elem)); int32_t size = -1; RETURN_NOT_OK(internal::CastSize(view.size, &size)); RETURN_NOT_OK(builder->AppendString(view.bytes, size)); From 8a2cc26baccb8050ce059e77e37d27e71cbac9fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 3 Sep 2020 23:55:56 +0200 Subject: [PATCH 10/80] improve specialization --- cpp/src/arrow/python/python_to_arrow.cc | 131 +++++++------------ cpp/src/arrow/type_traits.h | 23 +++- cpp/src/arrow/util/hashing.h | 5 + python/pyarrow/tests/test_convert_builtin.py | 26 ++-- 4 files changed, 86 insertions(+), 99 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index d6a0cae4269..88801b8d41d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -64,7 +64,7 @@ class PyValue { using I = PyObject*; using O = PyConversionOptions; - static bool IsNull(const DataType&, const O& options, I obj) { + static bool IsNull(const O& options, I obj) { if (options.from_pandas) { return internal::PandasObjectIsNull(obj); } else { @@ -317,16 +317,14 @@ struct ValueConverter> { return value; } - static Result Convert(const BaseBinaryType&, const O&, I obj) { - ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); - return util::string_view(view.bytes, view.size); + static Result Convert(const BaseBinaryType&, const O&, I obj) { + return PyBytesView::FromString(obj); } - static Result Convert(const FixedSizeBinaryType& type, const O&, - I obj) { + static Result Convert(const FixedSizeBinaryType& type, const O&, I obj) { ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); if (ARROW_PREDICT_TRUE(view.size == type.byte_width())) { - return util::string_view(view.bytes, view.size); + return view; } else { std::stringstream ss; ss << "expected to be length " << type.byte_width() << " was " << view.size; @@ -335,20 +333,20 @@ struct ValueConverter> { } template - static enable_if_string_like>> Convert( - const T& type, const O& options, I obj) { + static enable_if_string> Convert(const T& type, const O& options, + I obj) { if (options.strict) { // Strict conversion, force output to 
be unicode / utf8 and validate that // any binary values are utf8 ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true)); + // TODO(kszucs): revisit this one if (!view.is_utf8) { return internal::InvalidValue(obj, "was not a utf8 string"); } - return std::make_pair(util::string_view(view.bytes, view.size), view.is_utf8); + return view; } else { // Non-strict conversion; keep track of whether values are unicode or bytes - ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); - return std::make_pair(util::string_view(view.bytes, view.size), view.is_utf8); + return PyBytesView::FromString(obj); } } @@ -393,57 +391,56 @@ class PyPrimitiveArrayConverter : public PrimitiveArrayConverter::PrimitiveArrayConverter; Status Append(PyObject* value) override { - if (PyValue::IsNull(this->type_, this->options_, value)) { + if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->type_, this->options_, value)); - return this->builder_->Append(converted); + return AppendValue(this->type_, value); } } -}; -template -class PyPrimitiveArrayConverter< - T, enable_if_t::value || is_duration_type::value>> - : public PrimitiveArrayConverter { - public: - using PrimitiveArrayConverter::PrimitiveArrayConverter; + Status AppendValue(const DataType&, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->type_, this->options_, value)); + return this->builder_->Append(converted); + } - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->type_, this->options_, value)) { + template + enable_if_t::value || is_duration_type::value, Status> + AppendValue(const U&, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->type_, converted)) { return this->builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->type_, this->options_, value)); - if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->type_, converted)) { - return this->builder_->AppendNull(); - } else { - return this->builder_->Append(converted); - } + return this->builder_->Append(converted); } } + + template + enable_if_has_string_view AppendValue(const U&, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->type_, this->options_, value)); + return this->builder_->Append(util::string_view(view.bytes, view.size)); + } }; -// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, otherwise we -// allow but return results as BinaryArray template -class PyPrimitiveArrayConverter> +class PyPrimitiveArrayConverter> : public PrimitiveArrayConverter { public: using PrimitiveArrayConverter::PrimitiveArrayConverter; Status Append(PyObject* value) override { - if (PyValue::IsNull(this->type_, this->options_, value)) { + if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto pair, + ARROW_ASSIGN_OR_RAISE(auto view, PyValue::Convert(this->type_, this->options_, value)); - if (!pair.second) { + if (!view.is_utf8) { // observed binary value observed_binary_ = true; } - return this->builder_->Append(pair.first); + return this->builder_->Append(util::string_view(view.bytes, view.size)); } } @@ -463,59 +460,31 @@ class PyPrimitiveArrayConverter> bool observed_binary_ = false; }; -template +template class 
PyDictionaryArrayConverter : public DictionaryArrayConverter { public: using DictionaryArrayConverter::DictionaryArrayConverter; Status Append(PyObject* value) override { - if (PyValue::IsNull(this->value_type_, this->options_, value)) { + if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->builder_->Append(converted); + return AppendValue(this->value_type_, value); } } -}; -template -class PyDictionaryArrayConverter> - : public DictionaryArrayConverter { - public: - using DictionaryArrayConverter::DictionaryArrayConverter; - - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->value_type_, this->options_, value)) { - return this->builder_->AppendNull(); - } else { - ARROW_ASSIGN_OR_RAISE(auto pair, - PyValue::Convert(this->value_type_, this->options_, value)); - if (!pair.second) { - // observed binary value - observed_binary_ = true; - } - return this->builder_->Append(pair.first); - } + Status AppendValue(const DataType&, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->builder_->Append(converted); } - Result> Finish() override { - ARROW_ASSIGN_OR_RAISE(auto array, - (DictionaryArrayConverter::Finish())); - if (observed_binary_) { - // If we saw any non-unicode, cast results to a dictionary with binary value type - const auto& current_type = checked_cast(*array->type()); - auto binary_type = TypeTraits::type_singleton(); - auto new_type = - dictionary(current_type.index_type(), binary_type, current_type.ordered()); - return array->View(new_type); - } else { - return array; - } + template + enable_if_has_string_view AppendValue(const U&, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->builder_->Append(util::string_view(view.bytes, view.size)); } - - protected: - bool observed_binary_ = false; }; // If the value type does not match the expected NumPy dtype, then fall through @@ -562,7 +531,7 @@ class PyListArrayConverter : public ListArrayConverter { Status ValidateSize(const BaseListType&, int64_t size) { return Status::OK(); } Status Append(PyObject* value) override { - if (PyValue::IsNull(this->type_, this->options_, value)) { + if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } @@ -706,7 +675,7 @@ class PyStructArrayConverter : public StructArrayConverter } Status Append(PyObject* value) override { - if (PyValue::IsNull(this->type_, this->options_, value)) { + if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } RETURN_NOT_OK(InferInputKind(value)); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 36719ffa278..6035a1cacbb 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -530,12 +530,21 @@ using enable_if_base_binary = enable_if_t::value, R>; // Any binary excludes string from Base binary template -using is_any_binary_type = +using is_binary_type = std::integral_constant::value || - std::is_same::value>; + std::is_same::value || + std::is_same::value>; template -using enable_if_any_binary = enable_if_t::value, R>; +using enable_if_binary = enable_if_t::value, R>; + +template +using is_string_type = + std::integral_constant::value || + std::is_same::value>; + +template +using enable_if_string = enable_if_t::value, R>; 
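+// NOTE(illustrative sketch, mirroring the python_to_arrow.cc hunk above): these
+// aliases gate overloads on exact string-ness, e.g.
+//
+//   template <typename T>
+//   static enable_if_string<T, Result<PyBytesView>> Convert(const T&, const O&, I obj);
+//
+// participates in overload resolution only for StringType / LargeStringType, while the
+// enable_if_binary alias above plays the same role for the binary types.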
template using is_string_like_type = @@ -697,8 +706,12 @@ template using enable_if_has_c_type = enable_if_t::value, R>; template -using has_string_view = std::integral_constant::value || - is_string_like_type::value>; +using has_string_view = + std::integral_constant::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value>; template using enable_if_has_string_view = enable_if_t::value, R>; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 639421cefe9..f1c4b1e6318 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -842,6 +842,11 @@ struct HashTraits::value && using MemoTableType = BinaryMemoTable; }; +template <> +struct HashTraits { + using MemoTableType = BinaryMemoTable; +}; + template struct HashTraits::value>> { using MemoTableType = BinaryMemoTable; diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 20391b82047..f738a254950 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1710,19 +1710,19 @@ def test_dictionary_from_integers(value_type): # pa.int32(), # pa.int64() # ]) -# def test_dictionary_is_always_adaptive(input_index_type): -# # dictionary array is constructed using adaptive index type builder, -# # meaning that the input index type is ignored since the output index -# # type depends on the input data -# typ = pa.dictionary(input_index_type, value_type=pa.int64()) - -# a = pa.array(range(2**7), type=typ) -# expected = pa.dictionary(pa.int8(), pa.int64()) -# assert a.type.equals(expected) - -# a = pa.array(range(2**7 + 1), type=typ) -# expected = pa.dictionary(pa.int16(), pa.int64()) -# assert a.type.equals(expected) +def test_dictionary_is_always_adaptive(): + # dictionary array is constructed using adaptive index type builder, + # meaning that the output index type may be wider than the given index type + # since it depends on the input data + typ = pa.dictionary(pa.int8(), value_type=pa.int64()) + + a = pa.array(range(2**7), type=typ) + expected = pa.dictionary(pa.int8(), pa.int64()) + assert a.type.equals(expected) + + a = pa.array(range(2**7 + 1), type=typ) + expected = pa.dictionary(pa.int16(), pa.int64()) + assert a.type.equals(expected) def test_dictionary_from_strings(): From 07b96b1b4d1dffbfca5098f6f5b80243c44d0d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 00:37:46 +0200 Subject: [PATCH 11/80] other specialization --- cpp/src/arrow/python/python_to_arrow.cc | 105 +++++++++++++++++------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 88801b8d41d..06473078249 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -390,37 +390,65 @@ class PyPrimitiveArrayConverter : public PrimitiveArrayConverter::PrimitiveArrayConverter; + Status Append(PyObject* value) override; +}; + +template +class PyPrimitiveArrayConverter< + T, enable_if_t::value || is_boolean_type::value || + is_number_type::value || is_decimal_type::value || is_date_type::value || + is_time_type::value>> + : public PrimitiveArrayConverter { + public: + using PrimitiveArrayConverter::PrimitiveArrayConverter; + Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } else { - return AppendValue(this->type_, value); + 
ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->type_, this->options_, value)); + return this->builder_->Append(converted); } } +}; - Status AppendValue(const DataType&, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->type_, this->options_, value)); - return this->builder_->Append(converted); - } +template +class PyPrimitiveArrayConverter< + T, enable_if_t::value || is_duration_type::value>> + : public PrimitiveArrayConverter { + public: + using PrimitiveArrayConverter::PrimitiveArrayConverter; - template - enable_if_t::value || is_duration_type::value, Status> - AppendValue(const U&, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->type_, this->options_, value)); - if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->type_, converted)) { + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } else { - return this->builder_->Append(converted); + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->type_, converted)) { + return this->builder_->AppendNull(); + } else { + return this->builder_->Append(converted); + } } } +}; - template - enable_if_has_string_view AppendValue(const U&, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->type_, this->options_, value)); - return this->builder_->Append(util::string_view(view.bytes, view.size)); +template +class PyPrimitiveArrayConverter> + : public PrimitiveArrayConverter { + public: + using PrimitiveArrayConverter::PrimitiveArrayConverter; + + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->type_, this->options_, value)); + return this->builder_->Append(util::string_view(view.bytes, view.size)); + } } }; @@ -440,7 +468,7 @@ class PyPrimitiveArrayConverter> // observed binary value observed_binary_ = true; } - return this->builder_->Append(util::string_view(view.bytes, view.size)); + return this->builder_->Append(view.bytes, view.size); } } @@ -460,30 +488,45 @@ class PyPrimitiveArrayConverter> bool observed_binary_ = false; }; -template +template class PyDictionaryArrayConverter : public DictionaryArrayConverter { public: using DictionaryArrayConverter::DictionaryArrayConverter; + Status Append(PyObject* value) override; +}; + +template +class PyDictionaryArrayConverter> + : public DictionaryArrayConverter { + public: + using DictionaryArrayConverter::DictionaryArrayConverter; + Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->builder_->AppendNull(); } else { - return AppendValue(this->value_type_, value); + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->builder_->Append(converted); } } +}; - Status AppendValue(const DataType&, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->builder_->Append(converted); - } +template +class PyDictionaryArrayConverter> + : public DictionaryArrayConverter { + public: + using DictionaryArrayConverter::DictionaryArrayConverter; - template - enable_if_has_string_view AppendValue(const U&, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto view, - 
PyValue::Convert(this->value_type_, this->options_, value)); - return this->builder_->Append(util::string_view(view.bytes, view.size)); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->builder_->Append(util::string_view(view.bytes, view.size)); + } } }; From 74c74cfaed950dec917430c0e37399dfd74119e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 01:10:19 +0200 Subject: [PATCH 12/80] minor cleanup --- cpp/src/arrow/python/python_to_arrow.cc | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 06473078249..82207d8899b 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -386,18 +386,13 @@ class PyArrayConverter : public ArrayConverter { }; template -class PyPrimitiveArrayConverter : public PrimitiveArrayConverter { - public: - using PrimitiveArrayConverter::PrimitiveArrayConverter; - - Status Append(PyObject* value) override; -}; +class PyPrimitiveArrayConverter : public PrimitiveArrayConverter {}; template class PyPrimitiveArrayConverter< T, enable_if_t::value || is_boolean_type::value || - is_number_type::value || is_decimal_type::value || is_date_type::value || - is_time_type::value>> + is_number_type::value || is_decimal_type::value || + is_date_type::value || is_time_type::value>> : public PrimitiveArrayConverter { public: using PrimitiveArrayConverter::PrimitiveArrayConverter; @@ -489,12 +484,7 @@ class PyPrimitiveArrayConverter> }; template -class PyDictionaryArrayConverter : public DictionaryArrayConverter { - public: - using DictionaryArrayConverter::DictionaryArrayConverter; - - Status Append(PyObject* value) override; -}; +class PyDictionaryArrayConverter : public DictionaryArrayConverter {}; template class PyDictionaryArrayConverter> From a1fa6cfca5e4c7edf1e97832aaeb573eec8c68a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 01:11:42 +0200 Subject: [PATCH 13/80] clang format --- cpp/src/arrow/python/python_to_arrow.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 82207d8899b..ffcaf773f34 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -484,7 +484,8 @@ class PyPrimitiveArrayConverter> }; template -class PyDictionaryArrayConverter : public DictionaryArrayConverter {}; +class PyDictionaryArrayConverter : public DictionaryArrayConverter { +}; template class PyDictionaryArrayConverter> From d0f566784af87ea489952473449c452c703d1d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 01:49:06 +0200 Subject: [PATCH 14/80] update dictionary scalar test --- python/pyarrow/includes/libarrow.pxd | 8 ++++---- python/pyarrow/tests/test_misc.py | 2 +- python/pyarrow/tests/test_scalars.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9a3451e4f82..7de3875ad8c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -969,11 +969,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: 
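Patch 14 begins reworking the dictionary scalar bindings: `DictionaryScalar::ValueType` is surfaced as a module-level `cdef cppclass` in the `.pxd` below so later patches can construct it from Cython, and the Python tests switch to asserting on the scalar's `index` and `dictionary` fields. A sketch of the access pattern those tests exercise, assuming pyarrow with these changes (values here are illustrative):

import pyarrow as pa

indices = pa.array([2, None, 1])
dictionary = pa.array(["foo", "bar", "baz"])
arr = pa.DictionaryArray.from_arrays(indices, dictionary)

s = arr[0]                       # a pyarrow.DictionaryScalar
assert s.as_py() == "baz"        # decoded value
assert s.value.as_py() == "baz"  # same value via the encoded form
assert s.index.as_py() == 2      # position into the dictionary
assert s.dictionary.equals(dictionary)
assert arr[1].as_py() is None    # null slots decode to None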
vector[shared_ptr[CScalar]] value CResult[shared_ptr[CScalar]] field(CFieldRef ref) const - cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): - cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": - shared_ptr[CScalar] index - shared_ptr[CArray] dictionary + cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": + shared_ptr[CScalar] index + shared_ptr[CArray] dictionary + cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): CDictionaryValue value CResult[shared_ptr[CScalar]] GetEncodedValue() diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index e98c36d07fd..932b664f7e4 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -132,7 +132,7 @@ def test_build_info(): pa.FixedSizeListScalar, pa.UnionScalar, pa.StructScalar, - pa.DictionaryScalar, + # pa.DictionaryScalar, pa.ipc.Message, pa.ipc.MessageReader, pa.MemoryPool, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 091ae38e6e4..a8c8187cfac 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -531,6 +531,7 @@ def test_map(): def test_dictionary(): + # TODO(kszucs): test null values indices = [2, 1, 2, 0] dictionary = ['foo', 'bar', 'baz'] @@ -550,8 +551,9 @@ def test_dictionary(): with pytest.warns(FutureWarning): assert s.dictionary_value.as_py() == v - with pytest.raises(pa.ArrowNotImplementedError): - pickle.loads(pickle.dumps(s)) + # FIXME(kszucs) + # restored = pickle.loads(pickle.dumps(s)) + # assert restored.equals(s) def test_union(): From 69cebce6f46b9c5e08614dc6f3cc7f314aab4c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 13:43:17 +0200 Subject: [PATCH 15/80] fix dictionary scalar serialization and please gcc4.8 --- cpp/src/arrow/array/array_base.cc | 8 ++++- cpp/src/arrow/python/common.h | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 4 +-- cpp/src/arrow/scalar.h | 4 +-- python/pyarrow/includes/libarrow.pxd | 2 ++ python/pyarrow/scalar.pxi | 44 +++++++++++++++++++++++++ python/pyarrow/tests/test_misc.py | 2 +- python/pyarrow/tests/test_scalars.py | 19 +++++------ 8 files changed, 68 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 0781dd4a2df..900e8d2b38f 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -161,7 +161,13 @@ struct ScalarFromArraySlotImpl { } if (array_.IsNull(index_)) { - return MakeNullScalar(array_.type()); + auto null = MakeNullScalar(array_.type()); + if (is_dictionary(array_.type()->id())) { + auto& dict_null = checked_cast(*null); + const auto& dict_array = checked_cast(array_); + dict_null.value.dictionary = dict_array.dictionary(); + } + return null; } RETURN_NOT_OK(VisitArrayInline(array_, this)); diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 6a0f3da0120..e95f8312f06 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -207,7 +207,7 @@ struct PyBytesView { result.is_utf8 = false; } } - return result; + return std::move(result); } } diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index ffcaf773f34..89b0bd0a641 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -324,7 +324,7 @@ struct ValueConverter> { static Result Convert(const FixedSizeBinaryType& type, const O&, I obj) { 
ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); if (ARROW_PREDICT_TRUE(view.size == type.byte_width())) { - return view; + return std::move(view); } else { std::stringstream ss; ss << "expected to be length " << type.byte_width() << " was " << view.size; @@ -343,7 +343,7 @@ struct ValueConverter> { if (!view.is_utf8) { return internal::InvalidValue(obj, "was not a utf8 string"); } - return view; + return std::move(view); } else { // Non-strict conversion; keep track of whether values are unicode or bytes return PyBytesView::FromString(obj); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 4a007dd8782..946d3bfe44f 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -428,8 +428,8 @@ struct ARROW_EXPORT DictionaryScalar : public Scalar { explicit DictionaryScalar(std::shared_ptr type); - DictionaryScalar(ValueType value, std::shared_ptr type) - : Scalar(std::move(type), true), value(std::move(value)) {} + DictionaryScalar(ValueType value, std::shared_ptr type, bool is_valid = true) + : Scalar(std::move(type), is_valid), value(std::move(value)) {} Result> GetEncodedValue() const; }; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7de3875ad8c..0ba42dd8e76 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -974,6 +974,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] dictionary cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): + CDictionaryScalar(CDictionaryValue value, shared_ptr[CDataType], + c_bool is_valid) CDictionaryValue value CResult[shared_ptr[CScalar]] GetEncodedValue() diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index cc06ca6a2f9..9cdb62b4327 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -687,6 +687,50 @@ cdef class DictionaryScalar(Scalar): Concrete class for dictionary-encoded scalars. 
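The additions below give `DictionaryScalar` pickle support: `__reduce__` reduces the scalar to `(type, is_valid, index, dictionary)` and `_reconstruct` rebuilds a `CDictionaryScalar` from those parts, which is what lets the previously skipped round-trip assertion in test_scalars.py pass. The user-visible effect, as a short sketch assuming this patch is applied:

import pickle
import pyarrow as pa

arr = pa.DictionaryArray.from_arrays(pa.array([0, 1, None]),
                                     pa.array(["foo", "bar"]))
s = arr[1]

# pickling round-trips the index, the dictionary and the validity flag
restored = pickle.loads(pickle.dumps(s))
assert restored.equals(s)
assert restored.as_py() == "bar"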
""" + @classmethod + def _reconstruct(cls, type, is_valid, index, dictionary): + cdef: + CDictionaryValue value + shared_ptr[CDictionaryScalar] wrapped + DataType type_ + Scalar index_ + Array dictionary_ + + type_ = ensure_type(type, allow_none=False) + if not isinstance(type_, DictionaryType): + raise TypeError('Must pass a DictionaryType instance') + + if isinstance(index, Scalar): + if not index.type.equals(type.index_type): + raise TypeError("The Scalar value passed as index must have " + "identical type to the dictionary type's " + "index_type") + index_ = index + else: + index_ = scalar(index, type=type_.index_type) + + if isinstance(dictionary, Array): + if not dictionary.type.equals(type.value_type): + raise TypeError("The Array passed as dictionary must have " + "identical type to the dictionary type's " + "value_type") + dictionary_ = dictionary + else: + dictionary_ = array(dictionary, type=type_.value_type) + + value.index = pyarrow_unwrap_scalar(index_) + value.dictionary = pyarrow_unwrap_array(dictionary_) + + wrapped = make_shared[CDictionaryScalar]( + value, pyarrow_unwrap_data_type(type_), (is_valid) + ) + return Scalar.wrap( wrapped) + + def __reduce__(self): + return DictionaryScalar._reconstruct, ( + self.type, self.is_valid, self.index, self.dictionary + ) + @property def index(self): """ diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 932b664f7e4..e98c36d07fd 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -132,7 +132,7 @@ def test_build_info(): pa.FixedSizeListScalar, pa.UnionScalar, pa.StructScalar, - # pa.DictionaryScalar, + pa.DictionaryScalar, pa.ipc.Message, pa.ipc.MessageReader, pa.MemoryPool, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index a8c8187cfac..fa48ad8b5f2 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -531,29 +531,28 @@ def test_map(): def test_dictionary(): - # TODO(kszucs): test null values - indices = [2, 1, 2, 0] - dictionary = ['foo', 'bar', 'baz'] + indices = pa.array([2, None, 1, 2, 0, None]) + dictionary = pa.array(['foo', 'bar', 'baz']) arr = pa.DictionaryArray.from_arrays(indices, dictionary) - expected = ['baz', 'bar', 'baz', 'foo'] + expected = ['baz', None, 'bar', 'baz', 'foo', None] + assert arr.to_pylist() == expected for j, (i, v) in enumerate(zip(indices, expected)): s = arr[j] assert s.as_py() == v assert s.value.as_py() == v - assert s.index.as_py() == i - assert s.dictionary.to_pylist() == dictionary + assert s.index.equals(i) + assert s.dictionary.equals(dictionary) with pytest.warns(FutureWarning): - assert s.index_value.as_py() == i + assert s.index_value.equals(i) with pytest.warns(FutureWarning): assert s.dictionary_value.as_py() == v - # FIXME(kszucs) - # restored = pickle.loads(pickle.dumps(s)) - # assert restored.equals(s) + restored = pickle.loads(pickle.dumps(s)) + assert restored.equals(s) def test_union(): From be9d8e7d8bc0f739cf6045b7628e5789f475fff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 14:06:43 +0200 Subject: [PATCH 16/80] fix dictionary scalar test --- cpp/src/arrow/scalar_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 4171164899d..a8e4e4780f2 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -627,6 +627,9 @@ TEST(TestDictionaryScalar, Basics) { gamma.dictionary = dict; auto 
scalar_null = MakeNullScalar(ty); + auto& dict_scalar_null = checked_cast(*scalar_null); + dict_scalar_null.value.dictionary = dict; + auto scalar_alpha = DictionaryScalar(alpha, ty); auto scalar_gamma = DictionaryScalar(gamma, ty); From 4f5891ffccc5f3c071c263fadd1cf003842e31dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 14:25:32 +0200 Subject: [PATCH 17/80] linting; try to fix msvc error --- cpp/src/arrow/util/converter.h | 78 ++++++++++++---------------- python/pyarrow/includes/libarrow.pxd | 4 +- 2 files changed, 36 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 31fe9766c8d..825970a58e5 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -138,33 +138,27 @@ class StructArrayConverter : public TypedArrayConverter { std::vector> child_converters_; }; -#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - out->reset( \ - new DictionaryArrayConverter(type, std::move(builder), options)); \ +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + out->reset(new DAC(type, std::move(builder), options)); \ break; -template class PrimitiveArrayConverter, - template class DictionaryArrayConverter, - template class ListArrayConverter, - template class StructArrayConverter> +template class PAC, + template class DAC, template class LAC, + template class SAC> struct ArrayConverterBuilder { - using Self = ArrayConverterBuilder; + using Self = ArrayConverterBuilder; Status Visit(const NullType& t) { // TODO: merge with the primitive c_type variant below, requires a NullType ctor which // accepts a type instance using BuilderType = typename TypeTraits::BuilderType; - using NullConverter = PrimitiveArrayConverter; - static_assert( - std::is_same::value, - ""); + using ConverterType = PAC; + static_assert(std::is_same::value, + ""); auto builder = std::make_shared(pool); - out->reset(new NullConverter(type, std::move(builder), options)); + out->reset(new ConverterType(type, std::move(builder), options)); return Status::OK(); } @@ -174,13 +168,12 @@ struct ArrayConverterBuilder { Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using PrimitiveConverter = PrimitiveArrayConverter; - static_assert(std::is_same::value, + using ConverterType = PAC; + static_assert(std::is_same::value, ""); auto builder = std::make_shared(type, pool); - out->reset(new PrimitiveConverter(type, std::move(builder), options)); + out->reset(new ConverterType(type, std::move(builder), options)); return Status::OK(); } @@ -188,24 +181,22 @@ struct ArrayConverterBuilder { enable_if_t::value && !std::is_same::value, Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ListConverter = ListArrayConverter; - static_assert( - std::is_same::value, - ""); + using ConverterType = LAC; + static_assert(std::is_same::value, + ""); ARROW_ASSIGN_OR_RAISE(auto child_converter, (Self::Make(t.value_type(), pool, options))); auto builder = std::make_shared(pool, child_converter->builder(), type); out->reset( - new ListConverter(type, std::move(builder), std::move(child_converter), options)); + new ConverterType(type, std::move(builder), std::move(child_converter), options)); return Status::OK(); } Status Visit(const MapType& t) { - using MapConverter = ListArrayConverter; - static_assert( - std::is_same::value, - ""); + using ConverterType = LAC; + static_assert(std::is_same::value, + ""); // 
TODO(kszucs): seems like builders not respect field nullability std::vector> struct_fields{t.key_field(), t.item_field()}; @@ -217,8 +208,8 @@ struct ArrayConverterBuilder { auto item_builder = struct_builder->child_builder(1); auto builder = std::make_shared(pool, key_builder, item_builder, type); - out->reset( - new MapConverter(type, std::move(builder), std::move(struct_converter), options)); + out->reset(new ConverterType(type, std::move(builder), std::move(struct_converter), + options)); return Status::OK(); } @@ -252,13 +243,12 @@ struct ArrayConverterBuilder { } Status Visit(const StructType& t) { - using StructConverter = StructArrayConverter; - static_assert( - std::is_same::value, - ""); + using ConverterType = SAC; + static_assert(std::is_same::value, + ""); - std::shared_ptr child_converter; - std::vector> child_converters; + std::shared_ptr child_converter; + std::vector> child_converters; std::vector> child_builders; for (const auto& field : t.fields()) { @@ -270,16 +260,16 @@ struct ArrayConverterBuilder { } auto builder = std::make_shared(type, pool, child_builders); - out->reset(new StructConverter(type, std::move(builder), std::move(child_converters), - options)); + out->reset(new ConverterType(type, std::move(builder), std::move(child_converters), + options)); return Status::OK(); } Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } - static Result> Make(std::shared_ptr type, - MemoryPool* pool, Options options) { - std::shared_ptr out; + static Result> Make(std::shared_ptr type, + MemoryPool* pool, Options options) { + std::shared_ptr out; Self visitor = {type, pool, options, &out}; ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); ARROW_RETURN_NOT_OK(out->Init()); @@ -289,7 +279,7 @@ struct ArrayConverterBuilder { const std::shared_ptr& type; MemoryPool* pool; Options options; - std::shared_ptr* out; + std::shared_ptr* out; }; } // namespace arrow diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0ba42dd8e76..8797ba68e40 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -970,8 +970,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CResult[shared_ptr[CScalar]] field(CFieldRef ref) const cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": - shared_ptr[CScalar] index - shared_ptr[CArray] dictionary + shared_ptr[CScalar] index + shared_ptr[CArray] dictionary cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): CDictionaryScalar(CDictionaryValue value, shared_ptr[CDataType], From ddde7a9ac747d59aa84ce6df6174c02c4e9a4eba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 15:33:14 +0200 Subject: [PATCH 18/80] avoid name collisions --- cpp/src/arrow/python/common.h | 2 +- cpp/src/arrow/util/converter.h | 108 ++++++++++++++++----------------- 2 files changed, 53 insertions(+), 57 deletions(-) diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index e95f8312f06..9f4d7c15bee 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -240,7 +240,7 @@ struct PyBytesView { protected: PyBytesView(const char* bytes, Py_ssize_t size, bool is_utf8 = false, - PyObject* obj = nullptr) + PyObject* obj = NULLPTR) : bytes(bytes), size(size), is_utf8(is_utf8), ref(obj) {} OwnedRef ref; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 825970a58e5..c741d7ec898 100644 --- a/cpp/src/arrow/util/converter.h +++ 
b/cpp/src/arrow/util/converter.h @@ -62,16 +62,14 @@ class ArrayConverter { Options options_; }; -template ::BuilderType> -class TypedArrayConverter : public ArrayConverter { +class TypedArrayConverter : public BaseConverter { public: - using ArrayConverterType = ArrayConverter; - TypedArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - typename ArrayConverter::OptionsType options) - : ArrayConverter(type, builder, options), + typename BaseConverter::OptionsType options) + : BaseConverter(type, builder, options), type_(checked_cast(*type)), builder_(checked_cast(builder.get())) {} @@ -88,20 +86,20 @@ class TypedArrayConverter : public ArrayConverter { BuilderType* builder_; }; -template -class PrimitiveArrayConverter : public TypedArrayConverter { +template +class PrimitiveArrayConverter : public TypedArrayConverter { public: - using TypedArrayConverter::TypedArrayConverter; + using TypedArrayConverter::TypedArrayConverter; }; -template +template class DictionaryArrayConverter - : public TypedArrayConverter> { + : public TypedArrayConverter> { public: DictionaryArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - typename ArrayConverter::OptionsType options) - : TypedArrayConverter>( + typename BaseConverter::OptionsType options) + : TypedArrayConverter>( type, builder, options), value_type_(checked_cast( *checked_cast(*type).value_type())) {} @@ -110,52 +108,54 @@ class DictionaryArrayConverter const T& value_type_; }; -template -class ListArrayConverter : public TypedArrayConverter { +template +class ListArrayConverter : public TypedArrayConverter { public: ListArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - std::shared_ptr value_converter, - typename ArrayConverter::OptionsType options) - : TypedArrayConverter(type, builder, options), + std::shared_ptr value_converter, + typename BaseConverter::OptionsType options) + : TypedArrayConverter(type, builder, options), value_converter_(std::move(value_converter)) {} protected: - std::shared_ptr value_converter_; + std::shared_ptr value_converter_; }; -template -class StructArrayConverter : public TypedArrayConverter { +template +class StructArrayConverter : public TypedArrayConverter { public: StructArrayConverter(const std::shared_ptr& type, std::shared_ptr builder, - std::vector> child_converters, - typename ArrayConverter::OptionsType options) - : TypedArrayConverter(type, builder, options), + std::vector> child_converters, + typename BaseConverter::OptionsType options) + : TypedArrayConverter(type, builder, options), child_converters_(std::move(child_converters)) {} protected: - std::vector> child_converters_; + std::vector> child_converters_; }; -#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - out->reset(new DAC(type, std::move(builder), options)); \ +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + out->reset(new DictionaryConverter(type, std::move(builder), options)); \ break; -template class PAC, - template class DAC, template class LAC, - template class SAC> +template class PrimitiveConverter, + template class DictionaryConverter, + template class ListConverter, + template class StructConverter> struct ArrayConverterBuilder { - using Self = ArrayConverterBuilder; + using Self = ArrayConverterBuilder; Status Visit(const NullType& t) { // TODO: merge with the primitive c_type variant below, requires a NullType ctor which // accepts a type instance using BuilderType = typename TypeTraits::BuilderType; - using 
ConverterType = PAC; - static_assert(std::is_same::value, - ""); + using ConverterType = PrimitiveConverter; + static_assert(std::is_base_of::value, ""); auto builder = std::make_shared(pool); out->reset(new ConverterType(type, std::move(builder), options)); @@ -168,9 +168,8 @@ struct ArrayConverterBuilder { Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = PAC; - static_assert(std::is_same::value, - ""); + using ConverterType = PrimitiveConverter; + static_assert(std::is_base_of::value, ""); auto builder = std::make_shared(type, pool); out->reset(new ConverterType(type, std::move(builder), options)); @@ -181,9 +180,8 @@ struct ArrayConverterBuilder { enable_if_t::value && !std::is_same::value, Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = LAC; - static_assert(std::is_same::value, - ""); + using ConverterType = ListConverter; + static_assert(std::is_base_of::value, ""); ARROW_ASSIGN_OR_RAISE(auto child_converter, (Self::Make(t.value_type(), pool, options))); @@ -194,9 +192,8 @@ struct ArrayConverterBuilder { } Status Visit(const MapType& t) { - using ConverterType = LAC; - static_assert(std::is_same::value, - ""); + using ConverterType = ListConverter; + static_assert(std::is_base_of::value, ""); // TODO(kszucs): seems like builders not respect field nullability std::vector> struct_fields{t.key_field(), t.item_field()}; @@ -215,7 +212,7 @@ struct ArrayConverterBuilder { Status Visit(const DictionaryType& t) { std::unique_ptr builder; - ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, nullptr, &builder)); + ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, NULLPTR, &builder)); switch (t.value_type()->id()) { DICTIONARY_CASE(BOOL, BooleanType); @@ -243,20 +240,19 @@ struct ArrayConverterBuilder { } Status Visit(const StructType& t) { - using ConverterType = SAC; - static_assert(std::is_same::value, - ""); + using ConverterType = StructConverter; + static_assert(std::is_base_of::value, ""); - std::shared_ptr child_converter; - std::vector> child_converters; + std::shared_ptr child_converter; + std::vector> child_converters; std::vector> child_builders; for (const auto& field : t.fields()) { ARROW_ASSIGN_OR_RAISE(child_converter, Self::Make(field->type(), pool, options)); // TODO: use move - child_converters.emplace_back(child_converter); - child_builders.emplace_back(child_converter->builder()); + child_converters.push_back(child_converter); + child_builders.push_back(child_converter->builder()); } auto builder = std::make_shared(type, pool, child_builders); @@ -267,9 +263,9 @@ struct ArrayConverterBuilder { Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } - static Result> Make(std::shared_ptr type, - MemoryPool* pool, Options options) { - std::shared_ptr out; + static Result> Make(std::shared_ptr type, + MemoryPool* pool, Options options) { + std::shared_ptr out; Self visitor = {type, pool, options, &out}; ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); ARROW_RETURN_NOT_OK(out->Init()); @@ -279,7 +275,7 @@ struct ArrayConverterBuilder { const std::shared_ptr& type; MemoryPool* pool; Options options; - std::shared_ptr* out; + std::shared_ptr* out; }; } // namespace arrow From e7c046cf0dfb93930566b68f3704d7b799903dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 16:00:40 +0200 Subject: [PATCH 19/80] use string view --- cpp/src/arrow/python/python_to_arrow.cc | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 89b0bd0a641..88c1ae0a9f0 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -463,7 +463,7 @@ class PyPrimitiveArrayConverter> // observed binary value observed_binary_ = true; } - return this->builder_->Append(view.bytes, view.size); + return this->builder_->Append(util::string_view(view.bytes, view.size)); } } From 22d098c578fa3ae68311b4bef12fb32949557cc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 16:21:09 +0200 Subject: [PATCH 20/80] test that dictionary index type is honored --- python/pyarrow/tests/test_convert_builtin.py | 21 ++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f738a254950..af37ba5bf78 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1704,12 +1704,21 @@ def test_dictionary_from_integers(value_type): assert a.dictionary.equals(expected_dictionary) -# @pytest.mark.parametrize('input_index_type', [ -# pa.int8(), -# pa.int16(), -# pa.int32(), -# pa.int64() -# ]) +@pytest.mark.parametrize('input_index_type', [ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64() +]) +def test_dictionary_index_type(input_index_type): + # dictionary array is constructed using adaptive index type builder, + # but the input index type is considered as the minimal width type to use + + typ = pa.dictionary(input_index_type, value_type=pa.int64()) + arr = pa.array(range(10), type=typ) + assert arr.type.equals(typ) + + def test_dictionary_is_always_adaptive(): # dictionary array is constructed using adaptive index type builder, # meaning that the output index type may be wider than the given index type From 524dfbdead8370c81304ebd7574f5af5883c5176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 4 Sep 2020 16:37:49 +0200 Subject: [PATCH 21/80] always return with an array instead of a chunkedarray --- cpp/src/arrow/python/numpy_to_arrow.cc | 9 ++++----- cpp/src/arrow/python/python_test.cc | 23 +++++++++-------------- cpp/src/arrow/python/python_to_arrow.cc | 12 +++--------- cpp/src/arrow/python/python_to_arrow.h | 8 +++----- python/pyarrow/array.pxi | 13 ++++++------- python/pyarrow/includes/libarrow.pxd | 5 ++--- python/pyarrow/scalar.pxi | 5 +---- 7 files changed, 28 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index af608dfc360..320937142c2 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -314,11 +314,10 @@ Status NumPyConverter::Convert() { PyConversionOptions py_options; py_options.type = type_; py_options.from_pandas = from_pandas_; - std::shared_ptr res; - RETURN_NOT_OK(ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options, - &res)); - out_arrays_ = res->chunks(); + ARROW_ASSIGN_OR_RAISE( + auto result, ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options)); + out_arrays_.push_back(result); return Status::OK(); } diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index e0b22915af0..348dbaec662 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -329,8 +329,7 @@ TEST(BuiltinConversionTest, 
TestMixedTypeFails) { ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); - std::shared_ptr arr; - ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {}, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {})); } TEST_F(DecimalTest, FromPythonDecimalRescaleNotTruncateable) { @@ -422,17 +421,15 @@ TEST_F(DecimalTest, TestNoneAndNaN) { ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2)); ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3)); - std::shared_ptr arr, arr_from_pandas; PyConversionOptions options; - ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); options.from_pandas = true; - ASSERT_OK(ConvertPySequence(list, nullptr, options, &arr_from_pandas)); - auto c0 = arr_from_pandas->chunk(0); - ASSERT_TRUE(c0->IsValid(0)); - ASSERT_TRUE(c0->IsNull(1)); - ASSERT_TRUE(c0->IsNull(2)); - ASSERT_TRUE(c0->IsNull(3)); + auto arr = ConvertPySequence(list, nullptr, options).ValueOrDie(); + ASSERT_TRUE(arr->IsValid(0)); + ASSERT_TRUE(arr->IsNull(1)); + ASSERT_TRUE(arr->IsNull(2)); + ASSERT_TRUE(arr->IsNull(3)); } TEST_F(DecimalTest, TestMixedPrecisionAndScale) { @@ -451,8 +448,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScale) { ASSERT_EQ(0, result); } - std::shared_ptr arr; - ASSERT_OK(ConvertPySequence(list, nullptr, {}, &arr)); + auto arr = ConvertPySequence(list, nullptr, {}).ValueOrDie(); const auto& type = checked_cast(*arr->type()); int32_t expected_precision = 9; @@ -476,8 +472,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) { ASSERT_EQ(PyList_SetItem(list, 0, value1), 0); ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); - std::shared_ptr arr; - ASSERT_OK(ConvertPySequence(list, nullptr, {}, &arr)); + auto arr = ConvertPySequence(list, nullptr, {}).ValueOrDie(); const auto& type = checked_cast(*arr->type()); ASSERT_EQ(3, type.precision()); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 88c1ae0a9f0..fe5ba6dfddd 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -851,8 +851,8 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } -Status ConvertPySequence(PyObject* obj, PyObject* mask, const PyConversionOptions& opts, - std::shared_ptr* out) { +Result> ConvertPySequence(PyObject* obj, PyObject* mask, + const PyConversionOptions& opts) { PyAcquireGIL lock; PyObject* seq; @@ -893,13 +893,7 @@ Status ConvertPySequence(PyObject* obj, PyObject* mask, const PyConversionOption } else { RETURN_NOT_OK(converter->Extend(seq, size)); } - - // Retrieve result. Conversion may yield one or more array values - // return converter->GetResult(out); - ARROW_ASSIGN_OR_RAISE(auto result, converter->Finish()); - ArrayVector chunks{result}; - *out = std::make_shared(chunks, real_type); - return Status::OK(); + return converter->Finish(); } } // namespace py diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 83abee3b2b7..bfe7d7f1767 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -71,12 +71,10 @@ struct PyConversionOptions { /// values in the sequence are null (true) or not null (false). 
This parameter /// may be null /// \param[in] options various conversion options -/// \param[out] out a ChunkedArray containing one or more chunks -/// \return Status +/// \return Result Array ARROW_PYTHON_EXPORT -Status ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr* out); +Result> ConvertPySequence(PyObject* obj, PyObject* mask, + const PyConversionOptions& options); } // namespace py diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a9f1be221f9..679e7ff3c88 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -21,8 +21,10 @@ import warnings cdef _sequence_to_array(object sequence, object mask, object size, DataType type, CMemoryPool* pool, c_bool from_pandas): - cdef int64_t c_size - cdef PyConversionOptions options + cdef: + int64_t c_size + PyConversionOptions options + shared_ptr[CArray] result if type is not None: options.type = type.sp_type @@ -37,12 +39,9 @@ cdef _sequence_to_array(object sequence, object mask, object size, cdef shared_ptr[CChunkedArray] out with nogil: - check_status(ConvertPySequence(sequence, mask, options, &out)) + result = GetResultValue(ConvertPySequence(sequence, mask, options)) - if out.get().num_chunks() == 1: - return pyarrow_wrap_array(out.get().chunk(0)) - else: - return pyarrow_wrap_chunked_array(out) + return pyarrow_wrap_array(result) cdef inline _is_array_like(obj): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8797ba68e40..97b63c7ac6e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1769,9 +1769,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: # TODO Some functions below are not actually "nogil" - CStatus ConvertPySequence(object obj, object mask, - const PyConversionOptions& options, - shared_ptr[CChunkedArray]* out) + CResult[shared_ptr[CArray]] ConvertPySequence( + object obj, object mask, const PyConversionOptions& options) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 9cdb62b4327..be3a503425d 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -873,7 +873,6 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): PyConversionOptions options shared_ptr[CScalar] scalar shared_ptr[CArray] array - shared_ptr[CChunkedArray] chunked bint is_pandas_object = False type = ensure_type(type, allow_none=True) @@ -895,9 +894,7 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): value = [value] with nogil: - check_status(ConvertPySequence(value, None, options, &chunked)) + array = GetResultValue(ConvertPySequence(value, None, options)) - assert chunked.get().num_chunks() == 1 - array = chunked.get().chunk(0) scalar = GetResultValue(array.get().GetScalar(0)) return Scalar.wrap(scalar) From 677adb28ba9198d92728696b39695f1a366973ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 7 Sep 2020 14:06:22 +0200 Subject: [PATCH 22/80] address review comments --- cpp/src/arrow/python/python_test.cc | 7 +++---- cpp/src/arrow/python/python_to_arrow.cc | 3 --- cpp/src/arrow/util/converter.h | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index 348dbaec662..6ee33e037a5 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ 
-425,7 +425,7 @@ TEST_F(DecimalTest, TestNoneAndNaN) { ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); options.from_pandas = true; - auto arr = ConvertPySequence(list, nullptr, options).ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, options)) ASSERT_TRUE(arr->IsValid(0)); ASSERT_TRUE(arr->IsNull(1)); ASSERT_TRUE(arr->IsNull(2)); @@ -448,7 +448,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScale) { ASSERT_EQ(0, result); } - auto arr = ConvertPySequence(list, nullptr, {}).ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, {})) const auto& type = checked_cast(*arr->type()); int32_t expected_precision = 9; @@ -472,8 +472,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) { ASSERT_EQ(PyList_SetItem(list, 0, value1), 0); ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); - auto arr = ConvertPySequence(list, nullptr, {}).ValueOrDie(); - + ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, {})); const auto& type = checked_cast(*arr->type()); ASSERT_EQ(3, type.precision()); ASSERT_EQ(3, type.scale()); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index fe5ba6dfddd..0b5bc00f7c8 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -49,9 +49,6 @@ #include "arrow/python/type_traits.h" #include "arrow/visitor_inline.h" -// store PyBytesView.is_utf8 -// use util::optional for post conversion null sentinel checking - namespace arrow { using internal::checked_cast; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c741d7ec898..d581df2efce 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -53,7 +53,6 @@ class ArrayConverter { virtual Status Reserve(int64_t additional_capacity) = 0; virtual Status Append(InputType value) = 0; virtual Status AppendNull() = 0; - virtual Status Extend(Input seq, int64_t size) = 0; virtual Result> Finish() = 0; protected: From b4797ced22635f2a66d6c83762ed6c2c01557a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 7 Sep 2020 14:31:19 +0200 Subject: [PATCH 23/80] move to internal namespace --- cpp/src/arrow/python/python_to_arrow.cc | 10 ++++++++-- cpp/src/arrow/util/converter.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 0b5bc00f7c8..c4c78aa4b62 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -51,8 +51,14 @@ namespace arrow { +using internal::ArrayConverter; +using internal::ArrayConverterBuilder; using internal::checked_cast; using internal::checked_pointer_cast; +using internal::DictionaryArrayConverter; +using internal::ListArrayConverter; +using internal::PrimitiveArrayConverter; +using internal::StructArrayConverter; namespace py { @@ -356,7 +362,7 @@ class PyArrayConverter : public ArrayConverter { public: using ArrayConverter::ArrayConverter; - Status Extend(PyObject* values, int64_t size) override { + Status Extend(PyObject* values, int64_t size) { /// Ensure we've allocated enough space RETURN_NOT_OK(this->Reserve(size)); // Iterate over the items adding each one @@ -466,7 +472,7 @@ class PyPrimitiveArrayConverter> Result> Finish() override { ARROW_ASSIGN_OR_RAISE(auto array, - (TypedArrayConverter::Finish())); + (PrimitiveArrayConverter::Finish())); if (observed_binary_) { // If we saw any non-unicode, 
cast results to BinaryArray auto binary_type = TypeTraits::type_singleton(); diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index d581df2efce..1c812a8e2fc 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -30,6 +30,7 @@ #include "arrow/visitor_inline.h" namespace arrow { +namespace internal { using internal::checked_cast; using internal::checked_pointer_cast; @@ -277,4 +278,5 @@ struct ArrayConverterBuilder { std::shared_ptr* out; }; +} // namespace internal } // namespace arrow From 1a5ad12cbce0653da251fd777f63a364fff039d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 8 Sep 2020 23:04:33 +0200 Subject: [PATCH 24/80] simplify --- cpp/src/arrow/python/python_to_arrow.cc | 236 ++++++++++----------- cpp/src/arrow/util/converter.h | 262 ++++++++++++------------ 2 files changed, 247 insertions(+), 251 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index c4c78aa4b62..042eedebd78 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -51,14 +51,14 @@ namespace arrow { -using internal::ArrayConverter; -using internal::ArrayConverterBuilder; using internal::checked_cast; using internal::checked_pointer_cast; -using internal::DictionaryArrayConverter; -using internal::ListArrayConverter; -using internal::PrimitiveArrayConverter; -using internal::StructArrayConverter; + +using internal::Converter; +using internal::DictionaryConverter; +using internal::ListConverter; +using internal::PrimitiveConverter; +using internal::StructConverter; namespace py { @@ -75,15 +75,15 @@ class PyValue { } } - static bool IsNaT(const TimestampType&, int64_t value) { + static bool IsNaT(const TimestampType*, int64_t value) { return internal::npy_traits::isnull(value); } - static bool IsNaT(const DurationType&, int64_t value) { + static bool IsNaT(const DurationType*, int64_t value) { return internal::npy_traits::isnull(value); } - static Result Convert(const NullType&, const O&, I obj) { + static Result Convert(const NullType*, const O&, I obj) { if (obj == Py_None) { return nullptr; } else { @@ -91,7 +91,7 @@ class PyValue { } } - static Result Convert(const BooleanType&, const O&, I obj) { + static Result Convert(const BooleanType*, const O&, I obj) { if (obj == Py_True) { return true; } else if (obj == Py_False) { @@ -108,7 +108,7 @@ struct ValueConverter> { using ValueType = typename Type::c_type; template - static enable_if_integer> Convert(const T&, const O&, + static enable_if_integer> Convert(const T*, const O&, I obj) { typename T::c_type value; auto status = internal::CIntFromPython(obj, &value); @@ -121,13 +121,13 @@ struct ValueConverter> { } } - static Result Convert(const HalfFloatType&, const O&, I obj) { + static Result Convert(const HalfFloatType*, const O&, I obj) { uint16_t value; RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); return value; } - static Result Convert(const FloatType&, const O&, I obj) { + static Result Convert(const FloatType*, const O&, I obj) { float value; if (internal::PyFloatScalar_Check(obj)) { value = static_cast(PyFloat_AsDouble(obj)); @@ -140,7 +140,7 @@ struct ValueConverter> { return value; } - static Result Convert(const DoubleType&, const O&, I obj) { + static Result Convert(const DoubleType*, const O&, I obj) { double value; if (PyFloat_Check(obj)) { value = PyFloat_AS_DOUBLE(obj); @@ -156,13 +156,13 @@ struct ValueConverter> { return value; } - static Result 
Convert(const Decimal128Type& type, const O&, I obj) { + static Result Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; - RETURN_NOT_OK(internal::DecimalFromPyObject(obj, type, &value)); + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); return value; } - static Result Convert(const Date32Type&, const O&, I obj) { + static Result Convert(const Date32Type*, const O&, I obj) { int32_t value; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -174,7 +174,7 @@ struct ValueConverter> { return value; } - static Result Convert(const Date64Type&, const O&, I obj) { + static Result Convert(const Date64Type*, const O&, I obj) { int64_t value; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -192,11 +192,11 @@ struct ValueConverter> { return value; } - static Result Convert(const Time32Type& type, const O&, I obj) { + static Result Convert(const Time32Type* type, const O&, I obj) { int32_t value; if (PyTime_Check(obj)) { // TODO(kszucs): consider to raise if a timezone aware time object is encountered - switch (type.unit()) { + switch (type->unit()) { case TimeUnit::SECOND: value = static_cast(internal::PyTime_to_s(obj)); break; @@ -213,11 +213,11 @@ struct ValueConverter> { return value; } - static Result Convert(const Time64Type& type, const O&, I obj) { + static Result Convert(const Time64Type* type, const O&, I obj) { int64_t value; if (PyTime_Check(obj)) { // TODO(kszucs): consider to raise if a timezone aware time object is encountered - switch (type.unit()) { + switch (type->unit()) { case TimeUnit::MICRO: value = internal::PyTime_to_us(obj); break; @@ -234,7 +234,7 @@ struct ValueConverter> { return value; } - static Result Convert(const TimestampType& type, const O& options, I obj) { + static Result Convert(const TimestampType* type, const O& options, I obj) { int64_t value; if (PyDateTime_Check(obj)) { ARROW_ASSIGN_OR_RAISE(int64_t offset, internal::PyDateTime_utcoffset_s(obj)); @@ -242,7 +242,7 @@ struct ValueConverter> { offset = 0; } auto dt = reinterpret_cast(obj); - switch (type.unit()) { + switch (type->unit()) { case TimeUnit::SECOND: value = internal::PyDateTime_to_s(dt) - offset; break; @@ -270,7 +270,7 @@ struct ValueConverter> { // validate that the numpy scalar has np.datetime64 dtype std::shared_ptr numpy_type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); - if (!numpy_type->Equals(type)) { + if (!numpy_type->Equals(*type)) { // TODO(kszucs): the message should highlight the received numpy dtype // TODO(kszucs): it also validates the unit, so add the unit to the error message return Status::NotImplemented("Expected np.datetime64 but got: ", @@ -283,11 +283,11 @@ struct ValueConverter> { return value; } - static Result Convert(const DurationType& type, const O&, I obj) { + static Result Convert(const DurationType* type, const O&, I obj) { int64_t value; if (PyDelta_Check(obj)) { auto dt = reinterpret_cast(obj); - switch (type.unit()) { + switch (type->unit()) { case TimeUnit::SECOND: value = internal::PyDelta_to_s(dt); break; @@ -307,7 +307,7 @@ struct ValueConverter> { // validate that the numpy scalar has np.datetime64 dtype std::shared_ptr numpy_type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); - if (!numpy_type->Equals(type)) { + if (!numpy_type->Equals(*type)) { // TODO(kszucs): the message should highlight the received numpy dtype // TODO(kszucs): it also validates the unit, so add the unit to the error message return 
Status::NotImplemented("Expected np.timedelta64 but got: ", @@ -320,23 +320,23 @@ struct ValueConverter> { return value; } - static Result Convert(const BaseBinaryType&, const O&, I obj) { + static Result Convert(const BaseBinaryType*, const O&, I obj) { return PyBytesView::FromString(obj); } - static Result Convert(const FixedSizeBinaryType& type, const O&, I obj) { + static Result Convert(const FixedSizeBinaryType* type, const O&, I obj) { ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); - if (ARROW_PREDICT_TRUE(view.size == type.byte_width())) { + if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) { return std::move(view); } else { std::stringstream ss; - ss << "expected to be length " << type.byte_width() << " was " << view.size; + ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); } } template - static enable_if_string> Convert(const T& type, const O& options, + static enable_if_string> Convert(const T*, const O& options, I obj) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 and validate that @@ -353,14 +353,30 @@ struct ValueConverter> { } } - static Result Convert(const DataType& type, const O&, I obj) { + static Result Convert(const DataType* type, const O&, I obj) { return Status::NotImplemented("PyValue::Convert is not implemented for type ", type); } }; -class PyArrayConverter : public ArrayConverter { +template +class PyPrimitiveConverter; + +template +class PyDictionaryConverter; + +template +class PyListConverter; +class PyStructConverter; + +class PyConverter : public Converter { public: - using ArrayConverter::ArrayConverter; + template + using PrimitiveConverter = PyPrimitiveConverter; + template + using DictionaryConverter = PyDictionaryConverter; + template + using ListConverter = PyListConverter; + using StructConverter = PyStructConverter; Status Extend(PyObject* values, int64_t size) { /// Ensure we've allocated enough space @@ -388,91 +404,80 @@ class PyArrayConverter : public ArrayConverter { } }; -template -class PyPrimitiveArrayConverter : public PrimitiveArrayConverter {}; - template -class PyPrimitiveArrayConverter< +class PyPrimitiveConverter< T, enable_if_t::value || is_boolean_type::value || is_number_type::value || is_decimal_type::value || is_date_type::value || is_time_type::value>> - : public PrimitiveArrayConverter { + : public PrimitiveConverter { public: - using PrimitiveArrayConverter::PrimitiveArrayConverter; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->type_, this->options_, value)); - return this->builder_->Append(converted); + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); } } }; template -class PyPrimitiveArrayConverter< +class PyPrimitiveConverter< T, enable_if_t::value || is_duration_type::value>> - : public PrimitiveArrayConverter { + : public PrimitiveConverter { public: - using PrimitiveArrayConverter::PrimitiveArrayConverter; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->type_, this->options_, 
value)); - if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->type_, converted)) { - return this->builder_->AppendNull(); + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { + return this->primitive_builder_->AppendNull(); } else { - return this->builder_->Append(converted); + return this->primitive_builder_->Append(converted); } } } }; template -class PyPrimitiveArrayConverter> - : public PrimitiveArrayConverter { +class PyPrimitiveConverter> + : public PrimitiveConverter { public: - using PrimitiveArrayConverter::PrimitiveArrayConverter; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->type_, this->options_, value)); - return this->builder_->Append(util::string_view(view.bytes, view.size)); + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } } }; template -class PyPrimitiveArrayConverter> - : public PrimitiveArrayConverter { +class PyPrimitiveConverter> + : public PrimitiveConverter { public: - using PrimitiveArrayConverter::PrimitiveArrayConverter; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->type_, this->options_, value)); + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); if (!view.is_utf8) { // observed binary value observed_binary_ = true; } - return this->builder_->Append(util::string_view(view.bytes, view.size)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } } Result> Finish() override { - ARROW_ASSIGN_OR_RAISE(auto array, - (PrimitiveArrayConverter::Finish())); + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::Finish())); if (observed_binary_) { // If we saw any non-unicode, cast results to BinaryArray auto binary_type = TypeTraits::type_singleton(); @@ -486,40 +491,32 @@ class PyPrimitiveArrayConverter> bool observed_binary_ = false; }; -template -class PyDictionaryArrayConverter : public DictionaryArrayConverter { -}; - template -class PyDictionaryArrayConverter> - : public DictionaryArrayConverter { +class PyDictionaryConverter> + : public DictionaryConverter { public: - using DictionaryArrayConverter::DictionaryArrayConverter; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->value_builder_->AppendNull(); } else { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->value_type_, this->options_, value)); - return this->builder_->Append(converted); + return this->value_builder_->Append(converted); } } }; template -class PyDictionaryArrayConverter> - : public DictionaryArrayConverter { +class PyDictionaryConverter> + : public DictionaryConverter { public: - using DictionaryArrayConverter::DictionaryArrayConverter; - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->value_builder_->AppendNull(); } 
else { ARROW_ASSIGN_OR_RAISE(auto view, PyValue::Convert(this->value_type_, this->options_, value)); - return this->builder_->Append(util::string_view(view.bytes, view.size)); + return this->value_builder_->Append(util::string_view(view.bytes, view.size)); } } }; @@ -541,38 +538,36 @@ class PyDictionaryArrayConverter> } template -class PyListArrayConverter : public ListArrayConverter { +class PyListConverter : public ListConverter { public: - using ListArrayConverter::ListArrayConverter; - - Status ValidateSize(const FixedSizeListType& type, int64_t size) { + Status ValidateSize(const FixedSizeListType* type, int64_t size) { // TODO(kszucs): perhaps this should be handled somewhere else - if (type.list_size() != size) { - return Status::Invalid("Length of item not correct: expected ", type.list_size(), + if (type->list_size() != size) { + return Status::Invalid("Length of item not correct: expected ", type->list_size(), " but got array of size ", size); } else { return Status::OK(); } } - Status ValidateBuilder(const MapType&) { + Status ValidateBuilder(const MapType*) { // TODO(kszucs): perhaps this should be handled somewhere else - if (this->builder_->key_builder()->null_count() > 0) { + if (this->list_builder_->key_builder()->null_count() > 0) { return Status::Invalid("Invalid Map: key field can not contain null values"); } else { return Status::OK(); } } - Status ValidateBuilder(const DataType&) { return Status::OK(); } - Status ValidateSize(const BaseListType&, int64_t size) { return Status::OK(); } + Status ValidateBuilder(const BaseListType*) { return Status::OK(); } + Status ValidateSize(const BaseListType*, int64_t size) { return Status::OK(); } Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->list_builder_->AppendNull(); } - RETURN_NOT_OK(this->builder_->Append()); + RETURN_NOT_OK(this->list_builder_->Append()); if (PyArray_Check(value)) { RETURN_NOT_OK(AppendNdarray(value)); } else if (PySequence_Check(value)) { @@ -581,12 +576,12 @@ class PyListArrayConverter : public ListArrayConverter { return internal::InvalidType( value, "was not a sequence or recognized null for conversion to list type"); } - return ValidateBuilder(this->type_); + return ValidateBuilder(this->list_type_); } Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); - RETURN_NOT_OK(this->ValidateSize(this->type_, size)); + RETURN_NOT_OK(this->ValidateSize(this->list_type_, size)); return this->value_converter_->Extend(value, size); } @@ -596,7 +591,7 @@ class PyListArrayConverter : public ListArrayConverter { return Status::Invalid("Can only convert 1-dimensional array values"); } const int64_t size = PyArray_SIZE(ndarray); - RETURN_NOT_OK(this->ValidateSize(this->type_, size)); + RETURN_NOT_OK(this->ValidateSize(this->list_type_, size)); const auto value_type = this->value_converter_->builder()->type(); switch (value_type->id()) { @@ -671,20 +666,19 @@ class PyListArrayConverter : public ListArrayConverter { } }; -template -class PyStructArrayConverter : public StructArrayConverter { +class PyStructConverter : public StructConverter { public: - using StructArrayConverter::StructArrayConverter; - Status Init() override { + RETURN_NOT_OK(StructConverter::Init()); + // Store the field names as a PyObjects for dict matching - num_fields_ = this->type_.num_fields(); + num_fields_ = this->struct_type_->num_fields(); bytes_field_names_.reset(PyList_New(num_fields_)); 
unicode_field_names_.reset(PyList_New(num_fields_)); RETURN_IF_PYERROR(); for (int i = 0; i < num_fields_; i++) { - const auto& field_name = this->type_.field(i)->name(); + const auto& field_name = this->struct_type_->field(i)->name(); PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); PyObject* unicode = PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size()); @@ -713,10 +707,10 @@ class PyStructArrayConverter : public StructArrayConverter Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->builder_->AppendNull(); + return this->struct_builder_->AppendNull(); } RETURN_NOT_OK(InferInputKind(value)); - RETURN_NOT_OK(this->builder_->Append()); + RETURN_NOT_OK(this->struct_builder_->Append()); return (input_kind_ == InputKind::DICTS) ? AppendDict(value) : AppendTuple(value); } @@ -729,7 +723,7 @@ class PyStructArrayConverter : public StructArrayConverter } for (int i = 0; i < num_fields_; i++) { PyObject* value = PyTuple_GET_ITEM(tuple, i); - RETURN_NOT_OK(this->child_converters_[i]->Append(value)); + RETURN_NOT_OK(this->children_[i]->Append(value)); } return Status::OK(); } @@ -770,7 +764,7 @@ class PyStructArrayConverter : public StructArrayConverter } else { // If we come here, it means all keys are absent for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(this->child_converters_[i]->Append(Py_None)); + RETURN_NOT_OK(this->children_[i]->Append(Py_None)); } return Status::OK(); } @@ -784,7 +778,7 @@ class PyStructArrayConverter : public StructArrayConverter if (value == NULL) { RETURN_IF_PYERROR(); } - RETURN_NOT_OK(this->child_converters_[i]->Append(value ? value : Py_None)); + RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None)); } return Status::OK(); } @@ -805,12 +799,6 @@ class PyStructArrayConverter : public StructArrayConverter int num_fields_; }; -// TODO(kszucs): find a better name -using PyArrayConverterBuilder = - ArrayConverterBuilder; - // Convert *obj* to a sequence if necessary // Fill *size* to its length. If >= 0 on entry, *size* is an upper size // bound that may lead to truncation. 
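
[Illustrative sketch, not part of the applied diff: the PyArrayConverterBuilder alias removed above is superseded by the static PyConverter::Make factory used in the next hunk. Assuming only names introduced in this patch (PyConverter, PyConversionOptions, Extend, Finish) and eliding error propagation, the refactored conversion flow is driven roughly as follows.]

    // Hypothetical driver code; mirrors what ConvertPySequence does in the hunk below.
    PyConversionOptions options;
    options.from_pandas = true;  // treat pandas null sentinels (NaN/NaT) as nulls
    ARROW_ASSIGN_OR_RAISE(auto converter,
                          PyConverter::Make(int64(), default_memory_pool(), options));
    // Append every element of the Python sequence through the typed converter.
    RETURN_NOT_OK(converter->Extend(py_sequence, PySequence_Size(py_sequence)));
    // Finish() hands back the arrow::Array built by the underlying builder.
    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> array, converter->Finish());
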
@@ -888,7 +876,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* mask, DCHECK_GE(size, 0); ARROW_ASSIGN_OR_RAISE(auto converter, - PyArrayConverterBuilder::Make(real_type, options.pool, options)); + PyConverter::Make(real_type, options.pool, options)); // Convert values if (mask != nullptr && mask != Py_None) { diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 1c812a8e2fc..be173b9491a 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -35,131 +35,146 @@ namespace internal { using internal::checked_cast; using internal::checked_pointer_cast; -template -class ArrayConverter { +template +class PrimitiveConverter : public BaseConverter { public: - using InputType = Input; - using OptionsType = Options; - - ArrayConverter(const std::shared_ptr& type, - std::shared_ptr builder, Options options) - : sp_type_(type), sp_builder_(builder), options_(options) {} + using BuilderType = typename TypeTraits::BuilderType; - virtual ~ArrayConverter() = default; - const std::shared_ptr& builder() const { return sp_builder_; } - const std::shared_ptr& type() const { return sp_type_; } - Options options() const { return options_; } - - virtual Status Init() { return Status::OK(); } - virtual Status Reserve(int64_t additional_capacity) = 0; - virtual Status Append(InputType value) = 0; - virtual Status AppendNull() = 0; - virtual Result> Finish() = 0; + Status Init() override { + primitive_type_ = checked_cast(this->type_.get()); + primitive_builder_ = checked_cast(this->builder_.get()); + return Status::OK(); + } protected: - const std::shared_ptr sp_type_; - std::shared_ptr sp_builder_; - Options options_; + const T* primitive_type_; + BuilderType* primitive_builder_; + typename BaseConverter::OptionsType opts_; }; -template ::BuilderType> -class TypedArrayConverter : public BaseConverter { +template +class ListConverter : public BaseConverter { public: - TypedArrayConverter(const std::shared_ptr& type, - std::shared_ptr builder, - typename BaseConverter::OptionsType options) - : BaseConverter(type, builder, options), - type_(checked_cast(*type)), - builder_(checked_cast(builder.get())) {} - - Status Reserve(int64_t additional_capacity) override { - return this->builder_->Reserve(additional_capacity); - } - - Status AppendNull() override { return this->builder_->AppendNull(); } + using BuilderType = typename TypeTraits::BuilderType; - Result> Finish() override { return builder_->Finish(); }; + Status Init() override { + list_type_ = checked_cast(this->type_.get()); + list_builder_ = checked_cast(this->builder_.get()); + value_converter_ = this->children_[0]; + return Status::OK(); + } protected: - const T& type_; - BuilderType* builder_; -}; - -template -class PrimitiveArrayConverter : public TypedArrayConverter { - public: - using TypedArrayConverter::TypedArrayConverter; + const T* list_type_; + BuilderType* list_builder_; + std::shared_ptr value_converter_; }; -template -class DictionaryArrayConverter - : public TypedArrayConverter> { +template +class StructConverter : public BaseConverter { public: - DictionaryArrayConverter(const std::shared_ptr& type, - std::shared_ptr builder, - typename BaseConverter::OptionsType options) - : TypedArrayConverter>( - type, builder, options), - value_type_(checked_cast( - *checked_cast(*type).value_type())) {} + Status Init() override { + struct_type_ = checked_cast(this->type_.get()); + struct_builder_ = checked_cast(this->builder_.get()); + return Status::OK(); + } protected: - const T& 
value_type_; + const StructType* struct_type_; + StructBuilder* struct_builder_; }; -template -class ListArrayConverter : public TypedArrayConverter { +template +class DictionaryConverter : public BaseConverter { public: - ListArrayConverter(const std::shared_ptr& type, - std::shared_ptr builder, - std::shared_ptr value_converter, - typename BaseConverter::OptionsType options) - : TypedArrayConverter(type, builder, options), - value_converter_(std::move(value_converter)) {} + using BuilderType = DictionaryBuilder; + + Status Init() override { + dict_type_ = checked_cast(this->type_.get()); + value_type_ = checked_cast(dict_type_->value_type().get()); + value_builder_ = checked_cast(this->builder_.get()); + return Status::OK(); + } protected: - std::shared_ptr value_converter_; + const DictionaryType* dict_type_; + const U* value_type_; + BuilderType* value_builder_; }; -template -class StructArrayConverter : public TypedArrayConverter { +template +struct MakeConverterImpl; + +template +class Converter { public: - StructArrayConverter(const std::shared_ptr& type, - std::shared_ptr builder, - std::vector> child_converters, - typename BaseConverter::OptionsType options) - : TypedArrayConverter(type, builder, options), - child_converters_(std::move(child_converters)) {} + using InputType = Input; + using OptionsType = Options; + + template + using PrimitiveConverter = PrimitiveConverter; + template + using ListConverter = ListConverter; + template + using DictionaryConveter = DictionaryConverter; + using StructConverter = StructConverter; + + static Result> Make(std::shared_ptr type, + MemoryPool* pool, OptionsType options) { + std::shared_ptr out; + MakeConverterImpl visitor = {type, pool, options, &out}; + ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); + ARROW_RETURN_NOT_OK(out->Init()); + return out; + } + + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); }; + + virtual Status Append(InputType value) { + return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + virtual Status Reserve(int64_t additional_capacity) { + return builder_->Reserve(additional_capacity); + } + + virtual Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> Finish() { return builder_->Finish(); }; protected: - std::vector> child_converters_; + friend struct MakeConverterImpl; + + std::shared_ptr type_; + std::shared_ptr builder_; + std::vector> children_; + OptionsType options_; + OptionsType opts_; }; -#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - out->reset(new DictionaryConverter(type, std::move(builder), options)); \ +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + return Finish>( \ + std::move(builder), {}); \ break; -template class PrimitiveConverter, - template class DictionaryConverter, - template class ListConverter, - template class StructConverter> -struct ArrayConverterBuilder { - using Self = ArrayConverterBuilder; - +template +struct MakeConverterImpl { Status Visit(const NullType& t) { - // TODO: merge with the primitive c_type variant below, requires a NullType ctor which - // accepts a type instance using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = PrimitiveConverter; - static_assert(std::is_base_of::value, ""); + using ConverterType = 
typename Converter::template PrimitiveConverter; auto builder = std::make_shared(pool); - out->reset(new ConverterType(type, std::move(builder), options)); - return Status::OK(); + return Finish(std::move(builder), {}); } template @@ -168,46 +183,39 @@ struct ArrayConverterBuilder { Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = PrimitiveConverter; - static_assert(std::is_base_of::value, ""); + using ConverterType = typename Converter::template PrimitiveConverter; auto builder = std::make_shared(type, pool); - out->reset(new ConverterType(type, std::move(builder), options)); - return Status::OK(); + return Finish(std::move(builder), {}); } template enable_if_t::value && !std::is_same::value, Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = ListConverter; - static_assert(std::is_base_of::value, ""); + using ConverterType = typename Converter::template ListConverter; ARROW_ASSIGN_OR_RAISE(auto child_converter, - (Self::Make(t.value_type(), pool, options))); + Converter::Make(t.value_type(), pool, options)); auto builder = std::make_shared(pool, child_converter->builder(), type); - out->reset( - new ConverterType(type, std::move(builder), std::move(child_converter), options)); - return Status::OK(); + return Finish(std::move(builder), {std::move(child_converter)}); } Status Visit(const MapType& t) { - using ConverterType = ListConverter; - static_assert(std::is_base_of::value, ""); + using ConverterType = typename Converter::template ListConverter; // TODO(kszucs): seems like builders not respect field nullability std::vector> struct_fields{t.key_field(), t.item_field()}; auto struct_type = std::make_shared(struct_fields); - ARROW_ASSIGN_OR_RAISE(auto struct_converter, Self::Make(struct_type, pool, options)); + ARROW_ASSIGN_OR_RAISE(auto struct_converter, + Converter::Make(struct_type, pool, options)); auto struct_builder = struct_converter->builder(); auto key_builder = struct_builder->child_builder(0); auto item_builder = struct_builder->child_builder(1); auto builder = std::make_shared(pool, key_builder, item_builder, type); - out->reset(new ConverterType(type, std::move(builder), std::move(struct_converter), - options)); - return Status::OK(); + return Finish(std::move(builder), {std::move(struct_converter)}); } Status Visit(const DictionaryType& t) { @@ -236,19 +244,18 @@ struct ArrayConverterBuilder { return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(), " not implemented"); } - return Status::OK(); } Status Visit(const StructType& t) { - using ConverterType = StructConverter; - static_assert(std::is_base_of::value, ""); + using ConverterType = typename Converter::StructConverter; - std::shared_ptr child_converter; - std::vector> child_converters; + std::shared_ptr child_converter; + std::vector> child_converters; std::vector> child_builders; for (const auto& field : t.fields()) { - ARROW_ASSIGN_OR_RAISE(child_converter, Self::Make(field->type(), pool, options)); + ARROW_ASSIGN_OR_RAISE(child_converter, + Converter::Make(field->type(), pool, options)); // TODO: use move child_converters.push_back(child_converter); @@ -256,26 +263,27 @@ struct ArrayConverterBuilder { } auto builder = std::make_shared(type, pool, child_builders); - out->reset(new ConverterType(type, std::move(builder), std::move(child_converters), - options)); - return Status::OK(); + return Finish(std::move(builder), std::move(child_converters)); } Status Visit(const DataType& 
t) { return Status::NotImplemented(t.name()); } - static Result> Make(std::shared_ptr type, - MemoryPool* pool, Options options) { - std::shared_ptr out; - Self visitor = {type, pool, options, &out}; - ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); - ARROW_RETURN_NOT_OK(out->Init()); - return out; + template + Status Finish(std::shared_ptr builder, + std::vector> children) { + auto converter = new ConverterType(); + converter->type_ = std::move(type); + converter->builder_ = std::move(builder); + converter->options_ = options; + converter->children_ = std::move(children); + out->reset(converter); + return Status::OK(); } - const std::shared_ptr& type; + const std::shared_ptr type; MemoryPool* pool; - Options options; - std::shared_ptr* out; + typename Converter::OptionsType options; + std::shared_ptr* out; }; } // namespace internal From 9c2c5e5cd57d6da6a8c31e59a768fac65b1c86cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 8 Sep 2020 23:38:23 +0200 Subject: [PATCH 25/80] overloaded AppendValue --- cpp/src/arrow/python/python_to_arrow.cc | 106 ++++++++++++------------ 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 042eedebd78..960be40aa27 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -358,7 +358,7 @@ struct ValueConverter> { } }; -template +template class PyPrimitiveConverter; template @@ -366,6 +366,7 @@ class PyDictionaryConverter; template class PyListConverter; + class PyStructConverter; class PyConverter : public Converter { @@ -405,78 +406,81 @@ class PyConverter : public Converter -class PyPrimitiveConverter< - T, enable_if_t::value || is_boolean_type::value || - is_number_type::value || is_decimal_type::value || - is_date_type::value || is_time_type::value>> - : public PrimitiveConverter { +class PyPrimitiveConverter : public PrimitiveConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE( - auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(converted); + return AppendValue(this->primitive_type_, value); } } -}; -template -class PyPrimitiveConverter< - T, enable_if_t::value || is_duration_type::value>> - : public PrimitiveConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { + Status AppendValue(const DataType*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); + } + + Status AppendValue(const Decimal128Type*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); + } + + Status AppendValue(const TimestampType*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->primitive_type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE( - auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); - if (PyArray_CheckAnyScalarExact(value) && - 
PyValue::IsNaT(this->primitive_type_, converted)) { - return this->primitive_builder_->AppendNull(); - } else { - return this->primitive_builder_->Append(converted); - } + return this->primitive_builder_->Append(converted); } } -}; -template -class PyPrimitiveConverter> - : public PrimitiveConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { + Status AppendValue(const DurationType*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->primitive_type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { return this->primitive_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE( - auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + return this->primitive_builder_->Append(converted); } } -}; -template -class PyPrimitiveConverter> - : public PrimitiveConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->primitive_builder_->AppendNull(); - } else { - ARROW_ASSIGN_OR_RAISE( - auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - if (!view.is_utf8) { - // observed binary value - observed_binary_ = true; - } - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + Status AppendValue(const BaseBinaryType*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } + + Status AppendValue(const FixedSizeBinaryType*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } + + Status AppendValue(const StringType*, PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->primitive_type_, this->options_, value)); + if (!view.is_utf8) { + // observed binary value + observed_binary_ = true; } + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } Result> Finish() override { + return FinishInternal(this->primitive_type_); + } + + Result> FinishInternal(const DataType*) { + return PrimitiveConverter::Finish(); + } + + Result> FinishInternal(const StringType*) { ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::Finish())); if (observed_binary_) { // If we saw any non-unicode, cast results to BinaryArray From 9ece00682f070807f7192fa817942d9b355365c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 9 Sep 2020 00:05:25 +0200 Subject: [PATCH 26/80] method specialization --- cpp/src/arrow/python/python_to_arrow.cc | 84 ++++++++++--------------- 1 file changed, 33 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 960be40aa27..0b830eab651 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -361,7 +361,7 @@ struct ValueConverter> { template class PyPrimitiveConverter; -template +template class PyDictionaryConverter; template @@ -412,23 +412,24 @@ class PyPrimitiveConverter : public PrimitiveConverter { if (PyValue::IsNull(this->options_, value)) { return this->primitive_builder_->AppendNull(); } 
else { - return AppendValue(this->primitive_type_, value); + return AppendValue(value); } } - Status AppendValue(const DataType*, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(converted); - } - - Status AppendValue(const Decimal128Type*, PyObject* value) { + template + enable_if_t::value || is_boolean_type::value || + is_number_type::value || is_decimal_type::value || + is_date_type::value || is_time_type::value, + Status> + AppendValue(PyObject* value) { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); return this->primitive_builder_->Append(converted); } - Status AppendValue(const TimestampType*, PyObject* value) { + template + enable_if_t::value || is_duration_type::value, Status> + AppendValue(PyObject* value) { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); if (PyArray_CheckAnyScalarExact(value) && @@ -439,36 +440,21 @@ class PyPrimitiveConverter : public PrimitiveConverter { } } - Status AppendValue(const DurationType*, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->primitive_type_, this->options_, value)); - if (PyArray_CheckAnyScalarExact(value) && - PyValue::IsNaT(this->primitive_type_, converted)) { - return this->primitive_builder_->AppendNull(); - } else { - return this->primitive_builder_->Append(converted); - } - } - - Status AppendValue(const BaseBinaryType*, PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); - } - - Status AppendValue(const FixedSizeBinaryType*, PyObject* value) { + template + enable_if_string AppendValue(PyObject* value) { ARROW_ASSIGN_OR_RAISE(auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + if (!view.is_utf8) { + // observed binary value + observed_binary_ = true; + } return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } - Status AppendValue(const StringType*, PyObject* value) { + template + enable_if_binary AppendValue(PyObject* value) { ARROW_ASSIGN_OR_RAISE(auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - if (!view.is_utf8) { - // observed binary value - observed_binary_ = true; - } return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } @@ -496,32 +482,28 @@ class PyPrimitiveConverter : public PrimitiveConverter { }; template -class PyDictionaryConverter> - : public DictionaryConverter { +class PyDictionaryConverter : public DictionaryConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->value_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->value_builder_->Append(converted); + return AppendValue(value); } } -}; -template -class PyDictionaryConverter> - : public DictionaryConverter { - public: - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->value_builder_->AppendNull(); - } else { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->value_builder_->Append(util::string_view(view.bytes, view.size)); - } + template + enable_if_has_c_type 
AppendValue(PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(converted); + } + + template + enable_if_has_string_view AppendValue(PyObject* value) { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(util::string_view(view.bytes, view.size)); } }; From 53d83bf7d641b0e06fb9e9d41255e319a88a645f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 9 Sep 2020 11:24:46 +0200 Subject: [PATCH 27/80] specialize primitive converters --- cpp/src/arrow/python/python_to_arrow.cc | 105 +++++++++++++----------- cpp/src/arrow/util/converter.h | 1 - 2 files changed, 59 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 0b830eab651..8487ef210cc 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -358,7 +358,7 @@ struct ValueConverter> { } }; -template +template class PyPrimitiveConverter; template @@ -406,67 +406,78 @@ class PyConverter : public Converter -class PyPrimitiveConverter : public PrimitiveConverter { +class PyPrimitiveConverter< + T, enable_if_t::value || is_boolean_type::value || + is_number_type::value || is_decimal_type::value || + is_date_type::value || is_time_type::value>> + : public PrimitiveConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->primitive_builder_->AppendNull(); } else { - return AppendValue(value); + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); } } +}; - template - enable_if_t::value || is_boolean_type::value || - is_number_type::value || is_decimal_type::value || - is_date_type::value || is_time_type::value, - Status> - AppendValue(PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(converted); - } - - template - enable_if_t::value || is_duration_type::value, Status> - AppendValue(PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->primitive_type_, this->options_, value)); - if (PyArray_CheckAnyScalarExact(value) && - PyValue::IsNaT(this->primitive_type_, converted)) { +template +class PyPrimitiveConverter< + T, enable_if_t::value || is_duration_type::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { return this->primitive_builder_->AppendNull(); } else { - return this->primitive_builder_->Append(converted); + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { + return this->primitive_builder_->AppendNull(); + } else { + return this->primitive_builder_->Append(converted); + } } } +}; - template - enable_if_string AppendValue(PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->primitive_type_, this->options_, value)); - if (!view.is_utf8) { - // observed binary value - observed_binary_ = true; +template +class PyPrimitiveConverter> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if 
(PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } +}; - template - enable_if_binary AppendValue(PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); +template +class PyPrimitiveConverter> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); + if (!view.is_utf8) { + // observed binary value + observed_binary_ = true; + } + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + } } Result> Finish() override { - return FinishInternal(this->primitive_type_); - } - - Result> FinishInternal(const DataType*) { - return PrimitiveConverter::Finish(); - } - - Result> FinishInternal(const StringType*) { ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::Finish())); if (observed_binary_) { // If we saw any non-unicode, cast results to BinaryArray @@ -526,6 +537,8 @@ class PyDictionaryConverter : public DictionaryConverter { template class PyListConverter : public ListConverter { public: + Status ValidateSize(const BaseListType*, int64_t size) { return Status::OK(); } + Status ValidateSize(const FixedSizeListType* type, int64_t size) { // TODO(kszucs): perhaps this should be handled somewhere else if (type->list_size() != size) { @@ -536,6 +549,8 @@ class PyListConverter : public ListConverter { } } + Status ValidateBuilder(const BaseListType*) { return Status::OK(); } + Status ValidateBuilder(const MapType*) { // TODO(kszucs): perhaps this should be handled somewhere else if (this->list_builder_->key_builder()->null_count() > 0) { @@ -545,9 +560,6 @@ class PyListConverter : public ListConverter { } } - Status ValidateBuilder(const BaseListType*) { return Status::OK(); } - Status ValidateSize(const BaseListType*, int64_t size) { return Status::OK(); } - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->list_builder_->AppendNull(); @@ -562,6 +574,7 @@ class PyListConverter : public ListConverter { return internal::InvalidType( value, "was not a sequence or recognized null for conversion to list type"); } + return ValidateBuilder(this->list_type_); } diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index be173b9491a..a195d4196fe 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -49,7 +49,6 @@ class PrimitiveConverter : public BaseConverter { protected: const T* primitive_type_; BuilderType* primitive_builder_; - typename BaseConverter::OptionsType opts_; }; template From fb94b32fe3bef6237f36e094300847ee6475d81b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 9 Sep 2020 12:44:17 +0200 Subject: [PATCH 28/80] support list of pairs when converting to struct array; struct scalar is no longer a mapping; support StaticTzInfo --- cpp/src/arrow/python/datetime.cc | 9 + cpp/src/arrow/python/python_to_arrow.cc | 200 
+++++++++++++------ python/pyarrow/scalar.pxi | 26 +-- python/pyarrow/tests/strategies.py | 93 ++++++--- python/pyarrow/tests/test_convert_builtin.py | 83 ++++++-- python/pyarrow/tests/test_scalars.py | 17 ++ python/pyarrow/tests/test_types.py | 11 + 7 files changed, 310 insertions(+), 129 deletions(-) diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 4eeab7f5a69..07df5e76b45 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -419,6 +419,15 @@ Result TzinfoToString(PyObject* tzinfo) { return PyTZInfo_utcoffset_hhmm(tzinfo); } + // try to look up zone attribute + if (PyObject_HasAttrString(tzinfo, "zone")) { + OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone")); + RETURN_IF_PYERROR(); + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result)); + return result; + } + // attempt to call tzinfo.tzname(None) OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); RETURN_IF_PYERROR(); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 8487ef210cc..6fe8a620b83 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -361,7 +361,7 @@ struct ValueConverter> { template class PyPrimitiveConverter; -template +template class PyDictionaryConverter; template @@ -493,28 +493,32 @@ class PyPrimitiveConverter> }; template -class PyDictionaryConverter : public DictionaryConverter { +class PyDictionaryConverter> + : public DictionaryConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->value_builder_->AppendNull(); } else { - return AppendValue(value); + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(converted); } } +}; - template - enable_if_has_c_type AppendValue(PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto converted, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->value_builder_->Append(converted); - } - - template - enable_if_has_string_view AppendValue(PyObject* value) { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->value_builder_->Append(util::string_view(view.bytes, view.size)); +template +class PyDictionaryConverter> + : public DictionaryConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto view, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(util::string_view(view.bytes, view.size)); + } } }; @@ -690,16 +694,16 @@ class PyStructConverter : public StructConverter { Status InferInputKind(PyObject* value) { // Infer input object's type, note that heterogeneous sequences are not allowed - if (ARROW_PREDICT_FALSE(input_kind_ == InputKind::UNKNOWN)) { - if (PyDict_Check(value)) { - input_kind_ = InputKind::DICTS; - } else if (PyTuple_Check(value)) { - input_kind_ = InputKind::TUPLES; - } else { - return internal::InvalidType(value, - "was not a dict, tuple, or recognized null value " - "for conversion to struct type"); - } + if (PyDict_Check(value)) { + input_kind_ = InputKind::DICT; + } else if (PyTuple_Check(value)) { + input_kind_ = InputKind::TUPLE; + } else if (PySequence_Check(value)) { + input_kind_ = InputKind::ITEMS; + } else { 
+ return internal::InvalidType(value, + "was not a dict, tuple, or recognized null value " + "for conversion to struct type"); } return Status::OK(); } @@ -708,9 +712,27 @@ class PyStructConverter : public StructConverter { if (PyValue::IsNull(this->options_, value)) { return this->struct_builder_->AppendNull(); } - RETURN_NOT_OK(InferInputKind(value)); - RETURN_NOT_OK(this->struct_builder_->Append()); - return (input_kind_ == InputKind::DICTS) ? AppendDict(value) : AppendTuple(value); + switch (input_kind_) { + case InputKind::DICT: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendDict(value); + case InputKind::TUPLE: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendTuple(value); + case InputKind::ITEMS: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendItems(value); + default: + RETURN_NOT_OK(InferInputKind(value)); + return Append(value); + } + } + + Status AppendEmpty() { + for (int i = 0; i < num_fields_; i++) { + RETURN_NOT_OK(this->children_[i]->Append(Py_None)); + } + return Status::OK(); } Status AppendTuple(PyObject* tuple) { @@ -727,45 +749,55 @@ class PyStructConverter : public StructConverter { return Status::OK(); } - Status InferDictKeyKind(PyObject* dict) { - if (ARROW_PREDICT_FALSE(dict_key_kind_ == DictKeyKind::UNKNOWN)) { - for (int i = 0; i < num_fields_; i++) { - PyObject* name = PyList_GET_ITEM(unicode_field_names_.obj(), i); - PyObject* value = PyDict_GetItem(dict, name); - if (value != NULL) { - dict_key_kind_ = DictKeyKind::UNICODE; - return Status::OK(); - } + Status InferKeyKind(PyObject* items) { + // TODO: iterate over the items instead + for (int i = 0; i < num_fields_; i++) { + PyObject* tuple = PySequence_GetItem(items, i); + if (tuple == NULL) { RETURN_IF_PYERROR(); - // Unicode key not present, perhaps bytes key is? 
- name = PyList_GET_ITEM(bytes_field_names_.obj(), i); - value = PyDict_GetItem(dict, name); - if (value != NULL) { - dict_key_kind_ = DictKeyKind::BYTES; - return Status::OK(); - } + } + PyObject* key = PyTuple_GET_ITEM(tuple, 0); + if (key == NULL) { RETURN_IF_PYERROR(); } + // check equality with unicode field name + PyObject* name = PyList_GET_ITEM(unicode_field_names_.obj(), i); + bool are_equal = PyObject_RichCompareBool(key, name, Py_EQ); + RETURN_IF_PYERROR(); + if (are_equal) { + key_kind_ = KeyKind::UNICODE; + return Status::OK(); + } + // check equality with bytes field name + name = PyList_GET_ITEM(bytes_field_names_.obj(), i); + are_equal = PyObject_RichCompareBool(key, name, Py_EQ); + RETURN_IF_PYERROR(); + if (are_equal) { + key_kind_ = KeyKind::BYTES; + return Status::OK(); + } } return Status::OK(); + // return internal::Invalid(value, "was unable to infer key type"); } Status AppendDict(PyObject* dict) { if (!PyDict_Check(dict)) { return internal::InvalidType(dict, "was expecting a dict"); } - RETURN_NOT_OK(InferDictKeyKind(dict)); - - if (dict_key_kind_ == DictKeyKind::UNICODE) { - return AppendDict(dict, unicode_field_names_.obj()); - } else if (dict_key_kind_ == DictKeyKind::BYTES) { - return AppendDict(dict, bytes_field_names_.obj()); - } else { - // If we come here, it means all keys are absent - for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(this->children_[i]->Append(Py_None)); - } - return Status::OK(); + switch (key_kind_) { + case KeyKind::UNICODE: + return AppendDict(dict, unicode_field_names_.obj()); + case KeyKind::BYTES: + return AppendDict(dict, bytes_field_names_.obj()); + default: + RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict))); + if (key_kind_ == KeyKind::UNKNOWN) { + // was unable to infer the type which means that all keys are absent + return AppendEmpty(); + } else { + return AppendDict(dict); + } } } @@ -782,15 +814,57 @@ class PyStructConverter : public StructConverter { return Status::OK(); } + Status AppendItems(PyObject* items) { + if (!PySequence_Check(items)) { + return internal::InvalidType(items, "was expecting a sequence of key-value items"); + } + // if (PySequence_GET_SIZE(items) != num_fields_) { + // return Status::Invalid("Sequence size must be equal to number of struct fields"); + // } + switch (key_kind_) { + case KeyKind::UNICODE: + return AppendItems(items, unicode_field_names_.obj()); + case KeyKind::BYTES: + return AppendItems(items, bytes_field_names_.obj()); + default: + RETURN_NOT_OK(InferKeyKind(items)); + if (key_kind_ == KeyKind::UNKNOWN) { + // was unable to infer the type which means that all keys are absent + return AppendEmpty(); + } else { + return AppendItems(items); + } + } + } + + Status AppendItems(PyObject* items, PyObject* field_names) { + for (int i = 0; i < num_fields_; i++) { + PyObject* tuple = PySequence_GetItem(items, i); + if (tuple == NULL) { + RETURN_IF_PYERROR(); + } + PyObject* key = PyTuple_GET_ITEM(tuple, 0); + PyObject* value = PyTuple_GET_ITEM(tuple, 1); + if (key == NULL || value == NULL) { + RETURN_IF_PYERROR(); + } + PyObject* name = PyList_GET_ITEM(field_names, i); + bool are_equal = PyObject_RichCompareBool(key, name, Py_EQ); + RETURN_IF_PYERROR(); + if (are_equal) { + RETURN_NOT_OK(this->children_[i]->Append(value)); + } else { + return Status::Invalid("Key not equal to the expected field name"); + } + } + return Status::OK(); + } + protected: - // Whether we're converting from a sequence of dicts or tuples - enum class InputKind { UNKNOWN, DICTS, TUPLES } input_kind_ = 
InputKind::UNKNOWN; + // Whether we're converting from a sequence of dicts or tuples or list of pairs + enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN; // Whether the input dictionary keys' type is python bytes or unicode - enum class DictKeyKind { - UNKNOWN, - BYTES, - UNICODE - } dict_key_kind_ = DictKeyKind::UNKNOWN; + enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN; // Store the field names as a PyObjects for dict matching OwnedRef bytes_field_names_; OwnedRef unicode_field_names_; diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index be3a503425d..45836f900e8 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -16,9 +16,6 @@ # under the License. -import collections - - cdef class Scalar(_Weakrefable): """ The base class for scalars. @@ -580,7 +577,7 @@ cdef class LargeListScalar(ListScalar): pass -cdef class StructScalar(Scalar, collections.abc.Mapping): +cdef class StructScalar(Scalar): """ Concrete class for struct scalars. """ @@ -589,16 +586,6 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): cdef CStructScalar* sp = self.wrapped.get() return sp.value.size() - def __iter__(self): - cdef: - CStructScalar* sp = self.wrapped.get() - CStructType* dtype = sp.type.get() - vector[shared_ptr[CField]] fields = dtype.fields() - - if sp.is_valid: - for i in range(dtype.num_fields()): - yield frombytes(fields[i].get().name()) - def __contains__(self, key): try: self[key] @@ -639,12 +626,21 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): else: raise KeyError(key) + def __iter__(self): + cdef StructType type = self.type + if self.is_valid: + for i in range(len(self)): + yield (type.field(i).name, self[i]) + + def items(self): + return list(self) + def as_py(self): """ Return this value as a Python dict. """ if self.is_valid: - return {k: v.as_py() for k, v in self.items()} + return [(k, v.as_py()) for k, v in self] else: return None diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 088f29185bd..97e972d84d5 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+import datetime + import pytz import hypothesis as h import hypothesis.strategies as st @@ -82,10 +84,16 @@ unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=tzst.timezones() ) -duration_types = st.sampled_from([ - pa.duration(unit) for unit in ['s', 'ms', 'us', 'ns']]) +duration_types = st.builds( + pa.duration, + st.sampled_from(['s', 'ms', 'us', 'ns']) +) temporal_types = st.one_of( - date_types, time_types, timestamp_types, duration_types) + date_types, + time_types, + timestamp_types, + duration_types +) primitive_types = st.one_of( null_type, @@ -101,9 +109,16 @@ metadata = st.dictionaries(st.text(), st.text()) -def fields(type_strategy=primitive_types): - return st.builds(pa.field, name=custom_text, type=type_strategy, - nullable=st.booleans(), metadata=metadata) +@st.composite +def fields(draw, type_strategy=primitive_types): + name = draw(custom_text) + typ = draw(type_strategy) + if pa.types.is_null(typ): + nullable = True + else: + nullable = draw(st.booleans()) + meta = draw(metadata) + return pa.field(name, type=typ, nullable=nullable, metadata=meta) def list_types(item_strategy=primitive_types): @@ -152,8 +167,10 @@ def schemas(type_strategy=primitive_types, max_fields=None): @st.composite def arrays(draw, type, size=None): if isinstance(type, st.SearchStrategy): - type = draw(type) - elif not isinstance(type, pa.DataType): + ty = draw(type) + elif isinstance(type, pa.DataType): + ty = type + else: raise TypeError('Type must be a pyarrow DataType') if isinstance(size, st.SearchStrategy): @@ -165,57 +182,67 @@ def arrays(draw, type, size=None): shape = (size,) - if pa.types.is_list(type) or pa.types.is_large_list(type): + if pa.types.is_list(ty) or pa.types.is_large_list(ty): offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20 offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero - values = draw(arrays(type.value_type, size=int(offsets.sum()))) - array_type = ( - pa.LargeListArray if pa.types.is_large_list(type) - else pa.ListArray) + values = draw(arrays(ty.value_type, size=int(offsets.sum()))) + if pa.types.is_large_list(ty): + array_type = pa.LargeListArray + else: + array_type = pa.ListArray return array_type.from_arrays(offsets, values) - if pa.types.is_struct(type): - h.assume(len(type) > 0) + if pa.types.is_struct(ty): + h.assume(len(ty) > 0) fields, child_arrays = [], [] - for field in type: + for field in ty: fields.append(field) child_arrays.append(draw(arrays(field.type, size=size))) return pa.StructArray.from_arrays(child_arrays, fields=fields) - if (pa.types.is_boolean(type) or pa.types.is_integer(type) or - pa.types.is_floating(type)): - values = npst.arrays(type.to_pandas_dtype(), shape=(size,)) + if (pa.types.is_boolean(ty) or pa.types.is_integer(ty) or + pa.types.is_floating(ty)): + values = npst.arrays(ty.to_pandas_dtype(), shape=(size,)) np_arr = draw(values) - if pa.types.is_floating(type): + if pa.types.is_floating(ty): # Workaround ARROW-4952: no easy way to assert array equality # in a NaN-tolerant way. 
np_arr[np.isnan(np_arr)] = -42.0 - return pa.array(np_arr, type=type) + return pa.array(np_arr, type=ty) - if pa.types.is_null(type): + if pa.types.is_null(ty): value = st.none() - elif pa.types.is_time(type): + elif pa.types.is_time(ty): value = st.times() - elif pa.types.is_date(type): + elif pa.types.is_date(ty): value = st.dates() - elif pa.types.is_timestamp(type): - tz = pytz.timezone(type.tz) if type.tz is not None else None - value = st.datetimes(timezones=st.just(tz)) - elif pa.types.is_duration(type): + elif pa.types.is_timestamp(ty): + min_int64 = -(2**63) + max_int64 = 2**63 - 1 + min_datetime = datetime.datetime.fromtimestamp(min_int64 / 10**9) + max_datetime = datetime.datetime.fromtimestamp(max_int64 / 10**9) + try: + offset_hours = int(ty.tz) + tz = pytz.FixedOffset(offset_hours * 60) + except ValueError: + tz = pytz.timezone(ty.tz) + value = st.datetimes(timezones=st.just(tz), min_value=min_datetime, + max_value=max_datetime) + elif pa.types.is_duration(ty): value = st.timedeltas() - elif pa.types.is_binary(type) or pa.types.is_large_binary(type): + elif pa.types.is_binary(ty) or pa.types.is_large_binary(ty): value = st.binary() - elif pa.types.is_string(type) or pa.types.is_large_string(type): + elif pa.types.is_string(ty) or pa.types.is_large_string(ty): value = st.text() - elif pa.types.is_decimal(type): + elif pa.types.is_decimal(ty): # TODO(kszucs): properly limit the precision # value = st.decimals(places=type.scale, allow_infinity=False) h.reject() else: - raise NotImplementedError(type) + raise NotImplementedError(ty) values = st.lists(value, min_size=size, max_size=size) - return pa.array(draw(values), type=type) + return pa.array(draw(values), type=ty) @st.composite diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index af37ba5bf78..789fa042398 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. 
-import pytest - -from pyarrow.pandas_compat import _pandas_api # noqa -import pyarrow as pa - import collections import datetime import decimal import itertools import math +import hypothesis as h import numpy as np import pytz +import pytest + +from pyarrow.pandas_compat import _pandas_api # noqa +import pyarrow as pa +import pyarrow.tests.strategies as past int_type_pairs = [ @@ -884,6 +885,7 @@ def test_sequence_timestamp(): 46, 57, 437699) +# TODO(kszucs): test pytz.StaticTzInfo like pytz.timezone('Etc/GMT+1') @pytest.mark.parametrize('timezone', [ None, 'UTC', @@ -1410,6 +1412,10 @@ def test_empty_range(): assert arr.to_pylist() == [] +def _as_pairs(expected): + return [None if i is None else list(i.items()) for i in expected] + + def test_structarray(): arr = pa.StructArray.from_arrays([], names=[]) assert arr.type == pa.struct([]) @@ -1430,7 +1436,7 @@ def test_structarray(): ] pylist = arr.to_pylist() - assert pylist == expected, (pylist, expected) + assert pylist == _as_pairs(expected) # len(names) != len(arrays) with pytest.raises(ValueError): @@ -1447,7 +1453,7 @@ def test_struct_from_dicts(): data = [{'a': 5, 'b': 'foo', 'c': True}, {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data, type=ty) - assert arr.to_pylist() == data + assert arr.to_pylist() == _as_pairs(data) # With omitted values data = [{'a': 5, 'c': True}, @@ -1459,7 +1465,7 @@ def test_struct_from_dicts(): None, {'a': None, 'b': None, 'c': None}, {'a': None, 'b': 'bar', 'c': None}] - assert arr.to_pylist() == expected + assert arr.to_pylist() == _as_pairs(expected) def test_struct_from_dicts_bytes_keys(): @@ -1473,10 +1479,11 @@ def test_struct_from_dicts_bytes_keys(): data = [{b'a': 5, b'b': 'foo'}, {b'a': 6, b'c': False}] arr = pa.array(data, type=ty) - assert arr.to_pylist() == [ + expected = [ {'a': 5, 'b': 'foo', 'c': None}, {'a': 6, 'b': None, 'c': False}, ] + assert arr.to_pylist() == _as_pairs(expected) def test_struct_from_tuples(): @@ -1493,7 +1500,7 @@ def test_struct_from_tuples(): data_as_ndarray = np.empty(len(data), dtype=object) data_as_ndarray[:] = data arr2 = pa.array(data_as_ndarray, type=ty) - assert arr.to_pylist() == expected + assert arr.to_pylist() == _as_pairs(expected) assert arr.equals(arr2) @@ -1505,7 +1512,7 @@ def test_struct_from_tuples(): None, {'a': 6, 'b': None, 'c': False}] arr = pa.array(data, type=ty) - assert arr.to_pylist() == expected + assert arr.to_pylist() == _as_pairs(expected) # Invalid tuple size for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]: @@ -1513,6 +1520,23 @@ def test_struct_from_tuples(): pa.array([tup], type=ty) +# TODO(kszucs): test duplicated field name +# TODO(kszucs): test with empty elements +# TODO(kszucs): test with None elements +# TODO(kszucs): test with empty element at the first position because of +# inference +def test_struct_from_list_of_pairs(): + ty = pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_())]) + data = _as_pairs([ + {'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False} + ]) + arr = pa.array(data, type=ty) + assert arr.to_pylist() == data + + def test_struct_from_mixed_sequence(): # It is forbidden to mix dicts and tuples when initializing a struct array ty = pa.struct([pa.field('a', pa.int32()), @@ -1532,7 +1556,7 @@ def test_struct_from_dicts_inference(): {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data) check_struct_type(arr.type, expected_type) - assert arr.to_pylist() == data + assert arr.to_pylist() == _as_pairs(data) # With omitted values data = [{'a': 5, 
'c': True}, @@ -1549,7 +1573,7 @@ def test_struct_from_dicts_inference(): arr2 = pa.array(data) check_struct_type(arr.type, expected_type) - assert arr.to_pylist() == expected + assert arr.to_pylist() == _as_pairs(expected) assert arr.equals(arr2) # Nested @@ -1561,12 +1585,27 @@ def test_struct_from_dicts_inference(): {'a': {'aa': None, 'ab': False}, 'b': None}, {'a': None, 'b': 'bar'}] arr = pa.array(data) - assert arr.to_pylist() == data + + expected = [ + [ + ('a', [('aa', [5, 6]), ('ab', True)]), + ('b', 'foo') + ], + [ + ('a', [('aa', None), ('ab', False)]), + ('b', None) + ], + [ + ('a', None), + ('b', 'bar') + ] + ] + assert arr.to_pylist() == expected # Edge cases arr = pa.array([{}]) assert arr.type == pa.struct([]) - assert arr.to_pylist() == [{}] + assert arr.to_pylist() == [[]] # Mixing structs and scalars is rejected with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)): @@ -1632,10 +1671,11 @@ def test_map_from_dicts(): assert arr.to_pylist() == expected + # FIXME(kszucs): fix the raised exception's type # Invalid dictionary - for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]: - with pytest.raises(ValueError, match="Invalid Map"): - pa.array([entry], type=pa.map_('i4', 'i4')) + # for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]: + # with pytest.raises(ValueError, match="Invalid Map"): + # pa.array([entry], type=pa.map_('i4', 'i4')) # Invalid dictionary types for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]: @@ -1755,3 +1795,10 @@ def test_dictionary_from_strings(): expected_dictionary = pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3)) assert a.indices.equals(expected_indices) assert a.dictionary.equals(expected_dictionary) + + +@h.given(past.all_arrays) +def test_pina(arr): + seq = arr.to_pylist() + restored = pa.array(seq, type=arr.type) + assert restored.equals(arr) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index fa48ad8b5f2..65ab7e8d35a 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -496,6 +496,23 @@ def test_struct(): assert s['y'].as_py() is None +def test_struct_duplicate_field_name(): + fields = [ + pa.field('x', pa.int64()), + pa.field('x', pa.string()) + ] + ty = pa.struct(fields) + + arr = pa.StructArray.from_arrays([ + pa.array([1, 2, 3]), + pa.array(["a", "b", "c"]) + ], fields=fields) + + assert arr[0] == pa.scalar([('x', 1), ('x', 'a')], type=ty) + assert arr[1] == pa.scalar([('x', 2), ('x', 'b')], type=ty) + assert arr[2] == pa.scalar([('x', 3), ('x', 'c')], type=ty) + + def test_map(): ty = pa.map_(pa.string(), pa.int8()) v = [('a', 1), ('b', 2)] diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index c52751e91ac..345f469be1e 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -25,6 +25,7 @@ import pytz import hypothesis as h import hypothesis.strategies as st +import hypothesis.extra.pytz as tzst import weakref import numpy as np @@ -258,6 +259,9 @@ def test_is_primitive(): @pytest.mark.parametrize(('tz', 'expected'), [ (pytz.utc, 'UTC'), (pytz.timezone('Europe/Paris'), 'Europe/Paris'), + # StaticTzInfo.tzname returns with '-09' so we need to infer the timezone's + # name from the tzinfo.zone attribute + (pytz.timezone('Etc/GMT-9'), 'Etc/GMT-9'), (pytz.FixedOffset(180), '+03:00'), (datetime.timezone.utc, '+00:00'), (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') @@ -280,6 +284,13 @@ def test_tzinfo_to_string_errors(): 
pa.lib.tzinfo_to_string(tz) +@h.given(tzst.timezones()) +def test_timezone_roundtrip(tz): + timezone_string = pa.lib.tzinfo_to_string(tz) + timezone_tzinfo = pa.lib.string_to_tzinfo(timezone_string) + assert timezone_tzinfo == tz + + def test_convert_custom_tzinfo_objects_to_string(): class CorrectTimezone1(datetime.tzinfo): """ From 748836b441d8d3af44e1e3b63dcd40772782ab14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 9 Sep 2020 16:52:44 +0200 Subject: [PATCH 29/80] auto chunking --- cpp/src/arrow/python/numpy_to_arrow.cc | 7 +-- cpp/src/arrow/python/python_test.cc | 5 +- cpp/src/arrow/python/python_to_arrow.cc | 16 +++--- cpp/src/arrow/python/python_to_arrow.h | 4 +- cpp/src/arrow/util/converter.h | 66 ++++++++++++++++++++++++- python/pyarrow/array.pxi | 9 ++-- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/scalar.pxi | 8 ++- 8 files changed, 97 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 320937142c2..08359100605 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -315,9 +315,10 @@ Status NumPyConverter::Convert() { py_options.type = type_; py_options.from_pandas = from_pandas_; ARROW_ASSIGN_OR_RAISE( - auto result, ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options)); - out_arrays_.push_back(result); + auto chunked_array, + ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options)); + out_arrays_ = chunked_array->chunks(); return Status::OK(); } diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index 6ee33e037a5..80bda384bde 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -425,7 +425,10 @@ TEST_F(DecimalTest, TestNoneAndNaN) { ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); options.from_pandas = true; - ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, options)) + ASSERT_OK_AND_ASSIGN(auto chunked, ConvertPySequence(list, nullptr, options)); + ASSERT_EQ(chunked->num_chunks(), 1); + + auto arr = chunked->chunk(0); ASSERT_TRUE(arr->IsValid(0)); ASSERT_TRUE(arr->IsNull(1)); ASSERT_TRUE(arr->IsNull(2)); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 6fe8a620b83..a08caa12d6e 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -54,6 +54,7 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; +using internal::Chunker; using internal::Converter; using internal::DictionaryConverter; using internal::ListConverter; @@ -477,8 +478,8 @@ class PyPrimitiveConverter> } } - Result> Finish() override { - ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::Finish())); + Result> ToArray() override { + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); if (observed_binary_) { // If we saw any non-unicode, cast results to BinaryArray auto binary_type = TypeTraits::type_singleton(); @@ -915,8 +916,8 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } -Result> ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& opts) { +Result> ConvertPySequence(PyObject* obj, PyObject* mask, + const PyConversionOptions& opts) { PyAcquireGIL lock; PyObject* seq; @@ -950,14 +951,15 @@ Result> ConvertPySequence(PyObject* obj, PyObject* mask, 
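+  // Build a converter for the resolved type, wrap it in a Chunker so that a
+  // CapacityError from any builder starts a new chunk instead of failing, then
+  // feed it the sequence (with the optional mask) and collect the chunks.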
ARROW_ASSIGN_OR_RAISE(auto converter, PyConverter::Make(real_type, options.pool, options)); + ARROW_ASSIGN_OR_RAISE(auto chunked_converter, Chunker::Make(converter)); // Convert values if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); + RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); } else { - RETURN_NOT_OK(converter->Extend(seq, size)); + RETURN_NOT_OK(chunked_converter->Extend(seq, size)); } - return converter->Finish(); + return chunked_converter->ToChunkedArray(); } } // namespace py diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index bfe7d7f1767..094635859f6 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -73,8 +73,8 @@ struct PyConversionOptions { /// \param[in] options various conversion options /// \return Result Array ARROW_PYTHON_EXPORT -Result> ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& options); +Result> ConvertPySequence( + PyObject* obj, PyObject* mask, const PyConversionOptions& options); } // namespace py diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index a195d4196fe..ad76b374caa 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -148,7 +148,13 @@ class Converter { virtual Status AppendNull() { return builder_->AppendNull(); } - virtual Result> Finish() { return builder_->Finish(); }; + virtual Result> ToArray() { return builder_->Finish(); }; + + virtual Result> ToArray(int64_t length) { + // RETURN_NOT_OK(builder_->Resize(length)); + ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); + return arr->Slice(0, length); + } protected: friend struct MakeConverterImpl; @@ -157,7 +163,6 @@ class Converter { std::shared_ptr builder_; std::vector> children_; OptionsType options_; - OptionsType opts_; }; #define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ @@ -285,5 +290,62 @@ struct MakeConverterImpl { std::shared_ptr* out; }; +template +class Chunker : public BaseConverter { + public: + using Self = Chunker; + using InputType = typename BaseConverter::InputType; + + static Result> Make(std::shared_ptr converter) { + auto result = std::make_shared(); + result->type_ = converter->type(); + result->builder_ = converter->builder(); + // result->options_ = converter->options_; + // result->children_ = converter->children_; + result->converter_ = std::move(converter); + return result; + } + + Status AppendNull() override { + auto status = converter_->AppendNull(); + if (status.ok()) { + length_ = this->builder_->length(); + } else if (status.IsCapacityError()) { + RETURN_NOT_OK(FinishChunk()); + return converter_->AppendNull(); + } + return status; + } + + Status Append(InputType value) override { + auto status = converter_->Append(value); + if (status.ok()) { + length_ = this->builder_->length(); + } else if (status.IsCapacityError()) { + RETURN_NOT_OK(FinishChunk()); + return converter_->Append(value); + } + return status; + } + + Status FinishChunk() { + ARROW_ASSIGN_OR_RAISE(auto chunk, this->ToArray(length_)); + this->builder_->Reset(); + length_ = 0; + chunks_.push_back(chunk); + return Status::OK(); + } + + Result> ToChunkedArray() { + RETURN_NOT_OK(FinishChunk()); + return std::make_shared(chunks_); + } + + protected: + int64_t length_ = 0; + std::shared_ptr converter_; + std::vector> chunks_; +}; + } // namespace internal } // namespace arrow diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 
679e7ff3c88..3ba2c3236d3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -24,7 +24,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, cdef: int64_t c_size PyConversionOptions options - shared_ptr[CArray] result + shared_ptr[CChunkedArray] chunked if type is not None: options.type = type.sp_type @@ -39,9 +39,12 @@ cdef _sequence_to_array(object sequence, object mask, object size, cdef shared_ptr[CChunkedArray] out with nogil: - result = GetResultValue(ConvertPySequence(sequence, mask, options)) + chunked = GetResultValue(ConvertPySequence(sequence, mask, options)) - return pyarrow_wrap_array(result) + if chunked.get().num_chunks() == 1: + return pyarrow_wrap_array(chunked.get().chunk(0)) + else: + return pyarrow_wrap_chunked_array(chunked) cdef inline _is_array_like(obj): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 97b63c7ac6e..33016094530 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1769,7 +1769,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: # TODO Some functions below are not actually "nogil" - CResult[shared_ptr[CArray]] ConvertPySequence( + CResult[shared_ptr[CChunkedArray]] ConvertPySequence( object obj, object mask, const PyConversionOptions& options) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 45836f900e8..1b6ba9fdc11 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -869,6 +869,7 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): PyConversionOptions options shared_ptr[CScalar] scalar shared_ptr[CArray] array + shared_ptr[CChunkedArray] chunked bint is_pandas_object = False type = ensure_type(type, allow_none=True) @@ -890,7 +891,12 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): value = [value] with nogil: - array = GetResultValue(ConvertPySequence(value, None, options)) + chunked = GetResultValue(ConvertPySequence(value, None, options)) + # get the first chunk + assert chunked.get().num_chunks() == 1 + array = chunked.get().chunk(0) + + # retrieve the scalar from the first position scalar = GetResultValue(array.get().GetScalar(0)) return Scalar.wrap(scalar) From c086364982934566d6ab6f3a9e8d889572d78e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 13:54:09 +0200 Subject: [PATCH 30/80] working chunk on capacity overflow --- cpp/src/arrow/array/builder_binary.h | 21 +++++++ cpp/src/arrow/python/python_to_arrow.cc | 11 +++- cpp/src/arrow/util/converter.h | 6 +- python/pyarrow/scalar.pxi | 15 +++-- python/pyarrow/tests/test_array.py | 13 +++-- python/pyarrow/tests/test_convert_builtin.py | 58 +++++++++++++++++++- python/pyarrow/tests/test_pandas.py | 30 +++++++--- python/pyarrow/tests/test_parquet.py | 2 + python/pyarrow/tests/test_scalars.py | 18 ++++-- 9 files changed, 146 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 593b533a19c..c2f99e62b8d 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -76,6 +76,16 @@ class BaseBinaryBuilder : public ArrayBuilder { return Append(value.data(), static_cast(value.size())); } + Status AppendSafe(util::string_view value) { + auto size = static_cast(value.size()); + auto num_bytes = value_data_builder_.length() + size; + if 
(ARROW_PREDICT_TRUE(num_bytes <= memory_limit())) { + return Append(value.data(), size); + } else { + return AppendOverflow(num_bytes); + } + } + Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { @@ -422,6 +432,17 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendSafe(util::string_view value) { + auto num_bytes = byte_builder_.length() + byte_width_; + if (ARROW_PREDICT_TRUE(num_bytes <= memory_limit())) { + return Append(value.data()); + } else { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", num_bytes); + ; + } + } + Status Append(const std::string& s) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(s); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a08caa12d6e..933d6c81514 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -455,7 +455,8 @@ class PyPrimitiveConverter> } else { ARROW_ASSIGN_OR_RAISE( auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + return this->primitive_builder_->AppendSafe( + util::string_view(view.bytes, view.size)); } } }; @@ -474,7 +475,9 @@ class PyPrimitiveConverter> // observed binary value observed_binary_ = true; } - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + // TODO(kszucs): add CheckOverflow method to binary builders and call it here + return this->primitive_builder_->AppendSafe( + util::string_view(view.bytes, view.size)); } } @@ -503,6 +506,7 @@ class PyDictionaryConverter> } else { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->value_type_, this->options_, value)); + // TODO(kszucs): use AppendSafe with checking memory limit BEFORE actual append return this->value_builder_->Append(converted); } } @@ -518,6 +522,7 @@ class PyDictionaryConverter> } else { ARROW_ASSIGN_OR_RAISE(auto view, PyValue::Convert(this->value_type_, this->options_, value)); + // TODO(kszucs): use AppendSafe with checking memory limit BEFORE actual append return this->value_builder_->Append(util::string_view(view.bytes, view.size)); } } @@ -570,6 +575,8 @@ class PyListConverter : public ListConverter { return this->list_builder_->AppendNull(); } + // TODO(kszucs): raise CapacityError if it wouldn't fit in the list builder's + // memory limit RETURN_NOT_OK(this->list_builder_->Append()); if (PyArray_Check(value)) { RETURN_NOT_OK(AppendNdarray(value)); diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index ad76b374caa..bcfaa1530d4 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -151,7 +151,6 @@ class Converter { virtual Result> ToArray() { return builder_->Finish(); }; virtual Result> ToArray(int64_t length) { - // RETURN_NOT_OK(builder_->Resize(length)); ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); return arr->Slice(0, length); } @@ -290,6 +289,7 @@ struct MakeConverterImpl { std::shared_ptr* out; }; +// TODO(kszucs): rename to AutoChunker template class Chunker : public BaseConverter { public: @@ -323,13 +323,13 @@ class Chunker : public BaseConverter { length_ = this->builder_->length(); } else if (status.IsCapacityError()) { RETURN_NOT_OK(FinishChunk()); - return converter_->Append(value); + return Append(value); } return status; } 
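+  // Seal everything appended so far into a chunk: slice the builder output to
+  // the last successfully appended length, reset the builder, and stash the
+  // chunk until ToChunkedArray() collects them.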
Status FinishChunk() { - ARROW_ASSIGN_OR_RAISE(auto chunk, this->ToArray(length_)); + ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_)); this->builder_->Reset(); length_ = 0; chunks_.push_back(chunk); diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 1b6ba9fdc11..7e119665a41 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -627,20 +627,25 @@ cdef class StructScalar(Scalar): raise KeyError(key) def __iter__(self): - cdef StructType type = self.type if self.is_valid: - for i in range(len(self)): - yield (type.field(i).name, self[i]) + for field in self.type: + yield field.name - def items(self): + def keys(self): return list(self) + def values(self): + return [self[key] for key in self] + + def items(self): + return [(key, self[key]) for key in self] + def as_py(self): """ Return this value as a Python dict. """ if self.is_valid: - return [(k, v.as_py()) for k, v in self] + return [(key, self[key].as_py()) for key in self] else: return None diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index feb84dd1d4c..ed510ae4200 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -482,8 +482,8 @@ def test_struct_array_slice(): ty = pa.struct([pa.field('a', pa.int8()), pa.field('b', pa.float32())]) arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) - assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5}, - {'a': 5, 'b': 6.5}] + assert arr[1:].to_pylist() == [[('a', 3), ('b', 4.5)], + [('a', 5), ('b', 6.5)]] def test_array_factory_invalid_type(): @@ -615,10 +615,15 @@ def test_struct_from_buffers(): children=children[:1]) +def _as_pairs(expected): + return [None if i is None else list(i.items()) for i in expected] + + def test_struct_from_arrays(): a = pa.array([4, 5, 6], type=pa.int64()) b = pa.array(["bar", None, ""]) c = pa.array([[1, 2], None, [3, None]]) + expected_list = [ {'a': 4, 'b': 'bar', 'c': [1, 2]}, {'a': 5, 'b': None, 'c': None}, @@ -629,7 +634,7 @@ def test_struct_from_arrays(): arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"]) assert arr.type == pa.struct( [("a", a.type), ("b", b.type), ("c", c.type)]) - assert arr.to_pylist() == expected_list + assert arr.to_pylist() == _as_pairs(expected_list) with pytest.raises(ValueError): pa.StructArray.from_arrays([a, b, c], ["a", "b"]) @@ -645,7 +650,7 @@ def test_struct_from_arrays(): arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc]) assert arr.type == pa.struct([fa, fb, fc]) assert not arr.type[0].nullable - assert arr.to_pylist() == expected_list + assert arr.to_pylist() == _as_pairs(expected_list) with pytest.raises(ValueError): pa.StructArray.from_arrays([a, b, c], fields=[fa, fb]) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 789fa042398..e91fcf2d13e 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1798,7 +1798,63 @@ def test_dictionary_from_strings(): @h.given(past.all_arrays) -def test_pina(arr): +def test_array_to_pylist_roundtrip(arr): seq = arr.to_pylist() restored = pa.array(seq, type=arr.type) assert restored.equals(arr) + + +@pytest.mark.large_memory +def test_auto_chunking(): + v1 = b'x' * 100000000 + v2 = b'x' * 147483646 + data = [v1] * 20 + [v2] + arr = pa.array(data, type=pa.binary()) + assert isinstance(arr, pa.Array) + + data += ['x'] * 1 + arr = pa.array(data, type=pa.binary()) + assert isinstance(arr, pa.ChunkedArray) + assert 
len(arr.chunk(0)) == 21 + assert len(arr.chunk(1)) == 1 + assert arr.chunk(1).to_pylist() == [b'x'] + + +@pytest.mark.large_memory +def test_nested_auto_chunking(): + v1 = b'x' * 100000000 + v2 = b'x' * 147483646 + + ty = pa.struct([ + pa.field('bool', pa.bool_()), + pa.field('integer', pa.int64()), + pa.field('binary', pa.binary()), + ]) + + data = [{'bool': True, 'integer': 1, 'binary': v1}] * 20 + data.append({'bool': True, 'integer': 1, 'binary': v2}) + arr = pa.array(data, type=ty) + assert isinstance(arr, pa.Array) + + data.append({'bool': True, 'integer': 1, 'binary': b'x'}) + arr = pa.array(data, type=ty) + assert isinstance(arr, pa.ChunkedArray) + assert len(arr.chunk(0)) == 21 + assert len(arr.chunk(1)) == 1 + + assert arr.chunk(1)[0].as_py() == [ + ('bool', True), + ('integer', 1), + ('binary', b'x') + ] + + +# FIXME(kszucs) +@pytest.mark.skip +def test_dictionary_conversion(): + data = [ + {"page_type": 1}, + {"record_type": 1}, + {"non_consecutive_home": 0}, + ] + pa.array(data, type=None, from_pandas=True, safe=True) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 03407521c12..d381e97874a 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2135,12 +2135,17 @@ def test_from_numpy(self): data = np.array([(42, True), (43, False)], dtype=dt) arr = pa.array(data, type=ty) - assert arr.to_pylist() == [{'x': 42, 'y': True}, - {'x': 43, 'y': False}] + assert arr.to_pylist() == [ + [('x', 42), ('y', True)], + [('x', 43), ('y', False)] + ] # With mask arr = pa.array(data, mask=np.bool_([False, True]), type=ty) - assert arr.to_pylist() == [{'x': 42, 'y': True}, None] + assert arr.to_pylist() == [ + [('x', 42), ('y', True)], + None + ] # Trivial struct type dt = np.dtype([]) @@ -2152,7 +2157,7 @@ def test_from_numpy(self): data = np.array([(), ()], dtype=dt) arr = pa.array(data, type=ty) - assert arr.to_pylist() == [{}, {}] + assert arr.to_pylist() == [[], []] def test_from_numpy_nested(self): # Note: an object field inside a struct @@ -2175,9 +2180,20 @@ def test_from_numpy_nested(self): ((1, True), 2, 'foo'), ((3, False), 4, 'bar')], dtype=dt) arr = pa.array(data, type=ty) - assert arr.to_pylist() == [ - {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'}, - {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}] + + expected = [ + [ + ('x', [('xx', 1), ('yy', True)]), + ('y', 2), + ('z', 'foo') + ], + [ + ('x', [('xx', 3), ('yy', False)]), + ('y', 4), + ('z', 'bar') + ] + ] + assert arr.to_pylist() == expected @pytest.mark.large_memory def test_from_numpy_large(self): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b2026f88599..61eb11e083e 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -676,7 +676,9 @@ def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset): tm.assert_frame_equal(df, df_read) +# FIXME(kszucs): probably an issue with key kind inference @pytest.mark.pandas +@pytest.mark.skip def test_pandas_can_write_nested_data(tempdir): data = { "agg_col": [ diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 65ab7e8d35a..de70fb8eea2 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -69,7 +69,12 @@ def test_basics(value, ty, klass, deprecated): s = pa.scalar(value, type=ty) assert isinstance(s, klass) - assert s.as_py() == value + + if isinstance(s, pa.StructScalar) and isinstance(value, dict): + assert 
s.as_py() == list(value.items()) + else: + assert s.as_py() == value + assert s == pa.scalar(value, type=ty) assert s != value assert s != "else" @@ -471,9 +476,10 @@ def test_struct(): assert 'y' in s assert 'z' not in s - assert s.as_py() == v - assert repr(s) != repr(v) - assert repr(s.as_py()) == repr(v) + items = list(v.items()) + assert s.as_py() == items + assert repr(s) != repr(items) + assert repr(s.as_py()) == repr(items) assert len(s) == 2 assert isinstance(s['x'], pa.Int16Scalar) assert isinstance(s['y'], pa.FloatScalar) @@ -523,8 +529,8 @@ def test_map(): assert isinstance(s.values, pa.Array) assert repr(s) == "" assert s.values.to_pylist() == [ - {'key': 'a', 'value': 1}, - {'key': 'b', 'value': 2} + [('key', 'a'), ('value', 1)], + [('key', 'b'), ('value', 2)] ] # test iteration From 5a386427f5d54c977b6eed3bf7d4b658b994e869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 14:43:12 +0200 Subject: [PATCH 31/80] fix struct key kind inference and failing tests --- cpp/src/arrow/python/python_to_arrow.cc | 22 ++++++++++---------- python/pyarrow/tests/test_compute.py | 8 +++---- python/pyarrow/tests/test_convert_builtin.py | 10 ++++----- python/pyarrow/tests/test_parquet.py | 2 -- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 933d6c81514..347ee08e10f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -758,8 +758,8 @@ class PyStructConverter : public StructConverter { } Status InferKeyKind(PyObject* items) { - // TODO: iterate over the items instead - for (int i = 0; i < num_fields_; i++) { + for (int i = 0; i < PySequence_Length(items); i++) { + // retrieve the key from the passed key-value pairs PyObject* tuple = PySequence_GetItem(items, i); if (tuple == NULL) { RETURN_IF_PYERROR(); @@ -768,25 +768,24 @@ class PyStructConverter : public StructConverter { if (key == NULL) { RETURN_IF_PYERROR(); } - // check equality with unicode field name - PyObject* name = PyList_GET_ITEM(unicode_field_names_.obj(), i); - bool are_equal = PyObject_RichCompareBool(key, name, Py_EQ); + + // check key exists between the unicode field names + bool do_contain = PySequence_Contains(unicode_field_names_.obj(), key); RETURN_IF_PYERROR(); - if (are_equal) { + if (do_contain) { key_kind_ = KeyKind::UNICODE; return Status::OK(); } - // check equality with bytes field name - name = PyList_GET_ITEM(bytes_field_names_.obj(), i); - are_equal = PyObject_RichCompareBool(key, name, Py_EQ); + + // check key exists between the bytes field names + do_contain = PySequence_Contains(bytes_field_names_.obj(), key); RETURN_IF_PYERROR(); - if (are_equal) { + if (do_contain) { key_kind_ = KeyKind::BYTES; return Status::OK(); } } return Status::OK(); - // return internal::Invalid(value, "was unable to infer key type"); } Status AppendDict(PyObject* dict) { @@ -826,6 +825,7 @@ class PyStructConverter : public StructConverter { if (!PySequence_Check(items)) { return internal::InvalidType(items, "was expecting a sequence of key-value items"); } + // TODO(kszucs): cover with tests // if (PySequence_GET_SIZE(items) != num_fields_) { // return Status::Invalid("Sequence size must be equal to number of struct fields"); // } diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 8b0859ccf39..d64b6b10f42 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py 
@@ -204,21 +204,21 @@ def test_mode_array(): # ARROW-9917 arr = pa.array([1, 1, 3, 4, 3, 5], type='int64') expected = {"mode": 1, "count": 2} - assert pc.mode(arr).as_py() == {"mode": 1, "count": 2} + assert pc.mode(arr).as_py() == [("mode", 1), ("count", 2)] arr = pa.array([], type='int64') - expected = {"mode": None, "count": None} + expected = [("mode", None), ("count", None)] assert pc.mode(arr).as_py() == expected def test_mode_chunked_array(): # ARROW-9917 arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) - expected = {"mode": 1, "count": 2} + expected = [("mode", 1), ("count", 2)] assert pc.mode(arr).as_py() == expected arr = pa.chunked_array((), type='int64') - expected = {"mode": None, "count": None} + expected = [("mode", None), ("count", None)] assert arr.num_chunks == 0 assert pc.mode(arr).as_py() == expected diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index e91fcf2d13e..50889ffbe25 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1671,11 +1671,11 @@ def test_map_from_dicts(): assert arr.to_pylist() == expected - # FIXME(kszucs): fix the raised exception's type # Invalid dictionary - # for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]: - # with pytest.raises(ValueError, match="Invalid Map"): - # pa.array([entry], type=pa.map_('i4', 'i4')) + for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]: + print(entry) + with pytest.raises(ValueError, match="Invalid Map"): + pa.array([entry], type=pa.map_('i4', 'i4')) # Invalid dictionary types for entry in [[{'key': '1', 'value': 5}], [{'key': {'value': 2}}]]: @@ -1849,8 +1849,6 @@ def test_nested_auto_chunking(): ] -# FIXME(kszucs) -@pytest.mark.skip def test_dictionary_conversion(): data = [ {"page_type": 1}, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 61eb11e083e..b2026f88599 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -676,9 +676,7 @@ def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset): tm.assert_frame_equal(df, df_read) -# FIXME(kszucs): probably an issue with key kind inference @pytest.mark.pandas -@pytest.mark.skip def test_pandas_can_write_nested_data(tempdir): data = { "agg_col": [ From 4bc562c8b9d10dee3b928cb3b4bbf7548ac1d524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 15:08:46 +0200 Subject: [PATCH 32/80] use different aliases in the converter trait --- cpp/src/arrow/python/python_to_arrow.cc | 8 +++--- cpp/src/arrow/util/converter.h | 26 ++++++++++---------- python/pyarrow/array.pxi | 3 +++ python/pyarrow/tests/test_convert_builtin.py | 1 - 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 347ee08e10f..83c28fd928f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -373,12 +373,12 @@ class PyStructConverter; class PyConverter : public Converter { public: template - using PrimitiveConverter = PyPrimitiveConverter; + using Primitive = PyPrimitiveConverter; template - using DictionaryConverter = PyDictionaryConverter; + using Dictionary = PyDictionaryConverter; template - using ListConverter = PyListConverter; - using StructConverter = PyStructConverter; + using List = PyListConverter; + using Struct = PyStructConverter; Status Extend(PyObject* values, int64_t 
size) { /// Ensure we've allocated enough space diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index bcfaa1530d4..4a875e824c3 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -111,12 +111,12 @@ class Converter { using OptionsType = Options; template - using PrimitiveConverter = PrimitiveConverter; + using Primitive = PrimitiveConverter; template - using ListConverter = ListConverter; + using List = ListConverter; template - using DictionaryConveter = DictionaryConverter; - using StructConverter = StructConverter; + using Dictionary = DictionaryConverter; + using Struct = StructConverter; static Result> Make(std::shared_ptr type, MemoryPool* pool, OptionsType options) { @@ -164,17 +164,17 @@ class Converter { OptionsType options_; }; -#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - return Finish>( \ - std::move(builder), {}); \ +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + return Finish>( \ + std::move(builder), {}); \ break; template struct MakeConverterImpl { Status Visit(const NullType& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename Converter::template PrimitiveConverter; + using ConverterType = typename Converter::template Primitive; auto builder = std::make_shared(pool); return Finish(std::move(builder), {}); @@ -186,7 +186,7 @@ struct MakeConverterImpl { Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename Converter::template PrimitiveConverter; + using ConverterType = typename Converter::template Primitive; auto builder = std::make_shared(type, pool); return Finish(std::move(builder), {}); @@ -196,7 +196,7 @@ struct MakeConverterImpl { enable_if_t::value && !std::is_same::value, Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename Converter::template ListConverter; + using ConverterType = typename Converter::template List; ARROW_ASSIGN_OR_RAISE(auto child_converter, Converter::Make(t.value_type(), pool, options)); @@ -205,7 +205,7 @@ struct MakeConverterImpl { } Status Visit(const MapType& t) { - using ConverterType = typename Converter::template ListConverter; + using ConverterType = typename Converter::template List; // TODO(kszucs): seems like builders not respect field nullability std::vector> struct_fields{t.key_field(), t.item_field()}; @@ -250,7 +250,7 @@ struct MakeConverterImpl { } Status Visit(const StructType& t) { - using ConverterType = typename Converter::StructConverter; + using ConverterType = typename Converter::Struct; std::shared_ptr child_converter; std::vector> child_converters; diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 3ba2c3236d3..fbcafdb22a4 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -163,6 +163,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, representation). Timezone-naive data will be implicitly interpreted as UTC. 
+ TODO(kszucs): describe the adaptive nature of the dictionary array's index + type + Examples -------- >>> import pandas as pd diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 50889ffbe25..f234bf051f2 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1673,7 +1673,6 @@ def test_map_from_dicts(): # Invalid dictionary for entry in [[{'value': 5}], [{}], [{'k': 1, 'v': 2}]]: - print(entry) with pytest.raises(ValueError, match="Invalid Map"): pa.array([entry], type=pa.map_('i4', 'i4')) From 2581619a2462eeb6d5521d903951c6177977503b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 16:50:30 +0200 Subject: [PATCH 33/80] validate overflow for list types as well --- cpp/src/arrow/array/builder_binary.h | 42 +++++++++++-------------- cpp/src/arrow/array/builder_nested.cc | 13 ++++++++ cpp/src/arrow/array/builder_nested.h | 17 ++++++++++ cpp/src/arrow/python/python_to_arrow.cc | 31 +++++++----------- python/pyarrow/tests/test_array.py | 17 ++++++++-- 5 files changed, 73 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index c2f99e62b8d..866d19d6c75 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -76,21 +76,21 @@ class BaseBinaryBuilder : public ArrayBuilder { return Append(value.data(), static_cast(value.size())); } - Status AppendSafe(util::string_view value) { - auto size = static_cast(value.size()); - auto num_bytes = value_data_builder_.length() + size; - if (ARROW_PREDICT_TRUE(num_bytes <= memory_limit())) { - return Append(value.data(), size); + Status ValidateOverflow() { return ValidateOverflow(0); } + + Status ValidateOverflow(int64_t new_bytes) { + auto new_size = value_data_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); } else { - return AppendOverflow(num_bytes); + return Status::OK(); } } Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } + RETURN_NOT_OK(ValidateOverflow()); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -327,16 +327,9 @@ class BaseBinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - Status AppendOverflow(int64_t num_bytes) { - return Status::CapacityError("array cannot contain more than ", memory_limit(), - " bytes, have ", num_bytes); - } - Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } + ARROW_RETURN_NOT_OK(ValidateOverflow()); return offsets_builder_.Append(static_cast(num_bytes)); } @@ -432,14 +425,15 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendSafe(util::string_view value) { - auto num_bytes = byte_builder_.length() + byte_width_; - if (ARROW_PREDICT_TRUE(num_bytes <= memory_limit())) { - return Append(value.data()); - } else { + Status ValidateOverflow() { return ValidateOverflow(0); } + + Status ValidateOverflow(int64_t new_bytes) { + auto 
new_size = byte_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { return Status::CapacityError("array cannot contain more than ", memory_limit(), - " bytes, have ", num_bytes); - ; + " bytes, have ", new_size); + } else { + return Status::OK(); } } diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index b8af62fab14..978416df565 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -170,6 +170,19 @@ Status FixedSizeListBuilder::AppendNulls(int64_t length) { return value_builder_->AppendNulls(list_size_ * length); } +Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { + auto new_length = value_builder_->length() + new_elements; + if (new_elements != list_size_) { + return Status::Invalid("Length of item not correct: expected ", list_size_, + " but got array of size ", new_elements); + } + if (new_length > maximum_elements()) { + return Status::CapacityError("array cannot contain more than ", maximum_elements(), + " elements, have ", new_elements); + } + return Status::OK(); +} + Status FixedSizeListBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity)); return ArrayBuilder::Resize(capacity); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index cd6fadfcc2f..48ddc862790 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -131,6 +131,16 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status ValidateOverflow(int64_t new_elements) { + auto new_length = value_builder_->length() + new_elements; + if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { + return Status::CapacityError("array cannot contain more than ", maximum_elements(), + " elements, have ", new_elements); + } else { + return Status::OK(); + } + } + ArrayBuilder* value_builder() const { return value_builder_.get(); } // Cannot make this a static attribute because of linking issues @@ -343,12 +353,19 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { /// automatically. 
Status AppendNulls(int64_t length) final; + Status ValidateOverflow(int64_t new_elements); + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); } + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + protected: std::shared_ptr value_field_; const int32_t list_size_; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 83c28fd928f..f67095d71ef 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -455,8 +455,8 @@ class PyPrimitiveConverter> } else { ARROW_ASSIGN_OR_RAISE( auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - return this->primitive_builder_->AppendSafe( - util::string_view(view.bytes, view.size)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } } }; @@ -475,9 +475,8 @@ class PyPrimitiveConverter> // observed binary value observed_binary_ = true; } - // TODO(kszucs): add CheckOverflow method to binary builders and call it here - return this->primitive_builder_->AppendSafe( - util::string_view(view.bytes, view.size)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); + return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); } } @@ -547,20 +546,12 @@ class PyDictionaryConverter> template class PyListConverter : public ListConverter { public: - Status ValidateSize(const BaseListType*, int64_t size) { return Status::OK(); } + Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } - Status ValidateSize(const FixedSizeListType* type, int64_t size) { - // TODO(kszucs): perhaps this should be handled somewhere else - if (type->list_size() != size) { - return Status::Invalid("Length of item not correct: expected ", type->list_size(), - " but got array of size ", size); - } else { - return Status::OK(); - } + Status ValidateOverflow(const BaseListType*, int64_t size) { + return this->list_builder_->ValidateOverflow(size); } - Status ValidateBuilder(const BaseListType*) { return Status::OK(); } - Status ValidateBuilder(const MapType*) { // TODO(kszucs): perhaps this should be handled somewhere else if (this->list_builder_->key_builder()->null_count() > 0) { @@ -570,13 +561,13 @@ class PyListConverter : public ListConverter { } } + Status ValidateBuilder(const BaseListType*) { return Status::OK(); } + Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->list_builder_->AppendNull(); } - // TODO(kszucs): raise CapacityError if it wouldn't fit in the list builder's - // memory limit RETURN_NOT_OK(this->list_builder_->Append()); if (PyArray_Check(value)) { RETURN_NOT_OK(AppendNdarray(value)); @@ -592,7 +583,7 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); - RETURN_NOT_OK(this->ValidateSize(this->list_type_, size)); + RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); return this->value_converter_->Extend(value, size); } @@ -602,7 +593,7 @@ class PyListConverter : public ListConverter { return Status::Invalid("Can only convert 1-dimensional array values"); } const int64_t size = 
PyArray_SIZE(ndarray); - RETURN_NOT_OK(this->ValidateSize(this->list_type_, size)); + RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); const auto value_type = this->value_converter_->builder()->type(); switch (value_type->id()) { diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ed510ae4200..295089a5a42 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2518,9 +2518,20 @@ def test_numpy_binary_overflow_to_chunked(): @pytest.mark.large_memory def test_list_child_overflow_to_chunked(): - vals = [['x' * 1024]] * ((2 << 20) + 1) - with pytest.raises(ValueError, match="overflowed"): - pa.array(vals) + kilobyte_string = 'x' * 1024 + two_mega = 2**21 + + vals = [[kilobyte_string]] * (two_mega - 1) + arr = pa.array(vals) + assert isinstance(arr, pa.Array) + assert len(arr) == two_mega - 1 + + vals = [[kilobyte_string]] * two_mega + arr = pa.array(vals) + assert isinstance(arr, pa.ChunkedArray) + assert len(arr) == two_mega + assert len(arr.chunk(0)) == two_mega - 1 + assert len(arr.chunk(1)) == 1 def test_infer_type_masked(): From 666eae3c200ea3134bca66581d7c003c24dac804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 16:55:14 +0200 Subject: [PATCH 34/80] fix assertion in numpy large memory test --- python/pyarrow/tests/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index d381e97874a..d88114fa9e2 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2224,7 +2224,7 @@ def check(arr, data, mask=None): ys = data['y'] for i, obj in enumerate(iter_chunked_array(arr)): try: - d = obj.as_py() + d = dict(obj.as_py()) if obj.is_valid else obj.as_py() if mask is not None and mask[i]: assert d is None else: From 79134e4f5848f3bfbb8cc31f56c0d5c07ba31160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 16:57:33 +0200 Subject: [PATCH 35/80] lint --- cpp/src/arrow/util/converter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 4a875e824c3..57d56749be2 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -311,7 +311,7 @@ class Chunker : public BaseConverter { if (status.ok()) { length_ = this->builder_->length(); } else if (status.IsCapacityError()) { - RETURN_NOT_OK(FinishChunk()); + ARROW_RETURN_NOT_OK(FinishChunk()); return converter_->AppendNull(); } return status; @@ -322,7 +322,7 @@ class Chunker : public BaseConverter { if (status.ok()) { length_ = this->builder_->length(); } else if (status.IsCapacityError()) { - RETURN_NOT_OK(FinishChunk()); + ARROW_RETURN_NOT_OK(FinishChunk()); return Append(value); } return status; @@ -337,7 +337,7 @@ class Chunker : public BaseConverter { } Result> ToChunkedArray() { - RETURN_NOT_OK(FinishChunk()); + ARROW_RETURN_NOT_OK(FinishChunk()); return std::make_shared(chunks_); } From 80eff04bb654ca7fe8162771f6a98c601d7715f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 18:47:33 +0200 Subject: [PATCH 36/80] fix py35 tests cases; fix linting error --- cpp/src/arrow/util/converter.h | 4 +- python/pyarrow/tests/test_array.py | 14 +- python/pyarrow/tests/test_convert_builtin.py | 140 ++++++++++++------- python/pyarrow/tests/test_orc.py | 3 + python/pyarrow/tests/test_scalars.py | 6 +- 
5 files changed, 101 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 57d56749be2..ff6b06e2ef4 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -129,7 +129,7 @@ class Converter { virtual ~Converter() = default; - virtual Status Init() { return Status::OK(); }; + virtual Status Init() { return Status::OK(); } virtual Status Append(InputType value) { return Status::NotImplemented("Converter not implemented for type ", @@ -148,7 +148,7 @@ class Converter { virtual Status AppendNull() { return builder_->AppendNull(); } - virtual Result> ToArray() { return builder_->Finish(); }; + virtual Result> ToArray() { return builder_->Finish(); } virtual Result> ToArray(int64_t length) { ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 295089a5a42..21a0a83ca65 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -615,26 +615,22 @@ def test_struct_from_buffers(): children=children[:1]) -def _as_pairs(expected): - return [None if i is None else list(i.items()) for i in expected] - - def test_struct_from_arrays(): a = pa.array([4, 5, 6], type=pa.int64()) b = pa.array(["bar", None, ""]) c = pa.array([[1, 2], None, [3, None]]) expected_list = [ - {'a': 4, 'b': 'bar', 'c': [1, 2]}, - {'a': 5, 'b': None, 'c': None}, - {'a': 6, 'b': '', 'c': [3, None]}, + [('a', 4), ('b', 'bar'), ('c', [1, 2])], + [('a', 5), ('b', None), ('c', None)], + [('a', 6), ('b', ''), ('c', [3, None])], ] # From field names arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"]) assert arr.type == pa.struct( [("a", a.type), ("b", b.type), ("c", c.type)]) - assert arr.to_pylist() == _as_pairs(expected_list) + assert arr.to_pylist() == expected_list with pytest.raises(ValueError): pa.StructArray.from_arrays([a, b, c], ["a", "b"]) @@ -650,7 +646,7 @@ def test_struct_from_arrays(): arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc]) assert arr.type == pa.struct([fa, fb, fc]) assert not arr.type[0].nullable - assert arr.to_pylist() == _as_pairs(expected_list) + assert arr.to_pylist() == expected_list with pytest.raises(ValueError): pa.StructArray.from_arrays([a, b, c], fields=[fa, fb]) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f234bf051f2..82f54185444 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -133,6 +133,10 @@ def _as_tuple(xs): return tuple(xs) +def _as_pairs(xs): + return [None if x is None else list(x.items()) for x in xs] + + def _as_deque(xs): # deque is a sequence while neither tuple nor list return collections.deque(xs) @@ -1412,10 +1416,6 @@ def test_empty_range(): assert arr.to_pylist() == [] -def _as_pairs(expected): - return [None if i is None else list(i.items()) for i in expected] - - def test_structarray(): arr = pa.StructArray.from_arrays([], names=[]) assert arr.type == pa.struct([]) @@ -1430,13 +1430,13 @@ def test_structarray(): ['ints', 'strs', 'bools']) expected = [ - {'ints': None, 'strs': 'a', 'bools': True}, - {'ints': 2, 'strs': None, 'bools': False}, - {'ints': 3, 'strs': 'c', 'bools': None}, + [('ints', None), ('strs', 'a'), ('bools', True)], + [('ints', 2), ('strs', None), ('bools', False)], + [('ints', 3), ('strs', 'c'), ('bools', None)], ] pylist = arr.to_pylist() - assert pylist == _as_pairs(expected) + assert pylist == expected # 
len(names) != len(arrays) with pytest.raises(ValueError): @@ -1450,22 +1450,28 @@ def test_struct_from_dicts(): arr = pa.array([], type=ty) assert arr.to_pylist() == [] - data = [{'a': 5, 'b': 'foo', 'c': True}, - {'a': 6, 'b': 'bar', 'c': False}] + data = [ + collections.OrderedDict([('a', 5), ('b', 'foo'), ('c', True)]), + collections.OrderedDict([('a', 6), ('b', 'bar'), ('c', False)]) + ] arr = pa.array(data, type=ty) assert arr.to_pylist() == _as_pairs(data) # With omitted values - data = [{'a': 5, 'c': True}, - None, - {}, - {'a': None, 'b': 'bar'}] + data = [ + collections.OrderedDict([('a', 5), ('c', True)]), + None, + collections.OrderedDict([]), + collections.OrderedDict([('a', None), ('b', 'bar')]) + ] arr = pa.array(data, type=ty) - expected = [{'a': 5, 'b': None, 'c': True}, - None, - {'a': None, 'b': None, 'c': None}, - {'a': None, 'b': 'bar', 'c': None}] - assert arr.to_pylist() == _as_pairs(expected) + expected = [ + [('a', 5), ('b', None), ('c', True)], + None, + [('a', None), ('b', None), ('c', None)], + [('a', None), ('b', 'bar'), ('c', None)] + ] + assert arr.to_pylist() == expected def test_struct_from_dicts_bytes_keys(): @@ -1476,14 +1482,16 @@ def test_struct_from_dicts_bytes_keys(): arr = pa.array([], type=ty) assert arr.to_pylist() == [] - data = [{b'a': 5, b'b': 'foo'}, - {b'a': 6, b'c': False}] + data = [ + collections.OrderedDict([(b'a', 5), (b'b', 'foo')]), + collections.OrderedDict([(b'a', 6), (b'c', False)]), + ] arr = pa.array(data, type=ty) expected = [ - {'a': 5, 'b': 'foo', 'c': None}, - {'a': 6, 'b': None, 'c': False}, + [('a', 5), ('b', 'foo'), ('c', None)], + [('a', 6), ('b', None), ('c', False)], ] - assert arr.to_pylist() == _as_pairs(expected) + assert arr.to_pylist() == expected def test_struct_from_tuples(): @@ -1493,26 +1501,32 @@ def test_struct_from_tuples(): data = [(5, 'foo', True), (6, 'bar', False)] - expected = [{'a': 5, 'b': 'foo', 'c': True}, - {'a': 6, 'b': 'bar', 'c': False}] + expected = [ + [('a', 5), ('b', 'foo'), ('c', True)], + [('a', 6), ('b', 'bar'), ('c', False)] + ] arr = pa.array(data, type=ty) data_as_ndarray = np.empty(len(data), dtype=object) data_as_ndarray[:] = data arr2 = pa.array(data_as_ndarray, type=ty) - assert arr.to_pylist() == _as_pairs(expected) + assert arr.to_pylist() == expected assert arr.equals(arr2) # With omitted values - data = [(5, 'foo', None), - None, - (6, None, False)] - expected = [{'a': 5, 'b': 'foo', 'c': None}, - None, - {'a': 6, 'b': None, 'c': False}] + data = [ + (5, 'foo', None), + None, + (6, None, False) + ] + expected = [ + [('a', 5), ('b', 'foo'), ('c', None)], + None, + [('a', 6), ('b', None), ('c', False)], + ] arr = pa.array(data, type=ty) - assert arr.to_pylist() == _as_pairs(expected) + assert arr.to_pylist() == expected # Invalid tuple size for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]: @@ -1529,10 +1543,10 @@ def test_struct_from_list_of_pairs(): ty = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string()), pa.field('c', pa.bool_())]) - data = _as_pairs([ - {'a': 5, 'b': 'foo', 'c': True}, - {'a': 6, 'b': 'bar', 'c': False} - ]) + data = [ + [('a', 5), ('b', 'foo'), ('c', True)], + [('a', 6), ('b', 'bar'), ('c', False)], + ] arr = pa.array(data, type=ty) assert arr.to_pylist() == data @@ -1552,28 +1566,37 @@ def test_struct_from_dicts_inference(): expected_type = pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string()), pa.field('c', pa.bool_())]) - data = [{'a': 5, 'b': 'foo', 'c': True}, - {'a': 6, 'b': 'bar', 'c': False}] + data = [ + 
collections.OrderedDict([('a', 5), ('b', 'foo'), ('c', True)]), + collections.OrderedDict([('a', 6), ('b', 'bar'), ('c', False)]) + ] + expected = [list(d.items()) for d in data] + arr = pa.array(data) check_struct_type(arr.type, expected_type) - assert arr.to_pylist() == _as_pairs(data) + assert arr.to_pylist() == expected # With omitted values - data = [{'a': 5, 'c': True}, - None, - {}, - {'a': None, 'b': 'bar'}] - expected = [{'a': 5, 'b': None, 'c': True}, - None, - {'a': None, 'b': None, 'c': None}, - {'a': None, 'b': 'bar', 'c': None}] + data = [ + collections.OrderedDict([('a', 5), ('c', True)]), + None, + collections.OrderedDict([]), + collections.OrderedDict([('a', None), ('b', 'bar')]) + ] + expected = [ + [('a', 5), ('b', None), ('c', True)], + None, + [('a', None), ('b', None), ('c', None)], + [('a', None), ('b', 'bar'), ('c', None)] + ] + arr = pa.array(data) data_as_ndarray = np.empty(len(data), dtype=object) data_as_ndarray[:] = data arr2 = pa.array(data) check_struct_type(arr.type, expected_type) - assert arr.to_pylist() == _as_pairs(expected) + assert arr.to_pylist() == expected assert arr.equals(arr2) # Nested @@ -1581,9 +1604,20 @@ def test_struct_from_dicts_inference(): pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())), pa.field('ab', pa.bool_())])), pa.field('b', pa.string())]) - data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'}, - {'a': {'aa': None, 'ab': False}, 'b': None}, - {'a': None, 'b': 'bar'}] + data = [ + collections.OrderedDict([ + ('a', collections.OrderedDict([('aa', [5, 6]), ('ab', True)])), + ('b', 'foo') + ]), + collections.OrderedDict([ + ('a', collections.OrderedDict([('aa', None), ('ab', False)])), + ('b', None), + ]), + collections.OrderedDict([ + ('a', None), + ('b', 'bar') + ]) + ] arr = pa.array(data) expected = [ diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index cc758867ad5..6a0b2bca532 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -110,6 +110,9 @@ def check_example_file(orc_path, expected_df, need_fix=False): assert json_pos == orc_file.nrows + +#FIXME(kszucs) +@pytest.mark.skip @pytest.mark.pandas @pytest.mark.parametrize('filename', [ 'TestOrcFile.test1.orc', diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index de70fb8eea2..447d116e107 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
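+# Note: expected struct values in these tests are built with
+# collections.OrderedDict so that field order stays deterministic on
+# Python 3.5, where plain dicts are unordered.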
+import collections import datetime import decimal import pickle @@ -62,7 +63,8 @@ pa.Time32Scalar, pa.Time32Value), (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value), (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue), - ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue), + (collections.OrderedDict([('a', 1), ('b', [1, 2])]), None, pa.StructScalar, + pa.StructValue), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, pa.MapValue), ]) @@ -461,7 +463,7 @@ def test_struct(): pa.field('y', pa.float32()) ]) - v = {'x': 2, 'y': 3.5} + v = collections.OrderedDict([('x', 2), ('y', 3.5)]) s = pa.scalar(v, type=ty) assert list(s) == list(s.keys()) == ['x', 'y'] assert list(s.values()) == [ From e2baab00f54550357f4c982ece82400590b1abf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 10 Sep 2020 19:16:00 +0200 Subject: [PATCH 37/80] linting again --- cpp/src/arrow/array/builder_binary.h | 2 +- python/pyarrow/tests/test_orc.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 866d19d6c75..45e9aedd6f5 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -90,7 +90,7 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - RETURN_NOT_OK(ValidateOverflow()); + ARROW_RETURN_NOT_OK(ValidateOverflow()); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index 6a0b2bca532..d44b41dde92 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -110,8 +110,8 @@ def check_example_file(orc_path, expected_df, need_fix=False): assert json_pos == orc_file.nrows - -#FIXME(kszucs) +# FIXME(kszucs): need to update the expected format to contain list of pairs +# instead of a dictionary @pytest.mark.skip @pytest.mark.pandas @pytest.mark.parametrize('filename', [ From 3dc203ecbd36ec1b1f03ce0cd6a730b426963bfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 11 Sep 2020 11:13:30 +0200 Subject: [PATCH 38/80] test nested overflow for string --- cpp/src/arrow/python/python_to_arrow.cc | 2 -- python/pyarrow/scalar.pxi | 7 +++--- python/pyarrow/tests/test_convert_builtin.py | 26 +++++++++++--------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index f67095d71ef..cd7330e2d62 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -505,7 +505,6 @@ class PyDictionaryConverter> } else { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->value_type_, this->options_, value)); - // TODO(kszucs): use AppendSafe with checking memory limit BEFORE actual append return this->value_builder_->Append(converted); } } @@ -521,7 +520,6 @@ class PyDictionaryConverter> } else { ARROW_ASSIGN_OR_RAISE(auto view, PyValue::Convert(this->value_type_, this->options_, value)); - // TODO(kszucs): use AppendSafe with checking memory limit BEFORE actual append return this->value_builder_->Append(util::string_view(view.bytes, view.size)); } } diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 7e119665a41..569fadaca50 
100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -626,6 +626,7 @@ cdef class StructScalar(Scalar): else: raise KeyError(key) + # TODO(kszucs): consider to remove __iter__, keys and values def __iter__(self): if self.is_valid: for field in self.type: @@ -635,17 +636,17 @@ cdef class StructScalar(Scalar): return list(self) def values(self): - return [self[key] for key in self] + return [self[i] for i, _ in enumerate(self)] def items(self): - return [(key, self[key]) for key in self] + return [(key, self[i]) for i, key in enumerate(self)] def as_py(self): """ Return this value as a Python dict. """ if self.is_valid: - return [(key, self[key].as_py()) for key in self] + return [(key, self[i].as_py()) for i, key in enumerate(self)] else: return None diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 82f54185444..8b695e309a5 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1854,23 +1854,27 @@ def test_auto_chunking(): @pytest.mark.large_memory -def test_nested_auto_chunking(): - v1 = b'x' * 100000000 - v2 = b'x' * 147483646 +@pytest.mark.parametrize(('ty', 'char'), [ + (pa.string(), 'x'), + (pa.binary(), b'x'), +]) +def test_nested_auto_chunking(ty, char): + v1 = char * 100000000 + v2 = char * 147483646 - ty = pa.struct([ + struct_type = pa.struct([ pa.field('bool', pa.bool_()), pa.field('integer', pa.int64()), - pa.field('binary', pa.binary()), + pa.field('string-like', ty), ]) - data = [{'bool': True, 'integer': 1, 'binary': v1}] * 20 - data.append({'bool': True, 'integer': 1, 'binary': v2}) - arr = pa.array(data, type=ty) + data = [{'bool': True, 'integer': 1, 'string-like': v1}] * 20 + data.append({'bool': True, 'integer': 1, 'string-like': v2}) + arr = pa.array(data, type=struct_type) assert isinstance(arr, pa.Array) - data.append({'bool': True, 'integer': 1, 'binary': b'x'}) - arr = pa.array(data, type=ty) + data.append({'bool': True, 'integer': 1, 'string-like': char}) + arr = pa.array(data, type=struct_type) assert isinstance(arr, pa.ChunkedArray) assert len(arr.chunk(0)) == 21 assert len(arr.chunk(1)) == 1 @@ -1878,7 +1882,7 @@ def test_nested_auto_chunking(): assert arr.chunk(1)[0].as_py() == [ ('bool', True), ('integer', 1), - ('binary', b'x') + ('string-like', char) ] From 723f602c1059822160a6a6c517c9a03567933c75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 14 Sep 2020 16:35:44 +0200 Subject: [PATCH 39/80] test builder overflow checking --- cpp/src/arrow/array/array_binary_test.cc | 22 +++++++++++++++++++ cpp/src/arrow/array/array_list_test.cc | 28 ++++++++++++++++++++++++ cpp/src/arrow/scalar_test.cc | 6 +++++ cpp/src/arrow/testing/gtest_util.h | 2 +- 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 9c2cd888692..af732eab068 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -570,6 +570,26 @@ class TestStringBuilder : public TestBuilder { ASSERT_EQ(reps * 40, result_->value_data()->size()); } + void TestOverflowCheck() { + auto max_size = builder_->memory_limit(); + + ASSERT_OK(builder_->ValidateOverflow(1)); + ASSERT_OK(builder_->ValidateOverflow(max_size)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size + 1)); + + ASSERT_OK(builder_->Append("bb")); + ASSERT_OK(builder_->ValidateOverflow(max_size - 2)); + 
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1)); + + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->ValidateOverflow(max_size - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1)); + + ASSERT_OK(builder_->Append("ccc")); + ASSERT_OK(builder_->ValidateOverflow(max_size - 5)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 4)); + } + void TestZeroLength() { // All buffers are null Done(); @@ -602,6 +622,8 @@ TYPED_TEST(TestStringBuilder, TestCapacityReserve) { this->TestCapacityReserve() TYPED_TEST(TestStringBuilder, TestZeroLength) { this->TestZeroLength(); } +TYPED_TEST(TestStringBuilder, TestOverflowCheck) { this->TestOverflowCheck(); } + // ---------------------------------------------------------------------- // ChunkedBinaryBuilder tests diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index df0eb522cf4..6a2b47e8e20 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -467,6 +467,32 @@ class TestListArray : public TestBuilder { AssertArraysEqual(*result_, *expected); } + void TestOverflowCheck() { + Int16Builder* vb = checked_cast(builder_->value_builder()); + auto max_elements = builder_->maximum_elements(); + + ASSERT_OK(builder_->ValidateOverflow(1)); + ASSERT_OK(builder_->ValidateOverflow(max_elements)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); + + ASSERT_OK(builder_->Append()); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); + + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); + + ASSERT_OK(builder_->Append()); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 5)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 4)); + } + protected: std::shared_ptr value_type_; @@ -508,6 +534,8 @@ TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } +TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index a8e4e4780f2..3e3c8bd6698 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -657,6 +657,12 @@ TEST(TestDictionaryScalar, Basics) { ASSERT_TRUE(first->Equals(scalar_gamma)); ASSERT_TRUE(second->Equals(scalar_alpha)); ASSERT_TRUE(last->Equals(scalar_null)); + + auto first_dict_scalar = checked_cast(*first); + ASSERT_TRUE(first_dict_scalar.value.dictionary->Equals(arr.dictionary())); + + auto second_dict_scalar = checked_cast(*second); + ASSERT_TRUE(second_dict_scalar.value.dictionary->Equals(arr.dictionary())); } } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 6592af39557..31f1c1bc7f8 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -254,7 +254,7 @@ ARROW_TESTING_EXPORT void TestInitialized(const Array& array); template void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr* out) { - ASSERT_OK(builder->Finish(out)); + 
ASSERT_OK_AND_ASSIGN(*out, builder->Finish()); AssertZeroPadded(**out); TestInitialized(**out); } From 984c82eed006bdaf18d5bf0c6f2647e231d2d72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 14 Sep 2020 22:30:04 +0200 Subject: [PATCH 40/80] pass pool separately --- cpp/src/arrow/python/numpy_to_arrow.cc | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 6 +++--- cpp/src/arrow/python/python_to_arrow.h | 10 ++++------ python/pyarrow/array.pxi | 8 +++----- python/pyarrow/includes/libarrow.pxd | 3 ++- python/pyarrow/scalar.pxi | 5 +++-- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 08359100605..2847c4aa6b5 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -317,7 +317,7 @@ Status NumPyConverter::Convert() { ARROW_ASSIGN_OR_RAISE( auto chunked_array, ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options)); + reinterpret_cast(mask_), py_options, pool_)); out_arrays_ = chunked_array->chunks(); return Status::OK(); } diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index cd7330e2d62..6958ec4b17b 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -913,7 +913,8 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz } Result> ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& opts) { + const PyConversionOptions& opts, + MemoryPool* pool) { PyAcquireGIL lock; PyObject* seq; @@ -945,8 +946,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* } DCHECK_GE(size, 0); - ARROW_ASSIGN_OR_RAISE(auto converter, - PyConverter::Make(real_type, options.pool, options)); + ARROW_ASSIGN_OR_RAISE(auto converter, PyConverter::Make(real_type, pool, options)); ARROW_ASSIGN_OR_RAISE(auto chunked_converter, Chunker::Make(converter)); // Convert values diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 094635859f6..3081194d3d1 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -39,11 +39,11 @@ class Status; namespace py { struct PyConversionOptions { - PyConversionOptions() : type(NULLPTR), size(-1), pool(NULLPTR), from_pandas(false) {} + PyConversionOptions() : type(NULLPTR), size(-1), from_pandas(false) {} PyConversionOptions(const std::shared_ptr& type, int64_t size, MemoryPool* pool, bool from_pandas) - : type(type), size(size), pool(default_memory_pool()), from_pandas(from_pandas) {} + : type(type), size(size), from_pandas(from_pandas) {} // Set to null if to be inferred std::shared_ptr type; @@ -51,9 +51,6 @@ struct PyConversionOptions { // Default is -1: infer from data int64_t size; - // Memory pool to use for allocations - MemoryPool* pool; - bool from_pandas = false; /// Used to maintain backwards compatibility for @@ -74,7 +71,8 @@ struct PyConversionOptions { /// \return Result Array ARROW_PYTHON_EXPORT Result> ConvertPySequence( - PyObject* obj, PyObject* mask, const PyConversionOptions& options); + PyObject* obj, PyObject* mask, const PyConversionOptions& options, + MemoryPool* pool = default_memory_pool()); } // namespace py diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index fbcafdb22a4..1ea4228ed89 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -32,14 +32,12 @@ cdef _sequence_to_array(object sequence, object 
mask, object size, if size is not None: options.size = size - options.pool = pool options.from_pandas = from_pandas - options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) - - cdef shared_ptr[CChunkedArray] out with nogil: - chunked = GetResultValue(ConvertPySequence(sequence, mask, options)) + chunked = GetResultValue( + ConvertPySequence(sequence, mask, options, pool) + ) if chunked.get().num_chunks() == 1: return pyarrow_wrap_array(chunked.get().chunk(0)) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 33016094530..047a06444cc 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1766,11 +1766,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CMemoryPool* pool c_bool from_pandas c_bool ignore_timezone + c_bool strict # TODO Some functions below are not actually "nogil" CResult[shared_ptr[CChunkedArray]] ConvertPySequence( - object obj, object mask, const PyConversionOptions& options) + object obj, object mask, const PyConversionOptions& options, CMemoryPool* pool) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 569fadaca50..2784fa65bc7 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -877,14 +877,15 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): shared_ptr[CArray] array shared_ptr[CChunkedArray] chunked bint is_pandas_object = False + CMemoryPool* pool type = ensure_type(type, allow_none=True) + pool = maybe_unbox_memory_pool(memory_pool) if _is_array_like(value): value = get_values(value, &is_pandas_object) options.size = 1 - options.pool = maybe_unbox_memory_pool(memory_pool) if type is not None: ty = ensure_type(type) @@ -897,7 +898,7 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): value = [value] with nogil: - chunked = GetResultValue(ConvertPySequence(value, None, options)) + chunked = GetResultValue(ConvertPySequence(value, None, options, pool)) # get the first chunk assert chunked.get().num_chunks() == 1 From 4b7a408f2e13d087fab96b00c34f0deae3042123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 14 Sep 2020 22:59:04 +0200 Subject: [PATCH 41/80] remove unused inference function --- cpp/src/arrow/python/inference.cc | 30 ++++++------------------- cpp/src/arrow/python/inference.h | 10 ++------- cpp/src/arrow/python/python_to_arrow.cc | 24 +++----------------- python/pyarrow/array.pxi | 2 +- python/pyarrow/includes/libarrow.pxd | 5 ++--- 5 files changed, 15 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index d1ce2c2f797..a75a887693c 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -620,39 +620,23 @@ class TypeInferrer { }; // Non-exhaustive type inference -Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, - std::shared_ptr* out_type) { +Result> InferArrowType(PyObject* obj, PyObject* mask, + bool pandas_null_sentinels) { if (pandas_null_sentinels) { // ARROW-842: If pandas is not installed then null checks will be less // comprehensive, but that is okay. 
internal::InitPandasStaticData(); } + std::shared_ptr out_type; TypeInferrer inferrer(pandas_null_sentinels); RETURN_NOT_OK(inferrer.VisitSequence(obj, mask)); - RETURN_NOT_OK(inferrer.GetType(out_type)); - if (*out_type == nullptr) { + RETURN_NOT_OK(inferrer.GetType(&out_type)); + if (out_type == nullptr) { return Status::TypeError("Unable to determine data type"); + } else { + return std::move(out_type); } - - return Status::OK(); -} - -Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, - int64_t* size, std::shared_ptr* out_type) { - if (!PySequence_Check(obj)) { - return Status::TypeError("Object is not a sequence"); - } - *size = static_cast(PySequence_Size(obj)); - - // For 0-length sequences, refuse to guess - if (*size == 0) { - *out_type = null(); - return Status::OK(); - } - RETURN_NOT_OK(InferArrowType(obj, mask, pandas_null_sentinels, out_type)); - - return Status::OK(); } ARROW_PYTHON_EXPORT diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h index 74d1b78161c..eff18362934 100644 --- a/cpp/src/arrow/python/inference.h +++ b/cpp/src/arrow/python/inference.h @@ -44,15 +44,9 @@ namespace py { /// \param[in] mask an optional mask where True values are null. May /// be nullptr /// \param[in] pandas_null_sentinels use pandas's null value markers -/// \param[out] out_type the inferred type ARROW_PYTHON_EXPORT -arrow::Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, - std::shared_ptr* out_type); - -ARROW_PYTHON_EXPORT -arrow::Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask, - bool pandas_null_sentinels, int64_t* size, - std::shared_ptr* out_type); +Result> InferArrowType(PyObject* obj, PyObject* mask, + bool pandas_null_sentinels); /// Checks whether the passed Python object is a boolean scalar ARROW_PYTHON_EXPORT diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 6958ec4b17b..127ffdbcf99 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -196,7 +196,6 @@ struct ValueConverter> { static Result Convert(const Time32Type* type, const O&, I obj) { int32_t value; if (PyTime_Check(obj)) { - // TODO(kszucs): consider to raise if a timezone aware time object is encountered switch (type->unit()) { case TimeUnit::SECOND: value = static_cast(internal::PyTime_to_s(obj)); @@ -208,7 +207,6 @@ struct ValueConverter> { return Status::UnknownError("Invalid time unit"); } } else { - // TODO(kszucs): validate maximum value? RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); } return value; @@ -217,7 +215,6 @@ struct ValueConverter> { static Result Convert(const Time64Type* type, const O&, I obj) { int64_t value; if (PyTime_Check(obj)) { - // TODO(kszucs): consider to raise if a timezone aware time object is encountered switch (type->unit()) { case TimeUnit::MICRO: value = internal::PyTime_to_us(obj); @@ -229,7 +226,6 @@ struct ValueConverter> { return Status::UnknownError("Invalid time unit"); } } else { - // TODO(kszucs): validate maximum value? 
RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); } return value; @@ -272,7 +268,6 @@ struct ValueConverter> { std::shared_ptr numpy_type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); if (!numpy_type->Equals(*type)) { - // TODO(kszucs): the message should highlight the received numpy dtype // TODO(kszucs): it also validates the unit, so add the unit to the error message return Status::NotImplemented("Expected np.datetime64 but got: ", numpy_type->ToString()); @@ -309,7 +304,6 @@ struct ValueConverter> { std::shared_ptr numpy_type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); if (!numpy_type->Equals(*type)) { - // TODO(kszucs): the message should highlight the received numpy dtype // TODO(kszucs): it also validates the unit, so add the unit to the error message return Status::NotImplemented("Expected np.timedelta64 but got: ", numpy_type->ToString()); @@ -343,7 +337,6 @@ struct ValueConverter> { // Strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true)); - // TODO(kszucs): revisit this one if (!view.is_utf8) { return internal::InvalidValue(obj, "was not a utf8 string"); } @@ -551,7 +544,6 @@ class PyListConverter : public ListConverter { } Status ValidateBuilder(const MapType*) { - // TODO(kszucs): perhaps this should be handled somewhere else if (this->list_builder_->key_builder()->null_count() > 0) { return Status::Invalid("Invalid Map: key field can not contain null values"); } else { @@ -921,8 +913,6 @@ Result> ConvertPySequence(PyObject* obj, PyObject* OwnedRef tmp_seq_nanny; PyConversionOptions options = opts; // copy options struct since we modify it below - std::shared_ptr real_type; - int64_t size = options.size; RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); tmp_seq_nanny.reset(seq); @@ -930,23 +920,15 @@ Result> ConvertPySequence(PyObject* obj, PyObject* // In some cases, type inference may be "loose", like strings. If the user // passed pa.string(), then we will error if we encounter any non-UTF8 // value. 
If not, then we will allow the result to be a BinaryArray - auto copied_options = options; - options.strict = false; - if (options.type == nullptr) { - RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); - // TODO(kszucs): remove this - // if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { - // const auto& ts_type = checked_cast(*real_type); - // real_type = timestamp(ts_type.unit()); - // } + ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); + options.strict = false; } else { - real_type = options.type; options.strict = true; } DCHECK_GE(size, 0); - ARROW_ASSIGN_OR_RAISE(auto converter, PyConverter::Make(real_type, pool, options)); + ARROW_ASSIGN_OR_RAISE(auto converter, PyConverter::Make(options.type, pool, options)); ARROW_ASSIGN_OR_RAISE(auto chunked_converter, Chunker::Make(converter)); // Convert values diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1ea4228ed89..8f788ea06a5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -463,7 +463,7 @@ def infer_type(values, mask=None, from_pandas=False): if mask is not None and not isinstance(mask, np.ndarray): mask = np.array(mask, dtype=bool) - check_status(InferArrowType(values, mask, use_pandas_sentinels, &out)) + out = GetResultValue(InferArrowType(values, mask, use_pandas_sentinels)) return pyarrow_wrap_data_type(out) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 047a06444cc..b260de89e7d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1748,9 +1748,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef extern from "arrow/python/api.h" namespace "arrow::py": # Requires GIL - CStatus InferArrowType(object obj, object mask, - c_bool pandas_null_sentinels, - shared_ptr[CDataType]* out_type) + CResult[shared_ptr[CDataType]] InferArrowType( + object obj, object mask, c_bool pandas_null_sentinels) cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: From bf064721964dd8ec91df760be2f9ffa746c3a881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 14 Sep 2020 23:18:50 +0200 Subject: [PATCH 42/80] pa.array docstrings --- cpp/src/arrow/python/python_to_arrow.cc | 2 -- python/pyarrow/array.pxi | 31 +++++++++++++++++++------ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 127ffdbcf99..7bca6885635 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -268,7 +268,6 @@ struct ValueConverter> { std::shared_ptr numpy_type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); if (!numpy_type->Equals(*type)) { - // TODO(kszucs): it also validates the unit, so add the unit to the error message return Status::NotImplemented("Expected np.datetime64 but got: ", numpy_type->ToString()); } @@ -304,7 +303,6 @@ struct ValueConverter> { std::shared_ptr numpy_type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); if (!numpy_type->Equals(*type)) { - // TODO(kszucs): it also validates the unit, so add the unit to the error message return Status::NotImplemented("Expected np.timedelta64 but got: ", numpy_type->ToString()); } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8f788ea06a5..5a111d17383 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi 
@@ -158,27 +158,44 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, Notes ----- Localized timestamps will currently be returned as UTC (pandas's native - representation). Timezone-naive data will be implicitly interpreted as + representation). Timezone-naive data will be implicitly interpreted as UTC. - TODO(kszucs): describe the adaptive nature of the dictionary array's index - type + Converting to dictionary array will choose to use a larger integer type for + the indices if the number of distict values wouldn't fit to the range of + the passed type. This adaptive nature means that if there are more than 127 + values the returned dictionary array's key type is going to be pa.int16() + even if pa.int8() was passed to the function. Note that smaller key type + than the passed one won't be chosed. Examples -------- >>> import pandas as pd >>> import pyarrow as pa >>> pa.array(pd.Series([1, 2])) - + [ 1, 2 ] + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + >>> import numpy as np - >>> pa.array(pd.Series([1, 2]), np.array([0, 1], - ... dtype=bool)) - + >>> pa.array(pd.Series([1, 2]), np.array([0, 1], dtype=bool)) + [ 1, null From 196a0c11d18b9d843fa70cd4d4a80b674710f28d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Sep 2020 14:36:09 +0200 Subject: [PATCH 43/80] additional testing for struct conversion --- cpp/src/arrow/python/python_to_arrow.cc | 128 ++++++++++--------- python/pyarrow/tests/test_compute.py | 12 +- python/pyarrow/tests/test_convert_builtin.py | 112 ++++++++++++++-- 3 files changed, 173 insertions(+), 79 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 7bca6885635..1b12186d3bb 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -695,6 +695,30 @@ class PyStructConverter : public StructConverter { return Status::OK(); } + Status InferKeyKind(PyObject* items) { + for (int i = 0; i < PySequence_Length(items); i++) { + // retrieve the key from the passed key-value pairs + ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); + + // check key exists between the unicode field names + bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first); + RETURN_IF_PYERROR(); + if (do_contain) { + key_kind_ = KeyKind::UNICODE; + return Status::OK(); + } + + // check key exists between the bytes field names + do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first); + RETURN_IF_PYERROR(); + if (do_contain) { + key_kind_ = KeyKind::BYTES; + return Status::OK(); + } + } + return Status::OK(); + } + Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->struct_builder_->AppendNull(); @@ -736,37 +760,6 @@ class PyStructConverter : public StructConverter { return Status::OK(); } - Status InferKeyKind(PyObject* items) { - for (int i = 0; i < PySequence_Length(items); i++) { - // retrieve the key from the passed key-value pairs - PyObject* tuple = PySequence_GetItem(items, i); - if (tuple == NULL) { - RETURN_IF_PYERROR(); - } - PyObject* key = PyTuple_GET_ITEM(tuple, 0); - if (key == NULL) { - RETURN_IF_PYERROR(); - } - - // check key exists between the unicode field names - bool do_contain = PySequence_Contains(unicode_field_names_.obj(), key); - RETURN_IF_PYERROR(); - if (do_contain) { - key_kind_ = KeyKind::UNICODE; - return 
Status::OK(); - } - - // check key exists between the bytes field names - do_contain = PySequence_Contains(bytes_field_names_.obj(), key); - RETURN_IF_PYERROR(); - if (do_contain) { - key_kind_ = KeyKind::BYTES; - return Status::OK(); - } - } - return Status::OK(); - } - Status AppendDict(PyObject* dict) { if (!PyDict_Check(dict)) { return internal::InvalidType(dict, "was expecting a dict"); @@ -787,27 +780,10 @@ class PyStructConverter : public StructConverter { } } - Status AppendDict(PyObject* dict, PyObject* field_names) { - // NOTE we're ignoring any extraneous dict items - for (int i = 0; i < num_fields_; i++) { - PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed - PyObject* value = PyDict_GetItem(dict, name); // borrowed - if (value == NULL) { - RETURN_IF_PYERROR(); - } - RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None)); - } - return Status::OK(); - } - Status AppendItems(PyObject* items) { if (!PySequence_Check(items)) { return internal::InvalidType(items, "was expecting a sequence of key-value items"); } - // TODO(kszucs): cover with tests - // if (PySequence_GET_SIZE(items) != num_fields_) { - // return Status::Invalid("Sequence size must be equal to number of struct fields"); - // } switch (key_kind_) { case KeyKind::UNICODE: return AppendItems(items, unicode_field_names_.obj()); @@ -824,26 +800,60 @@ class PyStructConverter : public StructConverter { } } - Status AppendItems(PyObject* items, PyObject* field_names) { + Status AppendDict(PyObject* dict, PyObject* field_names) { + // NOTE we're ignoring any extraneous dict items for (int i = 0; i < num_fields_; i++) { - PyObject* tuple = PySequence_GetItem(items, i); - if (tuple == NULL) { - RETURN_IF_PYERROR(); - } - PyObject* key = PyTuple_GET_ITEM(tuple, 0); - PyObject* value = PyTuple_GET_ITEM(tuple, 1); - if (key == NULL || value == NULL) { + PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed + PyObject* value = PyDict_GetItem(dict, name); // borrowed + if (value == NULL) { RETURN_IF_PYERROR(); } + RETURN_NOT_OK(this->children_[i]->Append(value ? 
value : Py_None)); + } + return Status::OK(); + } + + Result> GetKeyValuePair(PyObject* seq, int index) { + PyObject* pair = PySequence_GetItem(seq, index); + RETURN_IF_PYERROR(); + if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) { + return internal::InvalidType(pair, "was expecting tuple of (key, value) pair"); + } + PyObject* key = PyTuple_GetItem(pair, 0); + RETURN_IF_PYERROR(); + PyObject* value = PyTuple_GetItem(pair, 1); + RETURN_IF_PYERROR(); + return std::make_pair(key, value); + } + + Status AppendItems(PyObject* items, PyObject* field_names) { + auto length = static_cast(PySequence_Size(items)); + RETURN_IF_PYERROR(); + + // append the values for the defined fields + for (int i = 0; i < std::min(num_fields_, length); i++) { + // retrieve the key-value pair + ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); + + // validate that the key and the field name are equal PyObject* name = PyList_GET_ITEM(field_names, i); - bool are_equal = PyObject_RichCompareBool(key, name, Py_EQ); + bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ); RETURN_IF_PYERROR(); + + // finally append to the respective child builder if (are_equal) { - RETURN_NOT_OK(this->children_[i]->Append(value)); + RETURN_NOT_OK(this->children_[i]->Append(pair.second)); } else { - return Status::Invalid("Key not equal to the expected field name"); + ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first)); + ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name)); + return Status::Invalid("The expected field name is `", name_view.bytes, "` but `", + key_view.bytes, "` was given"); } } + // insert null values for missing fields + for (int i = length; i < num_fields_; i++) { + RETURN_NOT_OK(this->children_[i]->AppendNull()); + } return Status::OK(); } diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d64b6b10f42..27d10f98a92 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -234,20 +234,20 @@ def test_min_max(): # An example generated function wrapper with possible options data = [4, 5, 6, None, 1] s = pc.min_max(data) - assert s.as_py() == {'min': 1, 'max': 6} + assert s.as_py() == [('min', 1), ('max', 6)] s = pc.min_max(data, options=pc.MinMaxOptions()) - assert s.as_py() == {'min': 1, 'max': 6} + assert s.as_py() == [('min', 1), ('max', 6)] s = pc.min_max(data, options=pc.MinMaxOptions(null_handling='skip')) - assert s.as_py() == {'min': 1, 'max': 6} + assert s.as_py() == [('min', 1), ('max', 6)] s = pc.min_max(data, options=pc.MinMaxOptions(null_handling='emit_null')) - assert s.as_py() == {'min': None, 'max': None} + assert s.as_py() == [('min', None), ('max', None)] # Options as dict of kwargs s = pc.min_max(data, options={'null_handling': 'emit_null'}) - assert s.as_py() == {'min': None, 'max': None} + assert s.as_py() == [('min', None), ('max', None)] # Options as named functions arguments s = pc.min_max(data, null_handling='emit_null') - assert s.as_py() == {'min': None, 'max': None} + assert s.as_py() == [('min', None), ('max', None)] # Both options and named arguments with pytest.raises(TypeError): diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 8b695e309a5..70a8c7caea9 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -889,10 +889,10 @@ def test_sequence_timestamp(): 46, 57, 437699) -# TODO(kszucs): test pytz.StaticTzInfo like 
pytz.timezone('Etc/GMT+1') @pytest.mark.parametrize('timezone', [ None, 'UTC', + 'Etc/GMT-1', 'Europe/Budapest', ]) @pytest.mark.parametrize('unit', [ @@ -1534,30 +1534,114 @@ def test_struct_from_tuples(): pa.array([tup], type=ty) -# TODO(kszucs): test duplicated field name -# TODO(kszucs): test with empty elements -# TODO(kszucs): test with None elements -# TODO(kszucs): test with empty element at the first position because of -# inference def test_struct_from_list_of_pairs(): - ty = pa.struct([pa.field('a', pa.int32()), - pa.field('b', pa.string()), - pa.field('c', pa.bool_())]) + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) data = [ [('a', 5), ('b', 'foo'), ('c', True)], [('a', 6), ('b', 'bar'), ('c', False)], + None + ] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == data + + # test with duplicated field names + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('a', pa.string()), + pa.field('b', pa.bool_()) + ]) + data = [ + [('a', 5), ('a', 'foo'), ('b', True)], + [('a', 6), ('a', 'bar'), ('b', False)], ] arr = pa.array(data, type=ty) assert arr.to_pylist() == data + # test with empty elements + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) + data = [ + [], + [('a', 5), ('b', 'foo'), ('c', True)], + [('a', 2), ('b', 'baz')], + [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')], + ] + expected = [ + [('a', None), ('b', None), ('c', None)], + [('a', 5), ('b', 'foo'), ('c', True)], + [('a', 2), ('b', 'baz'), ('c', None)], + [('a', 1), ('b', 'bar'), ('c', False)], + ] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == expected + + +def test_struct_from_list_of_pairs_errors(): + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) + + # test that it raises if the key doesn't match the expected field name + data = [ + [], + [('a', 5), ('c', True), ('b', None)], + ] + msg = "The expected field name is `b` but `c` was given" + with pytest.raises(ValueError, match=msg): + pa.array(data, type=ty) + + # test various errors both at the first position and after because of key + # type inference + template = ( + r"Could not convert {} with type {}: was expecting tuple of " + r"\(key, value\) pair" + ) + cases = [ + tuple(), # empty key-value pair + tuple('a',), # missing value + tuple('unknown-key',), # not known field name + 'string', # not a tuple + ] + for key_value_pair in cases: + msg = template.format( + str(key_value_pair).replace('(', r'\(').replace(')', r'\)'), + type(key_value_pair).__name__ + ) + + with pytest.raises(TypeError, match=msg): + pa.array([ + [key_value_pair], + [('a', 5), ('b', 'foo'), ('c', None)], + ], type=ty) + + with pytest.raises(TypeError, match=msg): + pa.array([ + [('a', 5), ('b', 'foo'), ('c', None)], + [key_value_pair], + ], type=ty) + def test_struct_from_mixed_sequence(): # It is forbidden to mix dicts and tuples when initializing a struct array - ty = pa.struct([pa.field('a', pa.int32()), - pa.field('b', pa.string()), - pa.field('c', pa.bool_())]) - data = [(5, 'foo', True), - {'a': 6, 'b': 'bar', 'c': False}] + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) + data = [ + (5, 'foo', True), + None, + {'a': 6, 'b': 'bar', 'c': False} + ] with pytest.raises(TypeError): pa.array(data, type=ty) From 50efce92a413f830611b46e22df83662153d0a55 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Sep 2020 15:13:56 +0200 Subject: [PATCH 44/80] inline comments --- cpp/src/arrow/ipc/metadata_internal.cc | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 21 +++++++++++++++++++- cpp/src/arrow/util/converter.h | 6 ++++-- cpp/src/arrow/util/uri.cc | 2 +- python/pyarrow/tests/test_convert_builtin.py | 9 --------- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 9c967a5423d..fe4314921e0 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1154,7 +1154,7 @@ Status GetKeyValueMetadata(const KVVector* fb_metadata, auto metadata = std::make_shared(); metadata->reserve(fb_metadata->size()); - for (const auto pair : *fb_metadata) { + for (const auto& pair : *fb_metadata) { CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key"); CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value"); metadata->Append(pair->key()->str(), pair->value()->str()); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 1b12186d3bb..3e3607e64c9 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -63,11 +63,15 @@ using internal::StructConverter; namespace py { +// Utility for converting single python objects to their intermediate C representations +// which can be fed to the typed builders class PyValue { public: + // Type aliases for shorter signature definitions using I = PyObject*; using O = PyConversionOptions; + // Used for null checking before actually converting the values static bool IsNull(const O& options, I obj) { if (options.from_pandas) { return internal::PandasObjectIsNull(obj); @@ -76,10 +80,12 @@ class PyValue { } } + // Used for post-conversion numpy NaT sentinel checking static bool IsNaT(const TimestampType*, int64_t value) { return internal::npy_traits::isnull(value); } + // Used for post-conversion numpy NaT sentinel checking static bool IsNaT(const DurationType*, int64_t value) { return internal::npy_traits::isnull(value); } @@ -313,6 +319,11 @@ struct ValueConverter> { return value; } + // The binary-like intermediate representation is PyBytesView because it keeps temporary + // python objects alive (non-contiguous memoryview) and stores whether the original + // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // there is a non-unicode object observed. + static Result Convert(const BaseBinaryType*, const O&, I obj) { return PyBytesView::FromString(obj); } @@ -350,6 +361,9 @@ struct ValueConverter> { } }; +// Forward-declare the type-family specific converters to inject them to the PyConverter +// base class as type aliases. + template class PyPrimitiveConverter; @@ -361,8 +375,10 @@ class PyListConverter; class PyStructConverter; +// The base Converter class is a mixin with predefined behavior and constructors. class PyConverter : public Converter { public: + // Type aliases used by the parent Converter mixin's factory. 
template using Primitive = PyPrimitiveConverter; template @@ -371,6 +387,7 @@ class PyConverter : public Converter; using Struct = PyStructConverter; + // Convert and append a sequence of values Status Extend(PyObject* values, int64_t size) { /// Ensure we've allocated enough space RETURN_NOT_OK(this->Reserve(size)); @@ -380,6 +397,7 @@ class PyConverter : public ConverterReserve(size)); @@ -426,6 +444,7 @@ class PyPrimitiveConverter< } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + // Numpy NaT sentinels can be checked after the conversion if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->primitive_type_, converted)) { return this->primitive_builder_->AppendNull(); @@ -474,7 +493,7 @@ class PyPrimitiveConverter> Result> ToArray() override { ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); if (observed_binary_) { - // If we saw any non-unicode, cast results to BinaryArray + // if we saw any non-unicode, cast results to BinaryArray auto binary_type = TypeTraits::type_singleton(); return array->View(binary_type); } else { diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index ff6b06e2ef4..d0b74fd00cb 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -142,6 +142,8 @@ class Converter { OptionsType options() const { return options_; } + const std::vector> children() const { return children_; } + virtual Status Reserve(int64_t additional_capacity) { return builder_->Reserve(additional_capacity); } @@ -300,8 +302,8 @@ class Chunker : public BaseConverter { auto result = std::make_shared(); result->type_ = converter->type(); result->builder_ = converter->builder(); - // result->options_ = converter->options_; - // result->children_ = converter->children_; + result->options_ = converter->options(); + result->children_ = converter->children(); result->converter_ = std::move(converter); return result; } diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc index 1261607b6c1..795e3fa2c8b 100644 --- a/cpp/src/arrow/util/uri.cc +++ b/cpp/src/arrow/util/uri.cc @@ -169,7 +169,7 @@ std::string Uri::path() const { ss << "/"; } bool first = true; - for (const auto& seg : segments) { + for (const auto seg : segments) { if (!first) { ss << "/"; } diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 70a8c7caea9..265a6e0cdf1 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1968,12 +1968,3 @@ def test_nested_auto_chunking(ty, char): ('integer', 1), ('string-like', char) ] - - -def test_dictionary_conversion(): - data = [ - {"page_type": 1}, - {"record_type": 1}, - {"non_consecutive_home": 0}, - ] - pa.array(data, type=None, from_pandas=True, safe=True) From a3af6707e2a6fc095b38214c2511ddb8bcad4bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Sep 2020 15:51:21 +0200 Subject: [PATCH 45/80] flake8 --- python/pyarrow/includes/libarrow.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b260de89e7d..2042cbc2292 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1770,7 +1770,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: # TODO Some functions below are not actually "nogil" CResult[shared_ptr[CChunkedArray]] 
ConvertPySequence( - object obj, object mask, const PyConversionOptions& options, CMemoryPool* pool) + object obj, object mask, const PyConversionOptions& options, + CMemoryPool* pool) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) From 768c75cd6c430e1321be571ecd6a70a54bdd6af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Sep 2020 15:53:13 +0200 Subject: [PATCH 46/80] missing parameter documentation --- cpp/src/arrow/python/python_to_arrow.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 3081194d3d1..3d6862fc63a 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -68,7 +68,8 @@ struct PyConversionOptions { /// values in the sequence are null (true) or not null (false). This parameter /// may be null /// \param[in] options various conversion options -/// \return Result Array +/// \param[in] pool MemoryPool to use for allocations +/// \return Result ChunkedArray ARROW_PYTHON_EXPORT Result> ConvertPySequence( PyObject* obj, PyObject* mask, const PyConversionOptions& options, From 01b08fcd49fa66483c1adb4ae5fad7f7cca33574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Sep 2020 16:08:42 +0200 Subject: [PATCH 47/80] resolve rebase problem --- cpp/src/arrow/python/python_to_arrow.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 3e3607e64c9..3d476621ff7 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -110,10 +110,6 @@ class PyValue { } } -template -struct ValueConverter> { - using ValueType = typename Type::c_type; - template static enable_if_integer> Convert(const T*, const O&, I obj) { From 9691c87139108ec28b986e6ba50bb488d97c283b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Sep 2020 23:37:48 +0200 Subject: [PATCH 48/80] revert breaking StructScalar change --- python/pyarrow/scalar.pxi | 33 ++-- python/pyarrow/tests/test_array.py | 11 +- python/pyarrow/tests/test_compute.py | 20 +-- python/pyarrow/tests/test_convert_builtin.py | 168 +++++++------------ python/pyarrow/tests/test_orc.py | 3 - python/pyarrow/tests/test_pandas.py | 32 +--- python/pyarrow/tests/test_scalars.py | 41 +---- 7 files changed, 106 insertions(+), 202 deletions(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2784fa65bc7..5d444c55077 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import collections + cdef class Scalar(_Weakrefable): """ @@ -577,7 +579,7 @@ cdef class LargeListScalar(ListScalar): pass -cdef class StructScalar(Scalar): +cdef class StructScalar(Scalar, collections.abc.Mapping): """ Concrete class for struct scalars. 
""" @@ -586,6 +588,16 @@ cdef class StructScalar(Scalar): cdef CStructScalar* sp = self.wrapped.get() return sp.value.size() + def __iter__(self): + cdef: + CStructScalar* sp = self.wrapped.get() + CStructType* dtype = sp.type.get() + vector[shared_ptr[CField]] fields = dtype.fields() + + if sp.is_valid: + for i in range(dtype.num_fields()): + yield frombytes(fields[i].get().name()) + def __contains__(self, key): try: self[key] @@ -597,12 +609,10 @@ cdef class StructScalar(Scalar): def __getitem__(self, key): """ Return the child value for the given field. - Parameters ---------- index : Union[int, str] Index / position or name of the field. - Returns ------- result : Scalar @@ -626,27 +636,12 @@ cdef class StructScalar(Scalar): else: raise KeyError(key) - # TODO(kszucs): consider to remove __iter__, keys and values - def __iter__(self): - if self.is_valid: - for field in self.type: - yield field.name - - def keys(self): - return list(self) - - def values(self): - return [self[i] for i, _ in enumerate(self)] - - def items(self): - return [(key, self[i]) for i, key in enumerate(self)] - def as_py(self): """ Return this value as a Python dict. """ if self.is_valid: - return [(key, self[i].as_py()) for i, key in enumerate(self)] + return {k: v.as_py() for k, v in self.items()} else: return None diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 21a0a83ca65..13a167fd311 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -482,8 +482,8 @@ def test_struct_array_slice(): ty = pa.struct([pa.field('a', pa.int8()), pa.field('b', pa.float32())]) arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) - assert arr[1:].to_pylist() == [[('a', 3), ('b', 4.5)], - [('a', 5), ('b', 6.5)]] + assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5}, + {'a': 5, 'b': 6.5}] def test_array_factory_invalid_type(): @@ -619,11 +619,10 @@ def test_struct_from_arrays(): a = pa.array([4, 5, 6], type=pa.int64()) b = pa.array(["bar", None, ""]) c = pa.array([[1, 2], None, [3, None]]) - expected_list = [ - [('a', 4), ('b', 'bar'), ('c', [1, 2])], - [('a', 5), ('b', None), ('c', None)], - [('a', 6), ('b', ''), ('c', [3, None])], + {'a': 4, 'b': 'bar', 'c': [1, 2]}, + {'a': 5, 'b': None, 'c': None}, + {'a': 6, 'b': '', 'c': [3, None]}, ] # From field names diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 27d10f98a92..8b0859ccf39 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -204,21 +204,21 @@ def test_mode_array(): # ARROW-9917 arr = pa.array([1, 1, 3, 4, 3, 5], type='int64') expected = {"mode": 1, "count": 2} - assert pc.mode(arr).as_py() == [("mode", 1), ("count", 2)] + assert pc.mode(arr).as_py() == {"mode": 1, "count": 2} arr = pa.array([], type='int64') - expected = [("mode", None), ("count", None)] + expected = {"mode": None, "count": None} assert pc.mode(arr).as_py() == expected def test_mode_chunked_array(): # ARROW-9917 arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) - expected = [("mode", 1), ("count", 2)] + expected = {"mode": 1, "count": 2} assert pc.mode(arr).as_py() == expected arr = pa.chunked_array((), type='int64') - expected = [("mode", None), ("count", None)] + expected = {"mode": None, "count": None} assert arr.num_chunks == 0 assert pc.mode(arr).as_py() == expected @@ -234,20 +234,20 @@ def test_min_max(): # An example generated function wrapper with possible options data = [4, 5, 6, None, 1] s = pc.min_max(data) - 
assert s.as_py() == [('min', 1), ('max', 6)] + assert s.as_py() == {'min': 1, 'max': 6} s = pc.min_max(data, options=pc.MinMaxOptions()) - assert s.as_py() == [('min', 1), ('max', 6)] + assert s.as_py() == {'min': 1, 'max': 6} s = pc.min_max(data, options=pc.MinMaxOptions(null_handling='skip')) - assert s.as_py() == [('min', 1), ('max', 6)] + assert s.as_py() == {'min': 1, 'max': 6} s = pc.min_max(data, options=pc.MinMaxOptions(null_handling='emit_null')) - assert s.as_py() == [('min', None), ('max', None)] + assert s.as_py() == {'min': None, 'max': None} # Options as dict of kwargs s = pc.min_max(data, options={'null_handling': 'emit_null'}) - assert s.as_py() == [('min', None), ('max', None)] + assert s.as_py() == {'min': None, 'max': None} # Options as named functions arguments s = pc.min_max(data, null_handling='emit_null') - assert s.as_py() == [('min', None), ('max', None)] + assert s.as_py() == {'min': None, 'max': None} # Both options and named arguments with pytest.raises(TypeError): diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 265a6e0cdf1..0ad15cfc8a5 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1430,13 +1430,13 @@ def test_structarray(): ['ints', 'strs', 'bools']) expected = [ - [('ints', None), ('strs', 'a'), ('bools', True)], - [('ints', 2), ('strs', None), ('bools', False)], - [('ints', 3), ('strs', 'c'), ('bools', None)], + {'ints': None, 'strs': 'a', 'bools': True}, + {'ints': 2, 'strs': None, 'bools': False}, + {'ints': 3, 'strs': 'c', 'bools': None}, ] pylist = arr.to_pylist() - assert pylist == expected + assert pylist == expected, (pylist, expected) # len(names) != len(arrays) with pytest.raises(ValueError): @@ -1450,27 +1450,21 @@ def test_struct_from_dicts(): arr = pa.array([], type=ty) assert arr.to_pylist() == [] - data = [ - collections.OrderedDict([('a', 5), ('b', 'foo'), ('c', True)]), - collections.OrderedDict([('a', 6), ('b', 'bar'), ('c', False)]) - ] + data = [{'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data, type=ty) - assert arr.to_pylist() == _as_pairs(data) + assert arr.to_pylist() == data # With omitted values - data = [ - collections.OrderedDict([('a', 5), ('c', True)]), - None, - collections.OrderedDict([]), - collections.OrderedDict([('a', None), ('b', 'bar')]) - ] + data = [{'a': 5, 'c': True}, + None, + {}, + {'a': None, 'b': 'bar'}] arr = pa.array(data, type=ty) - expected = [ - [('a', 5), ('b', None), ('c', True)], - None, - [('a', None), ('b', None), ('c', None)], - [('a', None), ('b', 'bar'), ('c', None)] - ] + expected = [{'a': 5, 'b': None, 'c': True}, + None, + {'a': None, 'b': None, 'c': None}, + {'a': None, 'b': 'bar', 'c': None}] assert arr.to_pylist() == expected @@ -1482,16 +1476,13 @@ def test_struct_from_dicts_bytes_keys(): arr = pa.array([], type=ty) assert arr.to_pylist() == [] - data = [ - collections.OrderedDict([(b'a', 5), (b'b', 'foo')]), - collections.OrderedDict([(b'a', 6), (b'c', False)]), - ] + data = [{b'a': 5, b'b': 'foo'}, + {b'a': 6, b'c': False}] arr = pa.array(data, type=ty) - expected = [ - [('a', 5), ('b', 'foo'), ('c', None)], - [('a', 6), ('b', None), ('c', False)], + assert arr.to_pylist() == [ + {'a': 5, 'b': 'foo', 'c': None}, + {'a': 6, 'b': None, 'c': False}, ] - assert arr.to_pylist() == expected def test_struct_from_tuples(): @@ -1501,10 +1492,8 @@ def test_struct_from_tuples(): data = [(5, 'foo', True), (6, 'bar', False)] - expected = [ 
- [('a', 5), ('b', 'foo'), ('c', True)], - [('a', 6), ('b', 'bar'), ('c', False)] - ] + expected = [{'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data, type=ty) data_as_ndarray = np.empty(len(data), dtype=object) @@ -1515,16 +1504,12 @@ def test_struct_from_tuples(): assert arr.equals(arr2) # With omitted values - data = [ - (5, 'foo', None), - None, - (6, None, False) - ] - expected = [ - [('a', 5), ('b', 'foo'), ('c', None)], - None, - [('a', 6), ('b', None), ('c', False)], - ] + data = [(5, 'foo', None), + None, + (6, None, False)] + expected = [{'a': 5, 'b': 'foo', 'c': None}, + None, + {'a': 6, 'b': None, 'c': False}] arr = pa.array(data, type=ty) assert arr.to_pylist() == expected @@ -1546,7 +1531,11 @@ def test_struct_from_list_of_pairs(): None ] arr = pa.array(data, type=ty) - assert arr.to_pylist() == data + assert arr.to_pylist() == [ + {'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False}, + None + ] # test with duplicated field names ty = pa.struct([ @@ -1559,7 +1548,9 @@ def test_struct_from_list_of_pairs(): [('a', 6), ('a', 'bar'), ('b', False)], ] arr = pa.array(data, type=ty) - assert arr.to_pylist() == data + with pytest.raises(KeyError): + # TODO(kszucs): ARROW-9997 + arr.to_pylist() # test with empty elements ty = pa.struct([ @@ -1574,10 +1565,10 @@ def test_struct_from_list_of_pairs(): [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')], ] expected = [ - [('a', None), ('b', None), ('c', None)], - [('a', 5), ('b', 'foo'), ('c', True)], - [('a', 2), ('b', 'baz'), ('c', None)], - [('a', 1), ('b', 'bar'), ('c', False)], + {'a': None, 'b': None, 'c': None}, + {'a': 5, 'b': 'foo', 'c': True}, + {'a': 2, 'b': 'baz', 'c': None}, + {'a': 1, 'b': 'bar', 'c': False}, ] arr = pa.array(data, type=ty) assert arr.to_pylist() == expected @@ -1632,16 +1623,11 @@ def test_struct_from_list_of_pairs_errors(): def test_struct_from_mixed_sequence(): # It is forbidden to mix dicts and tuples when initializing a struct array - ty = pa.struct([ - pa.field('a', pa.int32()), - pa.field('b', pa.string()), - pa.field('c', pa.bool_()) - ]) - data = [ - (5, 'foo', True), - None, - {'a': 6, 'b': 'bar', 'c': False} - ] + ty = pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_())]) + data = [(5, 'foo', True), + {'a': 6, 'b': 'bar', 'c': False}] with pytest.raises(TypeError): pa.array(data, type=ty) @@ -1650,29 +1636,22 @@ def test_struct_from_dicts_inference(): expected_type = pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string()), pa.field('c', pa.bool_())]) - data = [ - collections.OrderedDict([('a', 5), ('b', 'foo'), ('c', True)]), - collections.OrderedDict([('a', 6), ('b', 'bar'), ('c', False)]) - ] - expected = [list(d.items()) for d in data] + data = [{'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data) check_struct_type(arr.type, expected_type) - assert arr.to_pylist() == expected + assert arr.to_pylist() == data # With omitted values - data = [ - collections.OrderedDict([('a', 5), ('c', True)]), - None, - collections.OrderedDict([]), - collections.OrderedDict([('a', None), ('b', 'bar')]) - ] - expected = [ - [('a', 5), ('b', None), ('c', True)], - None, - [('a', None), ('b', None), ('c', None)], - [('a', None), ('b', 'bar'), ('c', None)] - ] + data = [{'a': 5, 'c': True}, + None, + {}, + {'a': None, 'b': 'bar'}] + expected = [{'a': 5, 'b': None, 'c': True}, + None, + {'a': None, 'b': None, 'c': None}, + {'a': None, 'b': 'bar', 'c': None}] arr = 
pa.array(data) data_as_ndarray = np.empty(len(data), dtype=object) @@ -1688,42 +1667,17 @@ def test_struct_from_dicts_inference(): pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())), pa.field('ab', pa.bool_())])), pa.field('b', pa.string())]) - data = [ - collections.OrderedDict([ - ('a', collections.OrderedDict([('aa', [5, 6]), ('ab', True)])), - ('b', 'foo') - ]), - collections.OrderedDict([ - ('a', collections.OrderedDict([('aa', None), ('ab', False)])), - ('b', None), - ]), - collections.OrderedDict([ - ('a', None), - ('b', 'bar') - ]) - ] + data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'}, + {'a': {'aa': None, 'ab': False}, 'b': None}, + {'a': None, 'b': 'bar'}] arr = pa.array(data) - expected = [ - [ - ('a', [('aa', [5, 6]), ('ab', True)]), - ('b', 'foo') - ], - [ - ('a', [('aa', None), ('ab', False)]), - ('b', None) - ], - [ - ('a', None), - ('b', 'bar') - ] - ] - assert arr.to_pylist() == expected + assert arr.to_pylist() == data # Edge cases arr = pa.array([{}]) assert arr.type == pa.struct([]) - assert arr.to_pylist() == [[]] + assert arr.to_pylist() == [{}] # Mixing structs and scalars is rejected with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)): diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py index d44b41dde92..cc758867ad5 100644 --- a/python/pyarrow/tests/test_orc.py +++ b/python/pyarrow/tests/test_orc.py @@ -110,9 +110,6 @@ def check_example_file(orc_path, expected_df, need_fix=False): assert json_pos == orc_file.nrows -# FIXME(kszucs): need to update the expected format to contain list of pairs -# instead of a dictionary -@pytest.mark.skip @pytest.mark.pandas @pytest.mark.parametrize('filename', [ 'TestOrcFile.test1.orc', diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index d88114fa9e2..03407521c12 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2135,17 +2135,12 @@ def test_from_numpy(self): data = np.array([(42, True), (43, False)], dtype=dt) arr = pa.array(data, type=ty) - assert arr.to_pylist() == [ - [('x', 42), ('y', True)], - [('x', 43), ('y', False)] - ] + assert arr.to_pylist() == [{'x': 42, 'y': True}, + {'x': 43, 'y': False}] # With mask arr = pa.array(data, mask=np.bool_([False, True]), type=ty) - assert arr.to_pylist() == [ - [('x', 42), ('y', True)], - None - ] + assert arr.to_pylist() == [{'x': 42, 'y': True}, None] # Trivial struct type dt = np.dtype([]) @@ -2157,7 +2152,7 @@ def test_from_numpy(self): data = np.array([(), ()], dtype=dt) arr = pa.array(data, type=ty) - assert arr.to_pylist() == [[], []] + assert arr.to_pylist() == [{}, {}] def test_from_numpy_nested(self): # Note: an object field inside a struct @@ -2180,20 +2175,9 @@ def test_from_numpy_nested(self): ((1, True), 2, 'foo'), ((3, False), 4, 'bar')], dtype=dt) arr = pa.array(data, type=ty) - - expected = [ - [ - ('x', [('xx', 1), ('yy', True)]), - ('y', 2), - ('z', 'foo') - ], - [ - ('x', [('xx', 3), ('yy', False)]), - ('y', 4), - ('z', 'bar') - ] - ] - assert arr.to_pylist() == expected + assert arr.to_pylist() == [ + {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'}, + {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}] @pytest.mark.large_memory def test_from_numpy_large(self): @@ -2224,7 +2208,7 @@ def check(arr, data, mask=None): ys = data['y'] for i, obj in enumerate(iter_chunked_array(arr)): try: - d = dict(obj.as_py()) if obj.is_valid else obj.as_py() + d = obj.as_py() if mask is not None and mask[i]: assert d is None else: diff --git 
a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 447d116e107..fa48ad8b5f2 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import collections import datetime import decimal import pickle @@ -63,20 +62,14 @@ pa.Time32Scalar, pa.Time32Value), (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value), (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue), - (collections.OrderedDict([('a', 1), ('b', [1, 2])]), None, pa.StructScalar, - pa.StructValue), + ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, pa.MapValue), ]) def test_basics(value, ty, klass, deprecated): s = pa.scalar(value, type=ty) assert isinstance(s, klass) - - if isinstance(s, pa.StructScalar) and isinstance(value, dict): - assert s.as_py() == list(value.items()) - else: - assert s.as_py() == value - + assert s.as_py() == value assert s == pa.scalar(value, type=ty) assert s != value assert s != "else" @@ -463,7 +456,7 @@ def test_struct(): pa.field('y', pa.float32()) ]) - v = collections.OrderedDict([('x', 2), ('y', 3.5)]) + v = {'x': 2, 'y': 3.5} s = pa.scalar(v, type=ty) assert list(s) == list(s.keys()) == ['x', 'y'] assert list(s.values()) == [ @@ -478,10 +471,9 @@ def test_struct(): assert 'y' in s assert 'z' not in s - items = list(v.items()) - assert s.as_py() == items - assert repr(s) != repr(items) - assert repr(s.as_py()) == repr(items) + assert s.as_py() == v + assert repr(s) != repr(v) + assert repr(s.as_py()) == repr(v) assert len(s) == 2 assert isinstance(s['x'], pa.Int16Scalar) assert isinstance(s['y'], pa.FloatScalar) @@ -504,23 +496,6 @@ def test_struct(): assert s['y'].as_py() is None -def test_struct_duplicate_field_name(): - fields = [ - pa.field('x', pa.int64()), - pa.field('x', pa.string()) - ] - ty = pa.struct(fields) - - arr = pa.StructArray.from_arrays([ - pa.array([1, 2, 3]), - pa.array(["a", "b", "c"]) - ], fields=fields) - - assert arr[0] == pa.scalar([('x', 1), ('x', 'a')], type=ty) - assert arr[1] == pa.scalar([('x', 2), ('x', 'b')], type=ty) - assert arr[2] == pa.scalar([('x', 3), ('x', 'c')], type=ty) - - def test_map(): ty = pa.map_(pa.string(), pa.int8()) v = [('a', 1), ('b', 2)] @@ -531,8 +506,8 @@ def test_map(): assert isinstance(s.values, pa.Array) assert repr(s) == "" assert s.values.to_pylist() == [ - [('key', 'a'), ('value', 1)], - [('key', 'b'), ('value', 2)] + {'key': 'a', 'value': 1}, + {'key': 'b', 'value': 2} ] # test iteration From 62a5afd4eb8c872094ddeb25df7e22d640ab353c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 16 Sep 2020 00:06:53 +0200 Subject: [PATCH 49/80] fix large memory test case --- python/pyarrow/tests/test_convert_builtin.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 0ad15cfc8a5..c2360d9228c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1917,8 +1917,8 @@ def test_nested_auto_chunking(ty, char): assert len(arr.chunk(0)) == 21 assert len(arr.chunk(1)) == 1 - assert arr.chunk(1)[0].as_py() == [ - ('bool', True), - ('integer', 1), - ('string-like', char) - ] + assert arr.chunk(1)[0].as_py() == { + 'bool': True, + 'integer': 1, + 'string-like': char + } 
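The test expectations updated above all encode the same user-visible change in this series: struct values now convert to plain Python dicts instead of lists of (name, value) pairs when calling to_pylist() or Scalar.as_py(). A minimal sketch of the new behavior, assuming a pyarrow build that includes these patches (the struct type and values below are illustrative only, not taken from the test suite):

    import pyarrow as pa

    ty = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string())])
    arr = pa.array([{'a': 1, 'b': 'foo'}, None], type=ty)

    # Struct cells now round-trip as dicts rather than lists of key/value pairs
    assert arr.to_pylist() == [{'a': 1, 'b': 'foo'}, None]
    assert arr[0].as_py() == {'a': 1, 'b': 'foo'}
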
From 0e749a0f9d8420c949c43da781e6358cb59a40ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 16 Sep 2020 00:36:56 +0200 Subject: [PATCH 50/80] fix hypothesis test case --- python/pyarrow/tests/test_convert_builtin.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index c2360d9228c..e26947daa1e 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1868,8 +1868,18 @@ def test_dictionary_from_strings(): assert a.dictionary.equals(expected_dictionary) +def _has_unique_field_names(ty): + if isinstance(ty, pa.StructType): + field_names = [field.name for field in ty] + return len(set(field_names)) == len(field_names) + else: + return True + + @h.given(past.all_arrays) def test_array_to_pylist_roundtrip(arr): + # TODO(kszucs): ARROW-9997 + h.assume(_has_unique_field_names(arr.type)) seq = arr.to_pylist() restored = pa.array(seq, type=arr.type) assert restored.equals(arr) From 488fc7f4696dc6d8599fda2b663efe801669a02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 09:08:53 +0200 Subject: [PATCH 51/80] Apply suggestions from code review Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/array/builder_binary.h | 2 +- cpp/src/arrow/python/python_to_arrow.h | 6 +++--- cpp/src/arrow/scalar_test.cc | 3 +-- cpp/src/arrow/util/converter.h | 13 +++++-------- python/pyarrow/array.pxi | 12 ++++++------ 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 45e9aedd6f5..dbcba374230 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -329,7 +329,7 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow()); + ARROW_RETURN_NOT_OK(ValidateOverflow(num_bytes)); return offsets_builder_.Append(static_cast(num_bytes)); } diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 3d6862fc63a..1cd935993e6 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -39,7 +39,7 @@ class Status; namespace py { struct PyConversionOptions { - PyConversionOptions() : type(NULLPTR), size(-1), from_pandas(false) {} + PyConversionOptions() = default; PyConversionOptions(const std::shared_ptr& type, int64_t size, MemoryPool* pool, bool from_pandas) @@ -48,8 +48,8 @@ struct PyConversionOptions { // Set to null if to be inferred std::shared_ptr type; - // Default is -1: infer from data - int64_t size; + // Default is -1, which indicates the size should the same as the input sequence + int64_t size = -1; bool from_pandas = false; diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 3e3c8bd6698..dc8708f689e 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -627,8 +627,7 @@ TEST(TestDictionaryScalar, Basics) { gamma.dictionary = dict; auto scalar_null = MakeNullScalar(ty); - auto& dict_scalar_null = checked_cast(*scalar_null); - dict_scalar_null.value.dictionary = dict; + checked_cast(*scalar_null).value.dictionary = dict; auto scalar_alpha = DictionaryScalar(alpha, ty); auto scalar_gamma = DictionaryScalar(gamma, ty); diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 
d0b74fd00cb..ef00d99321a 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -32,8 +32,6 @@ namespace arrow { namespace internal { -using internal::checked_cast; -using internal::checked_pointer_cast; template class PrimitiveConverter : public BaseConverter { @@ -142,7 +140,7 @@ class Converter { OptionsType options() const { return options_; } - const std::vector> children() const { return children_; } + const std::vector>& children() const { return children_; } virtual Status Reserve(int64_t additional_capacity) { return builder_->Reserve(additional_capacity); @@ -262,12 +260,11 @@ struct MakeConverterImpl { ARROW_ASSIGN_OR_RAISE(child_converter, Converter::Make(field->type(), pool, options)); - // TODO: use move - child_converters.push_back(child_converter); child_builders.push_back(child_converter->builder()); + child_converters.push_back(std::move(child_converter)); } - auto builder = std::make_shared(type, pool, child_builders); + auto builder = std::make_shared(std::move(type), pool, std::move(child_builders)); return Finish(std::move(builder), std::move(child_converters)); } @@ -279,13 +276,13 @@ struct MakeConverterImpl { auto converter = new ConverterType(); converter->type_ = std::move(type); converter->builder_ = std::move(builder); - converter->options_ = options; + converter->options_ = std::move(options); converter->children_ = std::move(children); out->reset(converter); return Status::OK(); } - const std::shared_ptr type; + std::shared_ptr type; MemoryPool* pool; typename Converter::OptionsType options; std::shared_ptr* out; diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5a111d17383..91d324f6c9e 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -161,12 +161,12 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, representation). Timezone-naive data will be implicitly interpreted as UTC. - Converting to dictionary array will choose to use a larger integer type for - the indices if the number of distict values wouldn't fit to the range of - the passed type. This adaptive nature means that if there are more than 127 - values the returned dictionary array's key type is going to be pa.int16() - even if pa.int8() was passed to the function. Note that smaller key type - than the passed one won't be chosed. + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than 127 + values the returned dictionary array's index type will be at least pa.int16() + even if pa.int8() was passed to the function. Note that an explicit index type + will not be demoted even if it is wider than required. 
Examples -------- From 09faa6cb01b2b042a09237c045ee11b9954e01d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 09:21:18 +0200 Subject: [PATCH 52/80] move out Extend and ExtendMasked --- cpp/src/arrow/python/python_to_arrow.cc | 82 ++++++++++++------------- cpp/src/arrow/python/python_to_arrow.h | 2 +- cpp/src/arrow/util/converter.h | 4 +- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 3d476621ff7..71cb01bb9d4 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -382,35 +382,36 @@ class PyConverter : public Converter using List = PyListConverter; using Struct = PyStructConverter; - - // Convert and append a sequence of values - Status Extend(PyObject* values, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence(values, [this](PyObject* item, bool* /* unused */) { - return this->Append(item); - }); - } - - // Convert and append a sequence of values masked with a numpy array - Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - values, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } }; +// Convert and append a sequence of values +Status Extend(PyConverter* converter, PyObject* values, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) { + return converter->Append(item); + }); +} + +// Convert and append a sequence of values masked with a numpy array +Status ExtendMasked(PyConverter* converter, PyObject* values, PyObject* mask, + int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return converter->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return converter->Append(item); // perhaps use AppendValue instead? 
+ } + }); +} + template class PyPrimitiveConverter< T, enable_if_t::value || is_boolean_type::value || @@ -533,18 +534,18 @@ class PyDictionaryConverter> // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ - case Type::TYPE_ID: { \ - if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ - return this->value_converter_->Extend(value, size); \ - } \ - return AppendNdarrayTyped(ndarray); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return Extend(this->value_converter_.get(), value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ } // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE_ID) \ - case Type::TYPE_ID: { \ - return this->value_converter_->Extend(value, size); \ +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: { \ + return Extend(this->value_converter_.get(), value, size); \ } template @@ -587,7 +588,7 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); - return this->value_converter_->Extend(value, size); + return Extend(this->value_converter_.get(), value, size); } Status AppendNdarray(PyObject* value) { @@ -928,13 +929,12 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz } Result> ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& opts, + PyConversionOptions options, MemoryPool* pool) { PyAcquireGIL lock; PyObject* seq; OwnedRef tmp_seq_nanny; - PyConversionOptions options = opts; // copy options struct since we modify it below int64_t size = options.size; RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); @@ -956,9 +956,9 @@ Result> ConvertPySequence(PyObject* obj, PyObject* // Convert values if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); + RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); } else { - RETURN_NOT_OK(chunked_converter->Extend(seq, size)); + RETURN_NOT_OK(Extend(chunked_converter.get(), seq, size)); } return chunked_converter->ToChunkedArray(); } diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 1cd935993e6..d167996ba8d 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -72,7 +72,7 @@ struct PyConversionOptions { /// \return Result ChunkedArray ARROW_PYTHON_EXPORT Result> ConvertPySequence( - PyObject* obj, PyObject* mask, const PyConversionOptions& options, + PyObject* obj, PyObject* mask, PyConversionOptions options, MemoryPool* pool = default_memory_pool()); } // namespace py diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index ef00d99321a..f1e3dd6efdb 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -32,7 +32,6 @@ namespace arrow { namespace internal { - template class PrimitiveConverter : public BaseConverter { public: @@ -264,7 +263,8 @@ struct MakeConverterImpl { child_converters.push_back(std::move(child_converter)); } - auto builder = std::make_shared(std::move(type), pool, std::move(child_builders)); + auto builder = + std::make_shared(std::move(type), pool, std::move(child_builders)); return Finish(std::move(builder), 
std::move(child_converters)); } From 35415e2d8f7ba6c481a2aaf4a1b13f92b9a361c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 11:06:19 +0200 Subject: [PATCH 53/80] use trait to map types to converters --- cpp/src/arrow/python/python_to_arrow.cc | 132 +++++++++++++----------- cpp/src/arrow/util/converter.h | 87 ++++++++-------- 2 files changed, 115 insertions(+), 104 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 71cb01bb9d4..cb1b588c94c 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -58,6 +58,7 @@ using internal::Chunker; using internal::Converter; using internal::DictionaryConverter; using internal::ListConverter; +using internal::MakeConverter; using internal::PrimitiveConverter; using internal::StructConverter; @@ -357,60 +358,39 @@ class PyValue { } }; -// Forward-declare the type-family specific converters to inject them to the PyConverter -// base class as type aliases. - -template -class PyPrimitiveConverter; - -template -class PyDictionaryConverter; - -template -class PyListConverter; - -class PyStructConverter; - // The base Converter class is a mixin with predefined behavior and constructors. class PyConverter : public Converter { public: - // Type aliases used by the parent Converter mixin's factory. - template - using Primitive = PyPrimitiveConverter; - template - using Dictionary = PyDictionaryConverter; - template - using List = PyListConverter; - using Struct = PyStructConverter; + // Convert and append a sequence of values + Status Extend(PyObject* values, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(values, [this](PyObject* item, bool* /* unused */) { + return this->Append(item); + }); + } + + // Convert and append a sequence of values masked with a numpy array + Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return this->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return this->Append(item); // perhaps use AppendValue instead? 
+ } + }); + } }; -// Convert and append a sequence of values -Status Extend(PyConverter* converter, PyObject* values, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(converter->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) { - return converter->Append(item); - }); -} - -// Convert and append a sequence of values masked with a numpy array -Status ExtendMasked(PyConverter* converter, PyObject* values, PyObject* mask, - int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(converter->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return converter->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return converter->Append(item); // perhaps use AppendValue instead? - } - }); -} +template +class PyPrimitiveConverter; template class PyPrimitiveConverter< @@ -502,6 +482,9 @@ class PyPrimitiveConverter> bool observed_binary_ = false; }; +template +class PyDictionaryConverter; + template class PyDictionaryConverter> : public DictionaryConverter { @@ -534,18 +517,18 @@ class PyDictionaryConverter> // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ - case Type::TYPE_ID: { \ - if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ - return Extend(this->value_converter_.get(), value, size); \ - } \ - return AppendNdarrayTyped(ndarray); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return this->value_converter_->Extend(value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ } // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE_ID) \ - case Type::TYPE_ID: { \ - return Extend(this->value_converter_.get(), value, size); \ +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: { \ + return this->value_converter_->Extend(value, size); \ } template @@ -588,7 +571,7 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); - return Extend(this->value_converter_.get(), value, size); + return this->value_converter_->Extend(value, size); } Status AppendNdarray(PyObject* value) { @@ -885,6 +868,30 @@ class PyStructConverter : public StructConverter { int num_fields_; }; +template +struct PyConverterTrait; + +template +struct PyConverterTrait> { + using type = PyPrimitiveConverter; +}; + +template +struct PyConverterTrait> { + using type = PyListConverter; +}; + +template <> +struct PyConverterTrait { + using type = PyStructConverter; +}; + +template <> +struct PyConverterTrait { + template + using type = PyDictionaryConverter; +}; + // Convert *obj* to a sequence if necessary // Fill *size* to its length. If >= 0 on entry, *size* is an upper size // bound that may lead to truncation. 
@@ -951,14 +958,15 @@ Result> ConvertPySequence(PyObject* obj, PyObject* } DCHECK_GE(size, 0); - ARROW_ASSIGN_OR_RAISE(auto converter, PyConverter::Make(options.type, pool, options)); + ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( + options.type, pool, options))); ARROW_ASSIGN_OR_RAISE(auto chunked_converter, Chunker::Make(converter)); // Convert values if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); + RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); } else { - RETURN_NOT_OK(Extend(chunked_converter.get(), seq, size)); + RETURN_NOT_OK(chunked_converter->Extend(seq, size)); } return chunked_converter->ToChunkedArray(); } diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index f1e3dd6efdb..9ee786931d9 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -98,7 +98,7 @@ class DictionaryConverter : public BaseConverter { BuilderType* value_builder_; }; -template +template class ConverterTrait> struct MakeConverterImpl; template @@ -107,25 +107,19 @@ class Converter { using InputType = Input; using OptionsType = Options; - template - using Primitive = PrimitiveConverter; - template - using List = ListConverter; - template - using Dictionary = DictionaryConverter; - using Struct = StructConverter; - - static Result> Make(std::shared_ptr type, - MemoryPool* pool, OptionsType options) { - std::shared_ptr out; - MakeConverterImpl visitor = {type, pool, options, &out}; - ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); - ARROW_RETURN_NOT_OK(out->Init()); - return out; - } - virtual ~Converter() = default; + virtual Status Initialize(std::shared_ptr type, + std::shared_ptr builder, + const std::vector>& children, + OptionsType options) { + type_ = std::move(type); + builder_ = std::move(builder); + children_ = std::move(children); + options_ = std::move(options); + return Init(); + } + virtual Status Init() { return Status::OK(); } virtual Status Append(InputType value) { @@ -155,27 +149,37 @@ class Converter { } protected: - friend struct MakeConverterImpl; - std::shared_ptr type_; std::shared_ptr builder_; std::vector> children_; OptionsType options_; }; -#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - return Finish>( \ - std::move(builder), {}); \ +template class ConverterTrait> +struct MakeConverterImpl; + +template class ConverterTrait> +static Result> MakeConverter( + std::shared_ptr type, MemoryPool* pool, + typename Converter::OptionsType options) { + std::shared_ptr out; + MakeConverterImpl visitor = {type, pool, options, &out}; + ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); + return out; +} + +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + return Finish::template type>( \ + std::move(builder), {}); \ break; -template +template class ConverterTrait> struct MakeConverterImpl { Status Visit(const NullType& t) { - using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename Converter::template Primitive; + using ConverterType = typename ConverterTrait::type; - auto builder = std::make_shared(pool); + auto builder = std::make_shared(pool); return Finish(std::move(builder), {}); } @@ -185,7 +189,7 @@ struct MakeConverterImpl { Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename Converter::template Primitive; + using ConverterType = typename ConverterTrait::type; auto builder = 
std::make_shared(type, pool); return Finish(std::move(builder), {}); @@ -195,22 +199,23 @@ struct MakeConverterImpl { enable_if_t::value && !std::is_same::value, Status> Visit(const T& t) { using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename Converter::template List; + using ConverterType = typename ConverterTrait::type; - ARROW_ASSIGN_OR_RAISE(auto child_converter, - Converter::Make(t.value_type(), pool, options)); + ARROW_ASSIGN_OR_RAISE(auto child_converter, (MakeConverter( + t.value_type(), pool, options))); auto builder = std::make_shared(pool, child_converter->builder(), type); return Finish(std::move(builder), {std::move(child_converter)}); } Status Visit(const MapType& t) { - using ConverterType = typename Converter::template List; + using ConverterType = typename ConverterTrait::type; // TODO(kszucs): seems like builders not respect field nullability std::vector> struct_fields{t.key_field(), t.item_field()}; auto struct_type = std::make_shared(struct_fields); - ARROW_ASSIGN_OR_RAISE(auto struct_converter, - Converter::Make(struct_type, pool, options)); + ARROW_ASSIGN_OR_RAISE( + auto struct_converter, + (MakeConverter(struct_type, pool, options))); auto struct_builder = struct_converter->builder(); auto key_builder = struct_builder->child_builder(0); @@ -249,15 +254,15 @@ struct MakeConverterImpl { } Status Visit(const StructType& t) { - using ConverterType = typename Converter::Struct; + using ConverterType = typename ConverterTrait::type; std::shared_ptr child_converter; std::vector> child_converters; std::vector> child_builders; for (const auto& field : t.fields()) { - ARROW_ASSIGN_OR_RAISE(child_converter, - Converter::Make(field->type(), pool, options)); + ARROW_ASSIGN_OR_RAISE(child_converter, (MakeConverter( + field->type(), pool, options))); child_builders.push_back(child_converter->builder()); child_converters.push_back(std::move(child_converter)); @@ -274,10 +279,8 @@ struct MakeConverterImpl { Status Finish(std::shared_ptr builder, std::vector> children) { auto converter = new ConverterType(); - converter->type_ = std::move(type); - converter->builder_ = std::move(builder); - converter->options_ = std::move(options); - converter->children_ = std::move(children); + ARROW_RETURN_NOT_OK(converter->Initialize(std::move(type), std::move(builder), + std::move(children), std::move(options))); out->reset(converter); return Status::OK(); } From f4922ecabf7724821b2a49929611fe955fde3d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 11:33:02 +0200 Subject: [PATCH 54/80] rename template argument --- cpp/src/arrow/python/python_to_arrow.cc | 14 +++++++------- cpp/src/arrow/util/converter.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index cb1b588c94c..b7090455467 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -482,12 +482,12 @@ class PyPrimitiveConverter> bool observed_binary_ = false; }; -template +template class PyDictionaryConverter; -template -class PyDictionaryConverter> - : public DictionaryConverter { +template +class PyDictionaryConverter> + : public DictionaryConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { @@ -500,9 +500,9 @@ class PyDictionaryConverter> } }; -template -class PyDictionaryConverter> - : public DictionaryConverter { +template +class PyDictionaryConverter> + : 
public DictionaryConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 9ee786931d9..b817634bf91 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -135,7 +135,7 @@ class Converter { const std::vector>& children() const { return children_; } - virtual Status Reserve(int64_t additional_capacity) { + Status Reserve(int64_t additional_capacity) { return builder_->Reserve(additional_capacity); } From 76938ca6cb1bfd02af190208c7e275587b7c0948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 12:07:12 +0200 Subject: [PATCH 55/80] flake8 --- python/pyarrow/array.pxi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 91d324f6c9e..9c2c0395758 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -163,10 +163,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, Converting to dictionary array will promote to a wider integer type for indices if the number of distinct values cannot be represented, even if - the index type was explicitly set. This means that if there are more than 127 - values the returned dictionary array's index type will be at least pa.int16() - even if pa.int8() was passed to the function. Note that an explicit index type - will not be demoted even if it is wider than required. + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. 
     Examples
     --------

From d485bc14ff42c202ae4d3f72c567eaacf06ce173 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= 
Date: Sat, 19 Sep 2020 12:17:21 +0200
Subject: [PATCH 56/80] resolve rebase conflicts

---
 cpp/src/arrow/util/converter.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h
index b817634bf91..91ca572da15 100644
--- a/cpp/src/arrow/util/converter.h
+++ b/cpp/src/arrow/util/converter.h
@@ -239,11 +239,8 @@ struct MakeConverterImpl {
       DICTIONARY_CASE(UINT16, UInt16Type);
       DICTIONARY_CASE(UINT32, UInt32Type);
       DICTIONARY_CASE(UINT64, UInt64Type);
-      DICTIONARY_CASE(HALF_FLOAT, HalfFloatType);
       DICTIONARY_CASE(FLOAT, FloatType);
       DICTIONARY_CASE(DOUBLE, DoubleType);
-      DICTIONARY_CASE(DATE32, Date32Type);
-      DICTIONARY_CASE(DATE64, Date64Type);
       DICTIONARY_CASE(BINARY, BinaryType);
       DICTIONARY_CASE(STRING, StringType);
       DICTIONARY_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType);

From 2c63f7db37d7ccbfa668a58033091bf5f99b253a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= 
Date: Sat, 19 Sep 2020 12:42:47 +0200
Subject: [PATCH 57/80] chunker should not inherit from converter

---
 cpp/src/arrow/python/python_to_arrow.cc | 82 ++++++++++++-------------
 cpp/src/arrow/util/converter.h          | 33 +++++-----
 2 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index b7090455467..e061c1d13f3 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -358,36 +358,36 @@ class PyValue {
   }
 };
 
+template <typename T>
+Status Extend(T* converter, PyObject* values, int64_t size) {
+  /// Ensure we've allocated enough space
+  RETURN_NOT_OK(converter->Reserve(size));
+  // Iterate over the items adding each one
+  return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) {
+    return converter->Append(item);
+  });
+}
+
+// Convert and append a sequence of values masked with a numpy array
+template <typename T>
+Status ExtendMasked(T* converter, PyObject* values, PyObject* mask, int64_t size) {
+  /// Ensure we've allocated enough space
+  RETURN_NOT_OK(converter->Reserve(size));
+  // Iterate over the items adding each one
+  return internal::VisitSequenceMasked(
+      values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) {
+        if (is_masked) {
+          return converter->AppendNull();
+        } else {
+          // This will also apply the null-checking convention in the event
+          // that the value is not masked
+          return converter->Append(item);  // perhaps use AppendValue instead?
+        }
+      });
+}
+
 // The base Converter class is a mixin with predefined behavior and constructors.
-class PyConverter : public Converter { - public: - // Convert and append a sequence of values - Status Extend(PyObject* values, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence(values, [this](PyObject* item, bool* /* unused */) { - return this->Append(item); - }); - } - - // Convert and append a sequence of values masked with a numpy array - Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - values, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } -}; +class PyConverter : public Converter {}; template class PyPrimitiveConverter; @@ -517,18 +517,18 @@ class PyDictionaryConverter> // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ - case Type::TYPE_ID: { \ - if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ - return this->value_converter_->Extend(value, size); \ - } \ - return AppendNdarrayTyped(ndarray); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return Extend(this->value_converter_.get(), value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ } // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE_ID) \ - case Type::TYPE_ID: { \ - return this->value_converter_->Extend(value, size); \ +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: { \ + return Extend(this->value_converter_.get(), value, size); \ } template @@ -571,7 +571,7 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); - return this->value_converter_->Extend(value, size); + return Extend(this->value_converter_.get(), value, size); } Status AppendNdarray(PyObject* value) { @@ -964,9 +964,9 @@ Result> ConvertPySequence(PyObject* obj, PyObject* // Convert values if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); + RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); } else { - RETURN_NOT_OK(chunked_converter->Extend(seq, size)); + RETURN_NOT_OK(Extend(chunked_converter.get(), seq, size)); } return chunked_converter->ToChunkedArray(); } diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 91ca572da15..c929130db11 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -139,7 +139,7 @@ class Converter { return builder_->Reserve(additional_capacity); } - virtual Status AppendNull() { return builder_->AppendNull(); } + Status AppendNull() { return builder_->AppendNull(); } virtual Result> ToArray() { return builder_->Finish(); } @@ -288,27 +288,26 @@ struct MakeConverterImpl { std::shared_ptr* out; }; -// TODO(kszucs): rename to AutoChunker -template -class Chunker : public BaseConverter { +template +class Chunker { public: - 
using Self = Chunker; - using InputType = typename BaseConverter::InputType; + using Self = Chunker; + using InputType = typename Converter::InputType; - static Result> Make(std::shared_ptr converter) { + static Result> Make(std::shared_ptr converter) { auto result = std::make_shared(); - result->type_ = converter->type(); - result->builder_ = converter->builder(); - result->options_ = converter->options(); - result->children_ = converter->children(); result->converter_ = std::move(converter); return result; } - Status AppendNull() override { + Status Reserve(int64_t additional_capacity) { + return converter_->Reserve(additional_capacity); + } + + Status AppendNull() { auto status = converter_->AppendNull(); if (status.ok()) { - length_ = this->builder_->length(); + length_ = converter_->builder()->length(); } else if (status.IsCapacityError()) { ARROW_RETURN_NOT_OK(FinishChunk()); return converter_->AppendNull(); @@ -316,10 +315,10 @@ class Chunker : public BaseConverter { return status; } - Status Append(InputType value) override { + Status Append(InputType value) { auto status = converter_->Append(value); if (status.ok()) { - length_ = this->builder_->length(); + length_ = converter_->builder()->length(); } else if (status.IsCapacityError()) { ARROW_RETURN_NOT_OK(FinishChunk()); return Append(value); @@ -329,7 +328,7 @@ class Chunker : public BaseConverter { Status FinishChunk() { ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_)); - this->builder_->Reset(); + converter_->builder()->Reset(); length_ = 0; chunks_.push_back(chunk); return Status::OK(); @@ -342,7 +341,7 @@ class Chunker : public BaseConverter { protected: int64_t length_ = 0; - std::shared_ptr converter_; + std::shared_ptr converter_; std::vector> chunks_; }; From 68c337325b464f429d2ef5baf6cf42f37f5f886e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 12:55:23 +0200 Subject: [PATCH 58/80] make chunker construction more straightforward --- cpp/src/arrow/python/python_to_arrow.cc | 7 +- cpp/src/arrow/util/converter.h | 123 +++++++++---------- python/pyarrow/tests/test_convert_builtin.py | 2 - 3 files changed, 65 insertions(+), 67 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index e061c1d13f3..4b09bbd32b0 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -54,14 +54,15 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using internal::Chunker; using internal::Converter; using internal::DictionaryConverter; using internal::ListConverter; -using internal::MakeConverter; using internal::PrimitiveConverter; using internal::StructConverter; +using internal::MakeChunker; +using internal::MakeConverter; + namespace py { // Utility for converting single python objects to their intermediate C representations @@ -960,7 +961,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, pool, options))); - ARROW_ASSIGN_OR_RAISE(auto chunked_converter, Chunker::Make(converter)); + ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(converter)); // Convert values if (mask != nullptr && mask != Py_None) { diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c929130db11..39fec133ccf 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -32,6 +32,60 @@ namespace arrow { namespace internal { +template +class 
Converter { + public: + using InputType = Input; + using OptionsType = Options; + + virtual ~Converter() = default; + + virtual Status Initialize(std::shared_ptr type, + std::shared_ptr builder, + const std::vector>& children, + OptionsType options) { + type_ = std::move(type); + builder_ = std::move(builder); + children_ = std::move(children); + options_ = std::move(options); + return Init(); + } + + virtual Status Init() { return Status::OK(); } + + virtual Status Append(InputType value) { + return Status::NotImplemented("Converter not implemented for type ", + type()->ToString()); + } + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + const std::vector>& children() const { return children_; } + + Status Reserve(int64_t additional_capacity) { + return builder_->Reserve(additional_capacity); + } + + Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> ToArray() { return builder_->Finish(); } + + virtual Result> ToArray(int64_t length) { + ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); + return arr->Slice(0, length); + } + + protected: + std::shared_ptr type_; + std::shared_ptr builder_; + std::vector> children_; + OptionsType options_; +}; + template class PrimitiveConverter : public BaseConverter { public: @@ -101,63 +155,6 @@ class DictionaryConverter : public BaseConverter { template class ConverterTrait> struct MakeConverterImpl; -template -class Converter { - public: - using InputType = Input; - using OptionsType = Options; - - virtual ~Converter() = default; - - virtual Status Initialize(std::shared_ptr type, - std::shared_ptr builder, - const std::vector>& children, - OptionsType options) { - type_ = std::move(type); - builder_ = std::move(builder); - children_ = std::move(children); - options_ = std::move(options); - return Init(); - } - - virtual Status Init() { return Status::OK(); } - - virtual Status Append(InputType value) { - return Status::NotImplemented("Converter not implemented for type ", - type()->ToString()); - } - - const std::shared_ptr& builder() const { return builder_; } - - const std::shared_ptr& type() const { return type_; } - - OptionsType options() const { return options_; } - - const std::vector>& children() const { return children_; } - - Status Reserve(int64_t additional_capacity) { - return builder_->Reserve(additional_capacity); - } - - Status AppendNull() { return builder_->AppendNull(); } - - virtual Result> ToArray() { return builder_->Finish(); } - - virtual Result> ToArray(int64_t length) { - ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); - return arr->Slice(0, length); - } - - protected: - std::shared_ptr type_; - std::shared_ptr builder_; - std::vector> children_; - OptionsType options_; -}; - -template class ConverterTrait> -struct MakeConverterImpl; - template class ConverterTrait> static Result> MakeConverter( std::shared_ptr type, MemoryPool* pool, @@ -294,11 +291,8 @@ class Chunker { using Self = Chunker; using InputType = typename Converter::InputType; - static Result> Make(std::shared_ptr converter) { - auto result = std::make_shared(); - result->converter_ = std::move(converter); - return result; - } + explicit Chunker(std::shared_ptr converter) + : converter_(std::move(converter)) {} Status Reserve(int64_t additional_capacity) { return converter_->Reserve(additional_capacity); @@ -345,5 +339,10 @@ class Chunker { std::vector> chunks_; }; +template +static Result>> 
MakeChunker(std::shared_ptr converter) { + return std::make_shared>(std::move(converter)); +} + } // namespace internal } // namespace arrow diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index e26947daa1e..708656e876b 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1800,8 +1800,6 @@ def test_dictionary_from_boolean(): pa.uint64(), pa.float32(), pa.float64(), - pa.date32(), - pa.date64(), ]) def test_dictionary_from_integers(value_type): typ = pa.dictionary(pa.int8(), value_type=value_type) From ca7238e7306ea35b0d9fe2d1c0c0ef826fb7e0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 19 Sep 2020 13:02:50 +0200 Subject: [PATCH 59/80] more verbose dictionary scalar naming in the python bindings --- python/pyarrow/includes/libarrow.pxd | 9 +++++---- python/pyarrow/scalar.pxi | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2042cbc2292..d1c4110cb40 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -969,14 +969,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: vector[shared_ptr[CScalar]] value CResult[shared_ptr[CScalar]] field(CFieldRef ref) const - cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": + cdef cppclass CDictionaryScalarIndexAndDictionary \ + "arrow::DictionaryScalar::ValueType": shared_ptr[CScalar] index shared_ptr[CArray] dictionary cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): - CDictionaryScalar(CDictionaryValue value, shared_ptr[CDataType], - c_bool is_valid) - CDictionaryValue value + CDictionaryScalar(CDictionaryScalarIndexAndDictionary value, + shared_ptr[CDataType], c_bool is_valid) + CDictionaryScalarIndexAndDictionary value CResult[shared_ptr[CScalar]] GetEncodedValue() cdef cppclass CUnionScalar" arrow::UnionScalar"(CScalar): diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 5d444c55077..53219dee238 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -687,7 +687,7 @@ cdef class DictionaryScalar(Scalar): @classmethod def _reconstruct(cls, type, is_valid, index, dictionary): cdef: - CDictionaryValue value + CDictionaryScalarIndexAndDictionary value shared_ptr[CDictionaryScalar] wrapped DataType type_ Scalar index_ From dd5fa70f4c6dff1f1829121bb661457124c19a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 11:13:55 +0200 Subject: [PATCH 60/80] address review comments --- cpp/src/arrow/python/python_to_arrow.cc | 48 ++++++++++++------------- cpp/src/arrow/util/converter.h | 26 +++++++------- 2 files changed, 36 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 4b09bbd32b0..824438c632c 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -388,7 +388,7 @@ Status ExtendMasked(T* converter, PyObject* values, PyObject* mask, int64_t size } // The base Converter class is a mixin with predefined behavior and constructors. 
-class PyConverter : public Converter {}; +using PyConverter = Converter; template class PyPrimitiveConverter; @@ -611,9 +611,7 @@ class PyListConverter : public ListConverter { return Status::Invalid( "Can only convert list types from NumPy object array input"); } - return internal::VisitSequence(value, [this](PyObject* item, bool*) { - return this->value_converter_->Append(item); - }); + return Extend(this->value_converter_.get(), value, /*reserved=*/0); } default: { return Status::TypeError("Unknown list item type: ", value_type->ToString()); @@ -658,6 +656,27 @@ class PyListConverter : public ListConverter { class PyStructConverter : public StructConverter { public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->struct_builder_->AppendNull(); + } + switch (input_kind_) { + case InputKind::DICT: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendDict(value); + case InputKind::TUPLE: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendTuple(value); + case InputKind::ITEMS: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendItems(value); + default: + RETURN_NOT_OK(InferInputKind(value)); + return Append(value); + } + } + + protected: Status Init() override { RETURN_NOT_OK(StructConverter::Init()); @@ -719,26 +738,6 @@ class PyStructConverter : public StructConverter { return Status::OK(); } - Status Append(PyObject* value) override { - if (PyValue::IsNull(this->options_, value)) { - return this->struct_builder_->AppendNull(); - } - switch (input_kind_) { - case InputKind::DICT: - RETURN_NOT_OK(this->struct_builder_->Append()); - return AppendDict(value); - case InputKind::TUPLE: - RETURN_NOT_OK(this->struct_builder_->Append()); - return AppendTuple(value); - case InputKind::ITEMS: - RETURN_NOT_OK(this->struct_builder_->Append()); - return AppendItems(value); - default: - RETURN_NOT_OK(InferInputKind(value)); - return Append(value); - } - } - Status AppendEmpty() { for (int i = 0; i < num_fields_; i++) { RETURN_NOT_OK(this->children_[i]->Append(Py_None)); @@ -857,7 +856,6 @@ class PyStructConverter : public StructConverter { return Status::OK(); } - protected: // Whether we're converting from a sequence of dicts or tuples or list of pairs enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN; // Whether the input dictionary keys' type is python bytes or unicode diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 39fec133ccf..4187d15167a 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -32,18 +32,18 @@ namespace arrow { namespace internal { -template +template class Converter { public: + using Self = Converter; using InputType = Input; using OptionsType = Options; virtual ~Converter() = default; - virtual Status Initialize(std::shared_ptr type, - std::shared_ptr builder, - const std::vector>& children, - OptionsType options) { + Status Construct(std::shared_ptr type, std::shared_ptr builder, + const std::vector>& children, + OptionsType options) { type_ = std::move(type); builder_ = std::move(builder); children_ = std::move(children); @@ -51,8 +51,6 @@ class Converter { return Init(); } - virtual Status Init() { return Status::OK(); } - virtual Status Append(InputType value) { return Status::NotImplemented("Converter not implemented for type ", type()->ToString()); @@ -80,6 +78,8 @@ class Converter { } protected: + virtual Status Init() { return Status::OK(); } + std::shared_ptr type_; 
std::shared_ptr builder_; std::vector> children_; @@ -91,13 +91,13 @@ class PrimitiveConverter : public BaseConverter { public: using BuilderType = typename TypeTraits::BuilderType; + protected: Status Init() override { primitive_type_ = checked_cast(this->type_.get()); primitive_builder_ = checked_cast(this->builder_.get()); return Status::OK(); } - protected: const T* primitive_type_; BuilderType* primitive_builder_; }; @@ -107,6 +107,7 @@ class ListConverter : public BaseConverter { public: using BuilderType = typename TypeTraits::BuilderType; + protected: Status Init() override { list_type_ = checked_cast(this->type_.get()); list_builder_ = checked_cast(this->builder_.get()); @@ -114,7 +115,6 @@ class ListConverter : public BaseConverter { return Status::OK(); } - protected: const T* list_type_; BuilderType* list_builder_; std::shared_ptr value_converter_; @@ -123,13 +123,13 @@ class ListConverter : public BaseConverter { template class StructConverter : public BaseConverter { public: + protected: Status Init() override { struct_type_ = checked_cast(this->type_.get()); struct_builder_ = checked_cast(this->builder_.get()); return Status::OK(); } - protected: const StructType* struct_type_; StructBuilder* struct_builder_; }; @@ -139,6 +139,7 @@ class DictionaryConverter : public BaseConverter { public: using BuilderType = DictionaryBuilder; + protected: Status Init() override { dict_type_ = checked_cast(this->type_.get()); value_type_ = checked_cast(dict_type_->value_type().get()); @@ -146,7 +147,6 @@ class DictionaryConverter : public BaseConverter { return Status::OK(); } - protected: const DictionaryType* dict_type_; const U* value_type_; BuilderType* value_builder_; @@ -273,8 +273,8 @@ struct MakeConverterImpl { Status Finish(std::shared_ptr builder, std::vector> children) { auto converter = new ConverterType(); - ARROW_RETURN_NOT_OK(converter->Initialize(std::move(type), std::move(builder), - std::move(children), std::move(options))); + ARROW_RETURN_NOT_OK(converter->Construct(std::move(type), std::move(builder), + std::move(children), std::move(options))); out->reset(converter); return Status::OK(); } From c70552b9c8255c6d21afb77ee5fd967bceb93866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 11:37:35 +0200 Subject: [PATCH 61/80] address review comments --- cpp/src/arrow/python/python_to_arrow.cc | 33 +++++++++++++------------ 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 824438c632c..734f05d59c7 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -535,22 +535,6 @@ class PyDictionaryConverter> template class PyListConverter : public ListConverter { public: - Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } - - Status ValidateOverflow(const BaseListType*, int64_t size) { - return this->list_builder_->ValidateOverflow(size); - } - - Status ValidateBuilder(const MapType*) { - if (this->list_builder_->key_builder()->null_count() > 0) { - return Status::Invalid("Invalid Map: key field can not contain null values"); - } else { - return Status::OK(); - } - } - - Status ValidateBuilder(const BaseListType*) { return Status::OK(); } - Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { return this->list_builder_->AppendNull(); @@ -569,6 +553,23 @@ class PyListConverter : public ListConverter { return 
ValidateBuilder(this->list_type_); } + protected: + Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } + + Status ValidateOverflow(const BaseListType*, int64_t size) { + return this->list_builder_->ValidateOverflow(size); + } + + Status ValidateBuilder(const MapType*) { + if (this->list_builder_->key_builder()->null_count() > 0) { + return Status::Invalid("Invalid Map: key field can not contain null values"); + } else { + return Status::OK(); + } + } + + Status ValidateBuilder(const BaseListType*) { return Status::OK(); } + Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); From 21e932be0ac953f1683edd34470194928111caf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 18:07:35 +0200 Subject: [PATCH 62/80] address review comments --- cpp/src/arrow/array/builder_nested.cc | 12 ++ cpp/src/arrow/array/builder_nested.h | 3 + cpp/src/arrow/array/builder_primitive.h | 5 + cpp/src/arrow/python/python_to_arrow.cc | 79 +++++---- cpp/src/arrow/util/converter.h | 213 +++++++++--------------- 5 files changed, 148 insertions(+), 164 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 978416df565..c3786a8fab4 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -54,6 +54,18 @@ MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr& ke : MapBuilder(pool, key_builder, item_builder, map(key_builder->type(), item_builder->type(), keys_sorted)) {} +MapBuilder::MapBuilder(MemoryPool* pool, + const std::shared_ptr& struct_builder, + const std::shared_ptr& type) + : ArrayBuilder(pool) { + auto map_type = internal::checked_cast(type.get()); + keys_sorted_ = map_type->keys_sorted(); + key_builder_ = struct_builder->child_builder(0); + item_builder_ = struct_builder->child_builder(1); + list_builder_ = + std::make_shared(pool, struct_builder, struct_builder->type()); +} + Status MapBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(list_builder_->Resize(capacity)); capacity_ = list_builder_->capacity(); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 48ddc862790..7735d0df338 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -237,6 +237,9 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, const std::shared_ptr& item_builder, bool keys_sorted = false); + MapBuilder(MemoryPool* pool, const std::shared_ptr& item_builder, + const std::shared_ptr& type); + Status Resize(int64_t capacity) override; void Reset() override; Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e6b1baa5879..b875dd2cf59 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -31,6 +31,11 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {} + explicit NullBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool()) + : NullBuilder(pool) { + // ARROW_CHECK_EQ(type->id(), NullType::type_id); + } /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { diff --git 
a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 734f05d59c7..d4eeeed57b8 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -393,6 +393,48 @@ using PyConverter = Converter; template class PyPrimitiveConverter; +template +class PyListConverter; + +template +class PyDictionaryConverter; + +class PyStructConverter; + +template +struct PyConverterTrait; + +template +struct PyConverterTrait> { + using type = PyPrimitiveConverter; +}; + +template +struct PyConverterTrait> { + using type = void; +}; + +template +struct PyConverterTrait> { + using type = void; +}; + +template +struct PyConverterTrait> { + using type = PyListConverter; +}; + +template <> +struct PyConverterTrait { + using type = PyStructConverter; +}; + +template <> +struct PyConverterTrait { + template + using type = PyDictionaryConverter; +}; + template class PyPrimitiveConverter< T, enable_if_t::value || is_boolean_type::value || @@ -483,9 +525,6 @@ class PyPrimitiveConverter> bool observed_binary_ = false; }; -template -class PyDictionaryConverter; - template class PyDictionaryConverter> : public DictionaryConverter { @@ -533,7 +572,7 @@ class PyDictionaryConverter> } template -class PyListConverter : public ListConverter { +class PyListConverter : public ListConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { @@ -655,7 +694,7 @@ class PyListConverter : public ListConverter { } }; -class PyStructConverter : public StructConverter { +class PyStructConverter : public StructConverter { public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { @@ -678,8 +717,8 @@ class PyStructConverter : public StructConverter { } protected: - Status Init() override { - RETURN_NOT_OK(StructConverter::Init()); + Status Init(MemoryPool* pool) override { + RETURN_NOT_OK((StructConverter::Init(pool))); // Store the field names as a PyObjects for dict matching num_fields_ = this->struct_type_->num_fields(); @@ -868,30 +907,6 @@ class PyStructConverter : public StructConverter { int num_fields_; }; -template -struct PyConverterTrait; - -template -struct PyConverterTrait> { - using type = PyPrimitiveConverter; -}; - -template -struct PyConverterTrait> { - using type = PyListConverter; -}; - -template <> -struct PyConverterTrait { - using type = PyStructConverter; -}; - -template <> -struct PyConverterTrait { - template - using type = PyDictionaryConverter; -}; - // Convert *obj* to a sequence if necessary // Fill *size* to its length. If >= 0 on entry, *size* is an upper size // bound that may lead to truncation. 
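The PyConverterTrait specializations introduced in this hunk pick a concrete converter class from an Arrow type tag at compile time, so the generic MakeConverter machinery can instantiate PyPrimitiveConverter, PyListConverter, PyStructConverter or PyDictionaryConverter without any runtime registry. A minimal, self-contained sketch of that trait-dispatch pattern follows; the type tags and converter names are invented for illustration and are not the actual Arrow classes.

#include <iostream>
#include <type_traits>

// Invented type tags standing in for Arrow DataType subclasses.
struct Int64Tag { static constexpr const char* name = "int64"; };
struct ListTag  { static constexpr const char* name = "list"; };

template <typename T>
struct is_primitive_tag : std::is_same<T, Int64Tag> {};

// Primary trait template: specializations map a tag to a converter class, in the
// same spirit as PyConverterTrait mapping enable_if-selected type families to
// PyPrimitiveConverter / PyListConverter.
template <typename T, typename Enable = void>
struct ConverterTrait;

template <typename T>
struct PrimitiveConverter {
  void Append(long v) { std::cout << T::name << " append " << v << "\n"; }
};

template <typename T>
struct ListConverter {
  void Append(long v) { std::cout << T::name << " append element " << v << "\n"; }
};

template <typename T>
struct ConverterTrait<T, std::enable_if_t<is_primitive_tag<T>::value>> {
  using type = PrimitiveConverter<T>;
};

template <>
struct ConverterTrait<ListTag> {
  using type = ListConverter<ListTag>;
};

int main() {
  typename ConverterTrait<Int64Tag>::type ints;   // resolves to PrimitiveConverter<Int64Tag>
  typename ConverterTrait<ListTag>::type lists;   // resolves to ListConverter<ListTag>
  ints.Append(42);
  lists.Append(7);
  return 0;
}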
@@ -959,7 +974,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* DCHECK_GE(size, 0); ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( - options.type, pool, options))); + options.type, options, pool))); ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(converter)); // Convert values diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 4187d15167a..d798f7d6ad0 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -32,6 +32,11 @@ namespace arrow { namespace internal { +template class ConverterTrait> +static Result> MakeConverter( + std::shared_ptr type, typename BaseConverter::OptionsType options, + MemoryPool* pool); + template class Converter { public: @@ -41,14 +46,11 @@ class Converter { virtual ~Converter() = default; - Status Construct(std::shared_ptr type, std::shared_ptr builder, - const std::vector>& children, - OptionsType options) { + Status Construct(std::shared_ptr type, OptionsType options, + MemoryPool* pool) { type_ = std::move(type); - builder_ = std::move(builder); - children_ = std::move(children); options_ = std::move(options); - return Init(); + return Init(pool); } virtual Status Append(InputType value) { @@ -78,7 +80,7 @@ class Converter { } protected: - virtual Status Init() { return Status::OK(); } + virtual Status Init(MemoryPool* pool) { return Status::OK(); } std::shared_ptr type_; std::shared_ptr builder_; @@ -92,9 +94,10 @@ class PrimitiveConverter : public BaseConverter { using BuilderType = typename TypeTraits::BuilderType; protected: - Status Init() override { - primitive_type_ = checked_cast(this->type_.get()); - primitive_builder_ = checked_cast(this->builder_.get()); + Status Init(MemoryPool* pool) override { + this->builder_ = std::make_shared(this->type_, pool); + this->primitive_type_ = checked_cast(this->type_.get()); + this->primitive_builder_ = checked_cast(this->builder_.get()); return Status::OK(); } @@ -102,16 +105,22 @@ class PrimitiveConverter : public BaseConverter { BuilderType* primitive_builder_; }; -template +template class ConverterTrait> class ListConverter : public BaseConverter { public: using BuilderType = typename TypeTraits::BuilderType; + using ConverterType = typename ConverterTrait::type; protected: - Status Init() override { + Status Init(MemoryPool* pool) override { list_type_ = checked_cast(this->type_.get()); + ARROW_ASSIGN_OR_RAISE(value_converter_, + (MakeConverter( + list_type_->value_type(), this->options_, pool))); + this->builder_ = + std::make_shared(pool, value_converter_->builder(), this->type_); + this->children_ = {value_converter_}; list_builder_ = checked_cast(this->builder_.get()); - value_converter_ = this->children_[0]; return Status::OK(); } @@ -120,13 +129,29 @@ class ListConverter : public BaseConverter { std::shared_ptr value_converter_; }; -template +template class ConverterTrait> class StructConverter : public BaseConverter { public: + using ConverterType = typename ConverterTrait::type; + protected: - Status Init() override { + Status Init(MemoryPool* pool) override { + std::shared_ptr child_converter; + std::vector> child_builders; + struct_type_ = checked_cast(this->type_.get()); + for (const auto& field : struct_type_->fields()) { + ARROW_ASSIGN_OR_RAISE(child_converter, + (MakeConverter( + field->type(), this->options_, pool))); + child_builders.push_back(child_converter->builder()); + this->children_.push_back(std::move(child_converter)); + } + + this->builder_ = + std::make_shared(this->type_, pool, 
std::move(child_builders)); struct_builder_ = checked_cast(this->builder_.get()); + return Status::OK(); } @@ -140,7 +165,10 @@ class DictionaryConverter : public BaseConverter { using BuilderType = DictionaryBuilder; protected: - Status Init() override { + Status Init(MemoryPool* pool) override { + std::unique_ptr builder; + ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder)); + this->builder_ = std::move(builder); dict_type_ = checked_cast(this->type_.get()); value_type_ = checked_cast(dict_type_->value_type().get()); value_builder_ = checked_cast(this->builder_.get()); @@ -152,139 +180,60 @@ class DictionaryConverter : public BaseConverter { BuilderType* value_builder_; }; -template class ConverterTrait> -struct MakeConverterImpl; - -template class ConverterTrait> -static Result> MakeConverter( - std::shared_ptr type, MemoryPool* pool, - typename Converter::OptionsType options) { - std::shared_ptr out; - MakeConverterImpl visitor = {type, pool, options, &out}; - ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &visitor)); - return out; -} - -#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - return Finish::template type>( \ - std::move(builder), {}); \ - break; - -template class ConverterTrait> +template class ConverterTrait> struct MakeConverterImpl { - Status Visit(const NullType& t) { - using ConverterType = typename ConverterTrait::type; - - auto builder = std::make_shared(pool); - return Finish(std::move(builder), {}); - } - - template - enable_if_t::value && !is_interval_type::value && - !is_dictionary_type::value && !is_extension_type::value, - Status> - Visit(const T& t) { - using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename ConverterTrait::type; - - auto builder = std::make_shared(type, pool); - return Finish(std::move(builder), {}); - } - - template - enable_if_t::value && !std::is_same::value, Status> - Visit(const T& t) { - using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename ConverterTrait::type; - - ARROW_ASSIGN_OR_RAISE(auto child_converter, (MakeConverter( - t.value_type(), pool, options))); - auto builder = std::make_shared(pool, child_converter->builder(), type); - return Finish(std::move(builder), {std::move(child_converter)}); - } - - Status Visit(const MapType& t) { - using ConverterType = typename ConverterTrait::type; - - // TODO(kszucs): seems like builders not respect field nullability - std::vector> struct_fields{t.key_field(), t.item_field()}; - auto struct_type = std::make_shared(struct_fields); - ARROW_ASSIGN_OR_RAISE( - auto struct_converter, - (MakeConverter(struct_type, pool, options))); - - auto struct_builder = struct_converter->builder(); - auto key_builder = struct_builder->child_builder(0); - auto item_builder = struct_builder->child_builder(1); - auto builder = std::make_shared(pool, key_builder, item_builder, type); - - return Finish(std::move(builder), {std::move(struct_converter)}); + template ::type> + Status Visit(const T&) { + out.reset(new ConverterType()); + return out->Construct(std::move(type), std::move(options), pool); } Status Visit(const DictionaryType& t) { - std::unique_ptr builder; - ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, NULLPTR, &builder)); - switch (t.value_type()->id()) { - DICTIONARY_CASE(BOOL, BooleanType); - DICTIONARY_CASE(INT8, Int8Type); - DICTIONARY_CASE(INT16, Int16Type); - DICTIONARY_CASE(INT32, Int32Type); - DICTIONARY_CASE(INT64, Int64Type); - DICTIONARY_CASE(UINT8, UInt8Type); - 
DICTIONARY_CASE(UINT16, UInt16Type); - DICTIONARY_CASE(UINT32, UInt32Type); - DICTIONARY_CASE(UINT64, UInt64Type); - DICTIONARY_CASE(FLOAT, FloatType); - DICTIONARY_CASE(DOUBLE, DoubleType); - DICTIONARY_CASE(BINARY, BinaryType); - DICTIONARY_CASE(STRING, StringType); - DICTIONARY_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType); +#define DICTIONARY_CASE(TYPE) \ + case TYPE::type_id: \ + out = std::make_shared< \ + typename ConverterTrait::template type>(); \ + break; + DICTIONARY_CASE(BooleanType); + DICTIONARY_CASE(Int8Type); + DICTIONARY_CASE(Int16Type); + DICTIONARY_CASE(Int32Type); + DICTIONARY_CASE(Int64Type); + DICTIONARY_CASE(UInt8Type); + DICTIONARY_CASE(UInt16Type); + DICTIONARY_CASE(UInt32Type); + DICTIONARY_CASE(UInt64Type); + DICTIONARY_CASE(FloatType); + DICTIONARY_CASE(DoubleType); + DICTIONARY_CASE(BinaryType); + DICTIONARY_CASE(StringType); + DICTIONARY_CASE(FixedSizeBinaryType); default: return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(), " not implemented"); } - } - - Status Visit(const StructType& t) { - using ConverterType = typename ConverterTrait::type; - - std::shared_ptr child_converter; - std::vector> child_converters; - std::vector> child_builders; - - for (const auto& field : t.fields()) { - ARROW_ASSIGN_OR_RAISE(child_converter, (MakeConverter( - field->type(), pool, options))); - - child_builders.push_back(child_converter->builder()); - child_converters.push_back(std::move(child_converter)); - } - - auto builder = - std::make_shared(std::move(type), pool, std::move(child_builders)); - return Finish(std::move(builder), std::move(child_converters)); + return out->Construct(std::move(type), std::move(options), pool); } Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } - template - Status Finish(std::shared_ptr builder, - std::vector> children) { - auto converter = new ConverterType(); - ARROW_RETURN_NOT_OK(converter->Construct(std::move(type), std::move(builder), - std::move(children), std::move(options))); - out->reset(converter); - return Status::OK(); - } - std::shared_ptr type; + typename BaseConverter::OptionsType options; MemoryPool* pool; - typename Converter::OptionsType options; - std::shared_ptr* out; + std::shared_ptr out; }; +template class ConverterTrait> +static Result> MakeConverter( + std::shared_ptr type, typename BaseConverter::OptionsType options, + MemoryPool* pool) { + MakeConverterImpl visitor{ + std::move(type), std::move(options), pool, nullptr}; + ARROW_RETURN_NOT_OK(VisitTypeInline(*visitor.type, &visitor)); + return std::move(visitor.out); +} + template class Chunker { public: From 55452816ec3a9d994b3e45074a4419edc093de42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 18:28:00 +0200 Subject: [PATCH 63/80] fix specialization --- cpp/src/arrow/array/builder_primitive.h | 5 +---- cpp/src/arrow/python/python_to_arrow.cc | 14 +++----------- cpp/src/arrow/util/converter.h | 1 - 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index b875dd2cf59..e6bd03946f8 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -32,10 +32,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {} explicit NullBuilder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()) - : NullBuilder(pool) { - // 
ARROW_CHECK_EQ(type->id(), NullType::type_id); - } + MemoryPool* pool = default_memory_pool()) : NullBuilder(pool) {} /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index d4eeeed57b8..ee69be7f216 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -405,20 +405,12 @@ template struct PyConverterTrait; template -struct PyConverterTrait> { +struct PyConverterTrait< + T, enable_if_t::value && !is_interval_type::value && + !is_extension_type::value>> { using type = PyPrimitiveConverter; }; -template -struct PyConverterTrait> { - using type = void; -}; - -template -struct PyConverterTrait> { - using type = void; -}; - template struct PyConverterTrait> { using type = PyListConverter; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index d798f7d6ad0..2138234fb3a 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -26,7 +26,6 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" - #include "arrow/visitor_inline.h" namespace arrow { From c52c367f6cc0eeb0fdfac88edb07a881c4d0f7a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 18:28:25 +0200 Subject: [PATCH 64/80] clang format --- cpp/src/arrow/array/builder_primitive.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e6bd03946f8..cc907fb6b8a 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -32,7 +32,8 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {} explicit NullBuilder(const std::shared_ptr& type, - MemoryPool* pool = default_memory_pool()) : NullBuilder(pool) {} + MemoryPool* pool = default_memory_pool()) + : NullBuilder(pool) {} /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { From 4b426d5e406fc8fd91954e778aaa38a851842fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 20:01:32 +0200 Subject: [PATCH 65/80] fix overflow checking --- cpp/src/arrow/array/builder_binary.h | 2 +- cpp/src/arrow/util/converter.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index dbcba374230..45e9aedd6f5 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -329,7 +329,7 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow(num_bytes)); + ARROW_RETURN_NOT_OK(ValidateOverflow()); return offsets_builder_.Append(static_cast(num_bytes)); } diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 2138234fb3a..f6705bbeff7 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -236,7 +236,6 @@ static Result> MakeConverter( template class Chunker { public: - using Self = Chunker; using InputType = typename Converter::InputType; explicit Chunker(std::shared_ptr converter) From d823c6493abd1bdf8d6d3f9cebc807b37d3fcd6b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 20:14:07 +0200 Subject: [PATCH 66/80] address review comments --- cpp/src/arrow/python/python_to_arrow.cc | 36 ++++++++++++------------- cpp/src/arrow/util/converter.h | 11 ++++---- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index ee69be7f216..0620d0400a6 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -424,7 +424,7 @@ struct PyConverterTrait { template <> struct PyConverterTrait { template - using type = PyDictionaryConverter; + using dictionary_type = PyDictionaryConverter; }; template @@ -547,22 +547,6 @@ class PyDictionaryConverter> } }; -// If the value type does not match the expected NumPy dtype, then fall through -// to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ - case Type::TYPE_ID: { \ - if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ - return Extend(this->value_converter_.get(), value, size); \ - } \ - return AppendNdarrayTyped(ndarray); \ - } - -// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE_ID) \ - case Type::TYPE_ID: { \ - return Extend(this->value_converter_.get(), value, size); \ - } - template class PyListConverter : public ListConverter { public: @@ -585,7 +569,7 @@ class PyListConverter : public ListConverter { } protected: - Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } + //Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } Status ValidateOverflow(const BaseListType*, int64_t size) { return this->list_builder_->ValidateOverflow(size); @@ -617,6 +601,20 @@ class PyListConverter : public ListConverter { const auto value_type = this->value_converter_->builder()->type(); switch (value_type->id()) { +// If the value type does not match the expected NumPy dtype, then fall through +// to a slower PySequence-based path +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return Extend(this->value_converter_.get(), value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ + } +// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: { \ + return Extend(this->value_converter_.get(), value, size); \ + } LIST_SLOW_CASE(NA) LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8) LIST_FAST_CASE(INT8, Int8Type, NPY_INT8) @@ -638,6 +636,8 @@ class PyListConverter : public ListConverter { LIST_SLOW_CASE(BINARY) LIST_SLOW_CASE(FIXED_SIZE_BINARY) LIST_SLOW_CASE(STRING) +#undef LIST_FAST_CASE +#undef LIST_SLOW_CASE case Type::LIST: { if (PyArray_DESCR(ndarray)->type_num != NPY_OBJECT) { return Status::Invalid( diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index f6705bbeff7..cf0fedbbec6 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -189,10 +189,10 @@ struct MakeConverterImpl { Status Visit(const DictionaryType& t) { switch (t.value_type()->id()) { -#define DICTIONARY_CASE(TYPE) \ - case TYPE::type_id: \ - out = std::make_shared< \ - typename ConverterTrait::template type>(); \ +#define DICTIONARY_CASE(TYPE) \ + case TYPE::type_id: \ + out = std::make_shared< \ + typename ConverterTrait::template dictionary_type>(); \ break; DICTIONARY_CASE(BooleanType); DICTIONARY_CASE(Int8Type); @@ -208,6 +208,7 
@@ struct MakeConverterImpl { DICTIONARY_CASE(BinaryType); DICTIONARY_CASE(StringType); DICTIONARY_CASE(FixedSizeBinaryType); +#undef DICTIONARY_CASE default: return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(), " not implemented"); @@ -228,7 +229,7 @@ static Result> MakeConverter( std::shared_ptr type, typename BaseConverter::OptionsType options, MemoryPool* pool) { MakeConverterImpl visitor{ - std::move(type), std::move(options), pool, nullptr}; + std::move(type), std::move(options), pool, NULLPTR}; ARROW_RETURN_NOT_OK(VisitTypeInline(*visitor.type, &visitor)); return std::move(visitor.out); } From 57e6c7f495b75e8ee4ae5eb3c6ea47ea915fd8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 21 Sep 2020 21:17:55 +0200 Subject: [PATCH 67/80] implement ValidateOverflow for map type; test list overflow chunking --- cpp/src/arrow/array/builder_nested.h | 4 ++++ cpp/src/arrow/python/python_to_arrow.cc | 2 -- python/pyarrow/tests/test_convert_builtin.py | 16 +++++++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 7735d0df338..bb16170565b 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -289,6 +289,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { return map(key_builder_->type(), item_builder_->type(), keys_sorted_); } + Status ValidateOverflow(int64_t new_elements) { + return list_builder_->ValidateOverflow(new_elements); + } + protected: inline Status AdjustStructBuilderLength(); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 0620d0400a6..c679782b642 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -569,8 +569,6 @@ class PyListConverter : public ListConverter { } protected: - //Status ValidateOverflow(const MapType*, int64_t size) { return Status::OK(); } - Status ValidateOverflow(const BaseListType*, int64_t size) { return this->list_builder_->ValidateOverflow(size); } diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 708656e876b..a3750de4892 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1884,7 +1884,7 @@ def test_array_to_pylist_roundtrip(arr): @pytest.mark.large_memory -def test_auto_chunking(): +def test_auto_chunking_binary_like(): v1 = b'x' * 100000000 v2 = b'x' * 147483646 data = [v1] * 20 + [v2] @@ -1899,6 +1899,20 @@ def test_auto_chunking(): assert arr.chunk(1).to_pylist() == [b'x'] +@pytest.mark.slow +@pytest.mark.large_memory +def test_auto_chunking_list_like(): + item = np.ones((2**28,), dtype='uint8') + data = [item] * (2**3 - 1) + arr = pa.array(data, type=pa.list_(pa.uint8())) + assert isinstance(arr, pa.Array) + + item = np.ones((2**28,), dtype='uint8') + data = [item] * 2**3 + arr = pa.array(data, type=pa.list_(pa.uint8())) + assert isinstance(arr, pa.ChunkedArray) + + @pytest.mark.large_memory @pytest.mark.parametrize(('ty', 'char'), [ (pa.string(), 'x'), From 45a93cfe682f4ea2e201e2cfc6a6df7b71954d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 22 Sep 2020 14:06:31 +0200 Subject: [PATCH 68/80] address review comments --- cpp/src/arrow/util/converter.h | 5 +---- python/pyarrow/array.pxi | 3 ++- python/pyarrow/scalar.pxi | 1 + python/pyarrow/tests/test_types.py | 2 +- 4 files changed, 5 
insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index cf0fedbbec6..2b85a483185 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -52,10 +52,7 @@ class Converter { return Init(pool); } - virtual Status Append(InputType value) { - return Status::NotImplemented("Converter not implemented for type ", - type()->ToString()); - } + virtual Status Append(InputType value) = 0; const std::shared_ptr& builder() const { return builder_; } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9c2c0395758..d70e15c50a5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -33,6 +33,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, options.size = size options.from_pandas = from_pandas + options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) with nogil: chunked = GetResultValue( @@ -194,7 +195,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, ] >>> import numpy as np - >>> pa.array(pd.Series([1, 2]), np.array([0, 1], dtype=bool)) + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) [ 1, diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 53219dee238..0ea197df34e 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -609,6 +609,7 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): def __getitem__(self, key): """ Return the child value for the given field. + Parameters ---------- index : Union[int, str] diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 345f469be1e..9ac2f01686f 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -285,7 +285,7 @@ def test_tzinfo_to_string_errors(): @h.given(tzst.timezones()) -def test_timezone_roundtrip(tz): +def test_pytz_timezone_roundtrip(tz): timezone_string = pa.lib.tzinfo_to_string(tz) timezone_tzinfo = pa.lib.string_to_tzinfo(timezone_string) assert timezone_tzinfo == tz From 3359f47fabc5e3d137f6ca7fea2b4cb77649a2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 22 Sep 2020 15:57:26 +0200 Subject: [PATCH 69/80] test PYARROW_IGNORE_TIMEZONE; test map array overflow --- cpp/src/arrow/python/python_to_arrow.cc | 15 +++---- python/pyarrow/tests/test_convert_builtin.py | 45 ++++++++++++++++++++ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index c679782b642..870fa8e7ba4 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -236,11 +236,12 @@ class PyValue { } static Result Convert(const TimestampType* type, const O& options, I obj) { - int64_t value; + int64_t value, offset; if (PyDateTime_Check(obj)) { - ARROW_ASSIGN_OR_RAISE(int64_t offset, internal::PyDateTime_utcoffset_s(obj)); - if (options.ignore_timezone) { + if (ARROW_PREDICT_FALSE(options.ignore_timezone)) { offset = 0; + } else { + ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj)); } auto dt = reinterpret_cast(obj); switch (type->unit()) { @@ -569,10 +570,6 @@ class PyListConverter : public ListConverter { } protected: - Status ValidateOverflow(const BaseListType*, int64_t size) { - return this->list_builder_->ValidateOverflow(size); - } - Status ValidateBuilder(const MapType*) { if (this->list_builder_->key_builder()->null_count() > 0) { return Status::Invalid("Invalid Map: key 
field can not contain null values"); @@ -585,7 +582,7 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); - RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); + RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); return Extend(this->value_converter_.get(), value, size); } @@ -595,7 +592,7 @@ class PyListConverter : public ListConverter { return Status::Invalid("Can only convert 1-dimensional array values"); } const int64_t size = PyArray_SIZE(ndarray); - RETURN_NOT_OK(ValidateOverflow(this->list_type_, size)); + RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); const auto value_type = this->value_converter_->builder()->type(); switch (value_type->id()) { diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index a3750de4892..51b3df48d4d 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -966,6 +966,38 @@ def expected_datetime_value(dt): assert arr[i].as_py() == expected_datetime_value(utcdata[i]) +@pytest.mark.parametrize('timezone', [ + None, + 'UTC', + 'Etc/GMT-1', + 'Europe/Budapest', +]) +def test_pyarrow_ignore_timezone_environment_variable(monkeypatch, timezone): + # note that any non-empty value will evaluate to true + monkeypatch.setenv("PYARROW_IGNORE_TIMEZONE", "1") + data = [ + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + pytz.timezone('US/Eastern').localize( + datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) + ), + pytz.timezone('Europe/Moscow').localize( + datetime.datetime(2010, 8, 13, 5, 0, 0, 437699) + ), + ] + + expected = [dt.replace(tzinfo=None) for dt in data] + if timezone is not None: + tzinfo = pytz.timezone(timezone) + expected = [tzinfo.fromutc(dt) for dt in expected] + + ty = pa.timestamp('us', tz=timezone) + arr = pa.array(data, type=ty) + assert arr.to_pylist() == expected + + def test_sequence_timestamp_with_timezone_inference(): data = [ datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive @@ -1913,6 +1945,19 @@ def test_auto_chunking_list_like(): assert isinstance(arr, pa.ChunkedArray) +@pytest.mark.slow +@pytest.mark.large_memory +def test_auto_chunking_map_type(): + # takes ~20 minutes locally + ty = pa.map_(pa.int8(), pa.int8()) + item = [(1, 1)] * 2**28 + data = [item] * 2**3 + arr = pa.array(data, type=ty) + assert isinstance(arr, pa.ChunkedArray) + assert len(arr.chunk(0)) == 7 + assert len(arr.chunk(1)) == 1 + + @pytest.mark.large_memory @pytest.mark.parametrize(('ty', 'char'), [ (pa.string(), 'x'), From e8a7475cc5fae40e40d9d4039285c62d1dc4ea5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 23 Sep 2020 11:18:26 +0200 Subject: [PATCH 70/80] address remaining review comments --- cpp/src/arrow/ipc/metadata_internal.cc | 2 +- cpp/src/arrow/util/converter.h | 27 +++++++++++++------------- cpp/src/arrow/util/uri.cc | 2 +- python/pyarrow/array.pxi | 4 ++++ 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index fe4314921e0..9c967a5423d 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1154,7 +1154,7 @@ Status GetKeyValueMetadata(const KVVector* fb_metadata, auto metadata = std::make_shared(); metadata->reserve(fb_metadata->size()); - for (const auto& pair : 
*fb_metadata) { + for (const auto pair : *fb_metadata) { CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key"); CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value"); metadata->Append(pair->key()->str(), pair->value()->str()); diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index 2b85a483185..e208b21a94f 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -84,32 +84,33 @@ class Converter { OptionsType options_; }; -template +template class PrimitiveConverter : public BaseConverter { public: - using BuilderType = typename TypeTraits::BuilderType; + using BuilderType = typename TypeTraits::BuilderType; protected: Status Init(MemoryPool* pool) override { this->builder_ = std::make_shared(this->type_, pool); - this->primitive_type_ = checked_cast(this->type_.get()); + this->primitive_type_ = checked_cast(this->type_.get()); this->primitive_builder_ = checked_cast(this->builder_.get()); return Status::OK(); } - const T* primitive_type_; + const ArrowType* primitive_type_; BuilderType* primitive_builder_; }; -template class ConverterTrait> +template class ConverterTrait> class ListConverter : public BaseConverter { public: - using BuilderType = typename TypeTraits::BuilderType; - using ConverterType = typename ConverterTrait::type; + using BuilderType = typename TypeTraits::BuilderType; + using ConverterType = typename ConverterTrait::type; protected: Status Init(MemoryPool* pool) override { - list_type_ = checked_cast(this->type_.get()); + list_type_ = checked_cast(this->type_.get()); ARROW_ASSIGN_OR_RAISE(value_converter_, (MakeConverter( list_type_->value_type(), this->options_, pool))); @@ -120,7 +121,7 @@ class ListConverter : public BaseConverter { return Status::OK(); } - const T* list_type_; + const ArrowType* list_type_; BuilderType* list_builder_; std::shared_ptr value_converter_; }; @@ -155,10 +156,10 @@ class StructConverter : public BaseConverter { StructBuilder* struct_builder_; }; -template +template class DictionaryConverter : public BaseConverter { public: - using BuilderType = DictionaryBuilder; + using BuilderType = DictionaryBuilder; protected: Status Init(MemoryPool* pool) override { @@ -166,13 +167,13 @@ class DictionaryConverter : public BaseConverter { ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder)); this->builder_ = std::move(builder); dict_type_ = checked_cast(this->type_.get()); - value_type_ = checked_cast(dict_type_->value_type().get()); + value_type_ = checked_cast(dict_type_->value_type().get()); value_builder_ = checked_cast(this->builder_.get()); return Status::OK(); } const DictionaryType* dict_type_; - const U* value_type_; + const ValueType* value_type_; BuilderType* value_builder_; }; diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc index 795e3fa2c8b..1261607b6c1 100644 --- a/cpp/src/arrow/util/uri.cc +++ b/cpp/src/arrow/util/uri.cc @@ -169,7 +169,7 @@ std::string Uri::path() const { ss << "/"; } bool first = true; - for (const auto seg : segments) { + for (const auto& seg : segments) { if (!first) { ss << "/"; } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index d70e15c50a5..c5c06cec031 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -201,6 +201,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, 1, null ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) """ cdef: CMemoryPool* pool = 
maybe_unbox_memory_pool(memory_pool) From e04e245d06ff2c65d103f49bbd12c6efd537bef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 23 Sep 2020 11:45:25 +0200 Subject: [PATCH 71/80] remove leftover --- cpp/src/arrow/array/array_list_test.cc | 4 ++++ python/pyarrow/tests/test_convert_builtin.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 6a2b47e8e20..0dff0f48b00 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -534,7 +534,11 @@ TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } +#ifndef ARROW_LARGE_MEMORY_TESTS +TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck(); } +#else TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } +#endif // ---------------------------------------------------------------------- // Map tests diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 51b3df48d4d..a81b4d1c087 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -133,10 +133,6 @@ def _as_tuple(xs): return tuple(xs) -def _as_pairs(xs): - return [None if x is None else list(x.items()) for x in xs] - - def _as_deque(xs): # deque is a sequence while neither tuple nor list return collections.deque(xs) From fe3ceb279b6a856ffa12810ef41424e0b02242fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 23 Sep 2020 15:17:01 +0200 Subject: [PATCH 72/80] test cases for two additional auto chunking issues --- python/pyarrow/tests/test_convert_builtin.py | 11 +++++++++++ python/pyarrow/tests/test_pandas.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index a81b4d1c087..330ba69b961 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1927,6 +1927,17 @@ def test_auto_chunking_binary_like(): assert arr.chunk(1).to_pylist() == [b'x'] +@pytest.mark.large_memory +def test_auto_chunking_list_of_binary(): + # ARROW-6281 + vals = [['x' * 1024]] * ((2 << 20) + 1) + arr = pa.array(vals) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 2**21 - 1 + assert len(arr.chunk(1)) == 2 + + @pytest.mark.slow @pytest.mark.large_memory def test_auto_chunking_list_like(): diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 03407521c12..5bed7609461 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1497,6 +1497,22 @@ def test_bytes_exceed_2gb(self): table = pa.Table.from_pandas(df) assert table[0].num_chunks == 2 + @pytest.mark.large_memory + @pytest.mark.parametrize('char', ['x', b'x']) + def test_auto_chunking_pandas_series_of_strings(self, char): + # ARROW-2367 + v1 = char * 100000000 + v2 = char * 147483646 + + df = pd.DataFrame({ + 'strings': [[v1]] * 20 + [[v2]] + [[b'x']] + }) + arr = pa.array(df['strings'], from_pandas=True) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 21 + assert len(arr.chunk(1)) == 1 + def test_fixed_size_bytes(self): values = [b'foo', None, bytearray(b'bar'), None, None, 
b'hey'] df = pd.DataFrame({'strings': values}) From 01a89fc2ec2b96e8deb9ac2ef7f51c0337d6bc71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 23 Sep 2020 16:18:38 +0200 Subject: [PATCH 73/80] additional overflow tests --- python/pyarrow/tests/test_convert_builtin.py | 20 +++++++++++++++++++- python/pyarrow/tests/test_pandas.py | 16 ++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 330ba69b961..698c6db55b2 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1913,19 +1913,32 @@ def test_array_to_pylist_roundtrip(arr): @pytest.mark.large_memory def test_auto_chunking_binary_like(): + # single chunk v1 = b'x' * 100000000 v2 = b'x' * 147483646 data = [v1] * 20 + [v2] arr = pa.array(data, type=pa.binary()) assert isinstance(arr, pa.Array) - data += ['x'] * 1 + # two chunks + data = [v1] * 20 + [v2] + ['x'] * 1 arr = pa.array(data, type=pa.binary()) assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 assert len(arr.chunk(0)) == 21 assert len(arr.chunk(1)) == 1 assert arr.chunk(1).to_pylist() == [b'x'] + # three chunks + data = ([v1] * 20 + [v2]) + ([v1] * 20 + [v2]) + ['x'] * 2 + arr = pa.array(data, type=pa.binary()) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 3 + assert len(arr.chunk(0)) == 21 + assert len(arr.chunk(1)) == 21 + assert len(arr.chunk(2)) == 2 + assert arr.chunk(2).to_pylist() == [b'x', b'x'] + @pytest.mark.large_memory def test_auto_chunking_list_of_binary(): @@ -1945,11 +1958,15 @@ def test_auto_chunking_list_like(): data = [item] * (2**3 - 1) arr = pa.array(data, type=pa.list_(pa.uint8())) assert isinstance(arr, pa.Array) + assert len(arr) == 7 item = np.ones((2**28,), dtype='uint8') data = [item] * 2**3 arr = pa.array(data, type=pa.list_(pa.uint8())) assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 7 + assert len(arr.chunk(1)) == 1 @pytest.mark.slow @@ -1988,6 +2005,7 @@ def test_nested_auto_chunking(ty, char): data.append({'bool': True, 'integer': 1, 'string-like': char}) arr = pa.array(data, type=struct_type) assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 assert len(arr.chunk(0)) == 21 assert len(arr.chunk(1)) == 1 diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 5bed7609461..5de84b30432 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2103,6 +2103,22 @@ def test_large_binary_list(self): s, pd.Series([["aa", "bb"], None, ["cc"], []]), check_names=False) + @pytest.mark.slow + @pytest.mark.large_memory + def test_auto_chunking_on_list_overflow(self): + # ARROW-9976 + n = 2**24 + df = pd.DataFrame.from_dict({ + "a": list(np.zeros((n, 2**7), dtype='uint8')), + "b": range(n) + }) + table = pa.Table.from_pandas(df) + + column_a = table[0] + assert column_a.num_chunks == 2 + assert len(column_a.chunk(0)) == 2**24 - 1 + assert len(column_a.chunk(1)) == 1 + class TestConvertStructTypes: """ From aaf92b8f0df65a7415be68b582b1d85a7050c6ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 23 Sep 2020 17:44:38 +0200 Subject: [PATCH 74/80] missing newline in docstrings --- python/pyarrow/scalar.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 0ea197df34e..3e72d060d69 100644 --- 
a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -614,6 +614,7 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): ---------- index : Union[int, str] Index / position or name of the field. + Returns ------- result : Scalar From 3115303092c1c7edefc8cb03d7bf8df289ed188e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 25 Sep 2020 14:57:34 +0200 Subject: [PATCH 75/80] conversion optimizations --- cpp/src/arrow/array/builder_binary.h | 63 +++++---- cpp/src/arrow/array/builder_nested.h | 19 +-- cpp/src/arrow/python/common.h | 64 ++++++--- cpp/src/arrow/python/python_to_arrow.cc | 137 +++++++++++++------ cpp/src/arrow/type_traits.h | 3 +- cpp/src/arrow/util/converter.h | 88 ++++++++---- python/pyarrow/tests/test_convert_builtin.py | 40 ++++-- 7 files changed, 261 insertions(+), 153 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 45e9aedd6f5..21993ce5493 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -76,21 +76,9 @@ class BaseBinaryBuilder : public ArrayBuilder { return Append(value.data(), static_cast(value.size())); } - Status ValidateOverflow() { return ValidateOverflow(0); } - - Status ValidateOverflow(int64_t new_bytes) { - auto new_size = value_data_builder_.length() + new_bytes; - if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { - return Status::CapacityError("array cannot contain more than ", memory_limit(), - " bytes, have ", new_size); - } else { - return Status::OK(); - } - } - Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -242,6 +230,16 @@ class BaseBinaryBuilder : public ArrayBuilder { value_data_builder_.Reset(); } + Status ValidateOverflow(int64_t new_bytes) { + auto new_size = value_data_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + Status Resize(int64_t capacity) override { // XXX Why is this check necessary? There is no reason to disallow, say, // binary arrays with more than 2**31 empty or null values. @@ -259,12 +257,8 @@ class BaseBinaryBuilder : public ArrayBuilder { /// \brief Ensures there is enough allocated capacity to append the indicated /// number of bytes to the value data buffer without additional allocations Status ReserveData(int64_t elements) { - const int64_t size = value_data_length() + elements; - ARROW_RETURN_IF(size > memory_limit(), - Status::CapacityError("Cannot reserve capacity larger than ", - memory_limit(), " bytes")); - return (size > value_data_capacity()) ? 
value_data_builder_.Reserve(elements) - : Status::OK(); + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return value_data_builder_.Reserve(elements); } Status FinishInternal(std::shared_ptr* out) override { @@ -329,7 +323,7 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - ARROW_RETURN_NOT_OK(ValidateOverflow()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); return offsets_builder_.Append(static_cast(num_bytes)); } @@ -425,18 +419,6 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status ValidateOverflow() { return ValidateOverflow(0); } - - Status ValidateOverflow(int64_t new_bytes) { - auto new_size = byte_builder_.length() + new_bytes; - if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { - return Status::CapacityError("array cannot contain more than ", memory_limit(), - " bytes, have ", new_size); - } else { - return Status::OK(); - } - } - Status Append(const std::string& s) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppend(s); @@ -477,6 +459,23 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); } + Status ValidateOverflow(int64_t new_bytes) const { + auto new_size = byte_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return byte_builder_.Reserve(elements); + } + void Reset() override; Status Resize(int64_t capacity) override; Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index bb16170565b..b8948403acc 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -100,7 +100,7 @@ class BaseListBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); UnsafeAppendToBitmap(length, false); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { @@ -131,11 +131,11 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } - Status ValidateOverflow(int64_t new_elements) { + Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { - return Status::CapacityError("array cannot contain more than ", maximum_elements(), - " elements, have ", new_elements); + return Status::CapacityError("List array cannot contain more than ", + maximum_elements(), " elements, have ", new_elements); } else { return Status::OK(); } @@ -157,17 +157,8 @@ class BaseListBuilder : public ArrayBuilder { std::shared_ptr value_builder_; std::shared_ptr value_field_; - Status CheckNextOffset() const { - const int64_t num_values = value_builder_->length(); - ARROW_RETURN_IF( - num_values > maximum_elements(), - Status::CapacityError("List array cannot contain more than ", maximum_elements(), - " child elements,", " have ", num_values)); - return 
Status::OK(); - } - Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); const int64_t num_values = value_builder_->length(); return offsets_builder_.Append(static_cast(num_values)); } diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 9f4d7c15bee..557a7ff24da 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -191,58 +191,78 @@ struct PyBytesView { Py_ssize_t size; bool is_utf8; - // View the given Python object as string-like, i.e. str or (utf8) bytes static Result FromString(PyObject* obj, bool check_utf8 = false) { + PyBytesView self; + ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8)); + return self; + } + + static Result FromUnicode(PyObject* obj) { + PyBytesView self; + ARROW_RETURN_NOT_OK(self.ParseUnicode(obj)); + return self; + } + + static Result FromBinary(PyObject* obj) { + PyBytesView self; + ARROW_RETURN_NOT_OK(self.ParseBinary(obj)); + return self; + } + + // View the given Python object as string-like, i.e. str or (utf8) bytes + Status ParseString(PyObject* obj, bool check_utf8 = false) { if (PyUnicode_Check(obj)) { - return FromUnicode(obj); + return ParseUnicode(obj); } else { - ARROW_ASSIGN_OR_RAISE(auto result, FromBinary(obj)); + ARROW_RETURN_NOT_OK(ParseBinary(obj)); if (check_utf8) { // Check the bytes are utf8 utf-8 - OwnedRef decoded(PyUnicode_FromStringAndSize(result.bytes, result.size)); + OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size)); if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - result.is_utf8 = true; + is_utf8 = true; } else { PyErr_Clear(); - result.is_utf8 = false; + is_utf8 = false; } } - return std::move(result); + return Status::OK(); } } // View the given Python object as unicode string - static Result FromUnicode(PyObject* obj) { - Py_ssize_t size; + Status ParseUnicode(PyObject* obj) { // The utf-8 representation is cached on the unicode object - const char* data = PyUnicode_AsUTF8AndSize(obj, &size); + bytes = PyUnicode_AsUTF8AndSize(obj, &size); RETURN_IF_PYERROR(); - return PyBytesView(data, size, true); + is_utf8 = true; + return Status::OK(); } // View the given Python object as binary-like, i.e. 
bytes - static Result FromBinary(PyObject* obj) { + Status ParseBinary(PyObject* obj) { if (PyBytes_Check(obj)) { - return PyBytesView(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), false); + bytes = PyBytes_AS_STRING(obj); + size = PyBytes_GET_SIZE(obj); + is_utf8 = false; } else if (PyByteArray_Check(obj)) { - return PyBytesView(PyByteArray_AS_STRING(obj), PyByteArray_GET_SIZE(obj), false); + bytes = PyByteArray_AS_STRING(obj); + size = PyByteArray_GET_SIZE(obj); + is_utf8 = false; } else if (PyMemoryView_Check(obj)) { - PyObject* contig_view = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); + PyObject* ref = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); RETURN_IF_PYERROR(); - Py_buffer* buffer = PyMemoryView_GET_BUFFER(contig_view); - return PyBytesView(reinterpret_cast(buffer->buf), buffer->len, false, - contig_view); + Py_buffer* buffer = PyMemoryView_GET_BUFFER(ref); + bytes = reinterpret_cast(buffer->buf); + size = buffer->len; + is_utf8 = false; } else { return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name, "' object"); } + return Status::OK(); } protected: - PyBytesView(const char* bytes, Py_ssize_t size, bool is_utf8 = false, - PyObject* obj = NULLPTR) - : bytes(bytes), size(size), is_utf8(is_utf8), ref(obj) {} - OwnedRef ref; }; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 870fa8e7ba4..319810e00e1 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -117,7 +117,7 @@ class PyValue { I obj) { typename T::c_type value; auto status = internal::CIntFromPython(obj, &value); - if (status.ok()) { + if (ARROW_PREDICT_TRUE(status.ok())) { return value; } else if (!internal::PyIntScalar_Check(obj)) { return internal::InvalidValue(obj, "tried to convert to int"); @@ -323,35 +323,36 @@ class PyValue { // object was unicode encoded or not, which is used for unicode -> bytes coersion if // there is a non-unicode object observed. 
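The string and binary Convert overloads changed in this hunk fill a caller-provided PyBytesView instead of returning a fresh view per value, which lets each converter keep a single view_ member alive across Append calls. A rough standalone sketch of that out-parameter reuse pattern, with invented stand-in types rather than the real PyBytesView or builder API, might look like this:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for PyBytesView: a non-owning view that is re-parsed for each value.
struct BytesView {
  const char* bytes = nullptr;
  std::size_t size = 0;
  bool is_utf8 = false;

  void Parse(const std::string& value, bool utf8) {
    bytes = value.data();
    size = value.size();
    is_utf8 = utf8;
  }
};

// Stand-in for a binary append path: the converter owns one view and reuses it,
// so the hot loop constructs no per-value view objects.
class StringAppender {
 public:
  void Append(const std::string& value) {
    view_.Parse(value, /*utf8=*/true);
    data_.append(view_.bytes, view_.size);
    offsets_.push_back(data_.size());
  }

  std::size_t num_values() const { return offsets_.size(); }

 private:
  BytesView view_;                   // reused across Append() calls, like view_ above
  std::string data_;                 // concatenated value bytes
  std::vector<std::size_t> offsets_;
};

int main() {
  StringAppender appender;
  appender.Append("foo");
  appender.Append("barbaz");
  std::cout << appender.num_values() << " values appended\n";
  return 0;
}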
- static Result Convert(const BaseBinaryType*, const O&, I obj) { - return PyBytesView::FromString(obj); + static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); } - static Result Convert(const FixedSizeBinaryType* type, const O&, I obj) { - ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj)); - if (ARROW_PREDICT_TRUE(view.size == type->byte_width())) { - return std::move(view); - } else { + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, + PyBytesView& view) { + ARROW_RETURN_NOT_OK(view.ParseString(obj)); + if (view.size != type->byte_width()) { std::stringstream ss; ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); + } else { + return Status::OK(); } } template - static enable_if_string> Convert(const T*, const O& options, - I obj) { + static enable_if_string Convert(const T*, const O& options, I obj, + PyBytesView& view) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 and validate that // any binary values are utf8 - ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromString(obj, true)); + ARROW_RETURN_NOT_OK(view.ParseString(obj, true)); if (!view.is_utf8) { return internal::InvalidValue(obj, "was not a utf8 string"); } - return std::move(view); + return Status::OK(); } else { // Non-strict conversion; keep track of whether values are unicode or bytes - return PyBytesView::FromString(obj); + return view.ParseString(obj); } } @@ -429,10 +430,7 @@ struct PyConverterTrait { }; template -class PyPrimitiveConverter< - T, enable_if_t::value || is_boolean_type::value || - is_number_type::value || is_decimal_type::value || - is_date_type::value || is_time_type::value>> +class PyPrimitiveConverter> : public PrimitiveConverter { public: Status Append(PyObject* value) override { @@ -446,6 +444,24 @@ class PyPrimitiveConverter< } }; +template +class PyPrimitiveConverter< + T, enable_if_t::value || is_number_type::value || + is_decimal_type::value || is_date_type::value || + is_time_type::value>> : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + this->primitive_builder_->UnsafeAppend(converted); + } + return Status::OK(); + } +}; + template class PyPrimitiveConverter< T, enable_if_t::value || is_duration_type::value>> @@ -453,18 +469,19 @@ class PyPrimitiveConverter< public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->primitive_builder_->AppendNull(); + this->primitive_builder_->UnsafeAppendNull(); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); // Numpy NaT sentinels can be checked after the conversion if (PyArray_CheckAnyScalarExact(value) && PyValue::IsNaT(this->primitive_type_, converted)) { - return this->primitive_builder_->AppendNull(); + this->primitive_builder_->UnsafeAppendNull(); } else { - return this->primitive_builder_->Append(converted); + this->primitive_builder_->UnsafeAppend(converted); } } + return Status::OK(); } }; @@ -474,14 +491,38 @@ class PyPrimitiveConverter> public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->primitive_builder_->AppendNull(); + 
this->primitive_builder_->UnsafeAppendNull(); } else { - ARROW_ASSIGN_OR_RAISE( - auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes, view_.size); + } + return Status::OK(); + } + + protected: + PyBytesView view_; +}; + +template +class PyPrimitiveConverter::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes); } + return Status::OK(); } + + protected: + PyBytesView view_; }; template @@ -490,17 +531,18 @@ class PyPrimitiveConverter> public: Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { - return this->primitive_builder_->AppendNull(); + this->primitive_builder_->UnsafeAppendNull(); } else { - ARROW_ASSIGN_OR_RAISE( - auto view, PyValue::Convert(this->primitive_type_, this->options_, value)); - if (!view.is_utf8) { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + if (!view_.is_utf8) { // observed binary value observed_binary_ = true; } - ARROW_RETURN_NOT_OK(this->primitive_builder_->ValidateOverflow(view.size)); - return this->primitive_builder_->Append(util::string_view(view.bytes, view.size)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes, view_.size); } + return Status::OK(); } Result> ToArray() override { @@ -515,6 +557,7 @@ class PyPrimitiveConverter> } protected: + PyBytesView view_; bool observed_binary_ = false; }; @@ -541,11 +584,14 @@ class PyDictionaryConverter> if (PyValue::IsNull(this->options_, value)) { return this->value_builder_->AppendNull(); } else { - ARROW_ASSIGN_OR_RAISE(auto view, - PyValue::Convert(this->value_type_, this->options_, value)); - return this->value_builder_->Append(util::string_view(view.bytes, view.size)); + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->value_type_, this->options_, value, view_)); + return this->value_builder_->Append(view_.bytes, view_.size); } } + + protected: + PyBytesView view_; }; template @@ -962,15 +1008,24 @@ Result> ConvertPySequence(PyObject* obj, PyObject* ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); - ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(converter)); - - // Convert values - if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); + if (converter->may_overflow()) { + ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter))); + // Convert values + if (mask != nullptr && mask != Py_None) { + RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); + } else { + RETURN_NOT_OK(Extend(chunked_converter.get(), seq, size)); + } + return chunked_converter->ToChunkedArray(); } else { - RETURN_NOT_OK(Extend(chunked_converter.get(), seq, 
size)); + // Convert values + if (mask != nullptr && mask != Py_None) { + RETURN_NOT_OK(ExtendMasked(converter.get(), seq, mask, size)); + } else { + RETURN_NOT_OK(Extend(converter.get(), seq, size)); + } + return converter->ToChunkedArray(); } - return chunked_converter->ToChunkedArray(); } } // namespace py diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 6035a1cacbb..d2abe573cd5 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -532,8 +532,7 @@ using enable_if_base_binary = enable_if_t::value, R>; template using is_binary_type = std::integral_constant::value || - std::is_same::value || - std::is_same::value>; + std::is_same::value>; template using enable_if_binary = enable_if_t::value, R>; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index e208b21a94f..1184821b7e8 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -26,13 +26,14 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" #include "arrow/visitor_inline.h" namespace arrow { namespace internal { template class ConverterTrait> -static Result> MakeConverter( +static Result> MakeConverter( std::shared_ptr type, typename BaseConverter::OptionsType options, MemoryPool* pool); @@ -60,9 +61,9 @@ class Converter { OptionsType options() const { return options_; } - const std::vector>& children() const { return children_; } + bool may_overflow() const { return may_overflow_; } - Status Reserve(int64_t additional_capacity) { + virtual Status Reserve(int64_t additional_capacity) { return builder_->Reserve(additional_capacity); } @@ -75,13 +76,19 @@ class Converter { return arr->Slice(0, length); } + virtual Result> ToChunkedArray() { + ARROW_ASSIGN_OR_RAISE(auto array, ToArray()); + std::vector> chunks = {std::move(array)}; + return std::make_shared(chunks); + } + protected: virtual Status Init(MemoryPool* pool) { return Status::OK(); } std::shared_ptr type_; std::shared_ptr builder_; - std::vector> children_; OptionsType options_; + bool may_overflow_ = false; }; template @@ -92,8 +99,10 @@ class PrimitiveConverter : public BaseConverter { protected: Status Init(MemoryPool* pool) override { this->builder_ = std::make_shared(this->type_, pool); - this->primitive_type_ = checked_cast(this->type_.get()); - this->primitive_builder_ = checked_cast(this->builder_.get()); + this->may_overflow_ = + is_base_binary_like(this->type_->id()) || is_fixed_size_binary(this->type_->id()); + primitive_type_ = checked_cast(this->type_.get()); + primitive_builder_ = checked_cast(this->builder_.get()); return Status::OK(); } @@ -116,14 +125,14 @@ class ListConverter : public BaseConverter { list_type_->value_type(), this->options_, pool))); this->builder_ = std::make_shared(pool, value_converter_->builder(), this->type_); - this->children_ = {value_converter_}; list_builder_ = checked_cast(this->builder_.get()); + this->may_overflow_ = true; return Status::OK(); } const ArrowType* list_type_; BuilderType* list_builder_; - std::shared_ptr value_converter_; + std::unique_ptr value_converter_; }; template class ConverterTrait> @@ -131,9 +140,17 @@ class StructConverter : public BaseConverter { public: using ConverterType = typename ConverterTrait::type; + Status Reserve(int64_t additional_capacity) override { + ARROW_RETURN_NOT_OK(this->builder_->Reserve(additional_capacity)); + for (const auto& child : children_) { + 
ARROW_RETURN_NOT_OK(child->Reserve(additional_capacity)); + } + return Status::OK(); + } + protected: Status Init(MemoryPool* pool) override { - std::shared_ptr child_converter; + std::unique_ptr child_converter; std::vector> child_builders; struct_type_ = checked_cast(this->type_.get()); @@ -141,8 +158,9 @@ class StructConverter : public BaseConverter { ARROW_ASSIGN_OR_RAISE(child_converter, (MakeConverter( field->type(), this->options_, pool))); + this->may_overflow_ |= child_converter->may_overflow(); child_builders.push_back(child_converter->builder()); - this->children_.push_back(std::move(child_converter)); + children_.push_back(std::move(child_converter)); } this->builder_ = @@ -154,6 +172,7 @@ class StructConverter : public BaseConverter { const StructType* struct_type_; StructBuilder* struct_builder_; + std::vector> children_; }; template @@ -166,6 +185,7 @@ class DictionaryConverter : public BaseConverter { std::unique_ptr builder; ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder)); this->builder_ = std::move(builder); + this->may_overflow_ = false; dict_type_ = checked_cast(this->type_.get()); value_type_ = checked_cast(dict_type_->value_type().get()); value_builder_ = checked_cast(this->builder_.get()); @@ -189,7 +209,7 @@ struct MakeConverterImpl { switch (t.value_type()->id()) { #define DICTIONARY_CASE(TYPE) \ case TYPE::type_id: \ - out = std::make_shared< \ + out = make_unique< \ typename ConverterTrait::template dictionary_type>(); \ break; DICTIONARY_CASE(BooleanType); @@ -219,11 +239,11 @@ struct MakeConverterImpl { std::shared_ptr type; typename BaseConverter::OptionsType options; MemoryPool* pool; - std::shared_ptr out; + std::unique_ptr out; }; template class ConverterTrait> -static Result> MakeConverter( +static Result> MakeConverter( std::shared_ptr type, typename BaseConverter::OptionsType options, MemoryPool* pool) { MakeConverterImpl visitor{ @@ -237,41 +257,44 @@ class Chunker { public: using InputType = typename Converter::InputType; - explicit Chunker(std::shared_ptr converter) + explicit Chunker(std::unique_ptr converter) : converter_(std::move(converter)) {} Status Reserve(int64_t additional_capacity) { - return converter_->Reserve(additional_capacity); + ARROW_RETURN_NOT_OK(converter_->Reserve(additional_capacity)); + reserved_ += additional_capacity; + return Status::OK(); } Status AppendNull() { auto status = converter_->AppendNull(); - if (status.ok()) { - length_ = converter_->builder()->length(); - } else if (status.IsCapacityError()) { + if (ARROW_PREDICT_FALSE(status.IsCapacityError())) { ARROW_RETURN_NOT_OK(FinishChunk()); return converter_->AppendNull(); } - return status; + ++length_; + return std::move(status); } Status Append(InputType value) { auto status = converter_->Append(value); - if (status.ok()) { - length_ = converter_->builder()->length(); - } else if (status.IsCapacityError()) { + if (ARROW_PREDICT_FALSE(status.IsCapacityError())) { ARROW_RETURN_NOT_OK(FinishChunk()); return Append(value); } - return status; + ++length_; + return std::move(status); } Status FinishChunk() { ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_)); - converter_->builder()->Reset(); - length_ = 0; chunks_.push_back(chunk); - return Status::OK(); + // reserve space for the remaining items, besides being an optimization it is also + // required if the converter's implementation relies on unsafe builder methods in + // conveter->Append() + auto remaining = reserved_ - length_; + Reset(); + return Reserve(remaining); } 
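  // Reviewer note, not part of the patch: Append()/AppendNull() above forward to the
  // wrapped converter and, when the builder reports Status::CapacityError, close the
  // current chunk via FinishChunk() and retry the same value against the freshly reset
  // builder. length_ counts the values in the open chunk while reserved_ remembers the
  // capacity requested through Reserve(), so the re-Reserve() at the end of
  // FinishChunk() keeps the converters' unsafe Append fast path valid for the values
  // that remain after a chunk boundary.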
Result> ToChunkedArray() { @@ -280,14 +303,21 @@ class Chunker { } protected: + void Reset() { + converter_->builder()->Reset(); + length_ = 0; + reserved_ = 0; + } + int64_t length_ = 0; - std::shared_ptr converter_; + int64_t reserved_ = 0; + std::unique_ptr converter_; std::vector> chunks_; }; template -static Result>> MakeChunker(std::shared_ptr converter) { - return std::make_shared>(std::move(converter)); +static Result>> MakeChunker(std::unique_ptr converter) { + return make_unique>(std::move(converter)); } } // namespace internal diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 698c6db55b2..f4894a4226a 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -382,7 +382,7 @@ def test_sequence_custom_integers(seq): @parametrize_with_iterable_types def test_broken_integers(seq): data = [MyBrokenInt()] - with pytest.raises(pa.ArrowInvalid): + with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"): pa.array(seq(data), type=pa.int64()) @@ -1916,28 +1916,41 @@ def test_auto_chunking_binary_like(): # single chunk v1 = b'x' * 100000000 v2 = b'x' * 147483646 - data = [v1] * 20 + [v2] - arr = pa.array(data, type=pa.binary()) + + # single chunk + one_chunk_data = [v1] * 20 + [b'', None, v2] + arr = pa.array(one_chunk_data, type=pa.binary()) assert isinstance(arr, pa.Array) + assert len(arr) == 23 + assert arr[20].as_py() == b'' + assert arr[21].as_py() is None + assert arr[22].as_py() == v2 # two chunks - data = [v1] * 20 + [v2] + ['x'] * 1 - arr = pa.array(data, type=pa.binary()) + two_chunk_data = one_chunk_data + [b'two'] + arr = pa.array(two_chunk_data, type=pa.binary()) assert isinstance(arr, pa.ChunkedArray) assert arr.num_chunks == 2 - assert len(arr.chunk(0)) == 21 + assert len(arr.chunk(0)) == 23 assert len(arr.chunk(1)) == 1 - assert arr.chunk(1).to_pylist() == [b'x'] + assert arr.chunk(0)[20].as_py() == b'' + assert arr.chunk(0)[21].as_py() is None + assert arr.chunk(0)[22].as_py() == v2 + assert arr.chunk(1).to_pylist() == [b'two'] # three chunks - data = ([v1] * 20 + [v2]) + ([v1] * 20 + [v2]) + ['x'] * 2 - arr = pa.array(data, type=pa.binary()) + three_chunk_data = one_chunk_data * 2 + [b'three', b'three'] + arr = pa.array(three_chunk_data, type=pa.binary()) assert isinstance(arr, pa.ChunkedArray) assert arr.num_chunks == 3 - assert len(arr.chunk(0)) == 21 - assert len(arr.chunk(1)) == 21 + assert len(arr.chunk(0)) == 23 + assert len(arr.chunk(1)) == 23 assert len(arr.chunk(2)) == 2 - assert arr.chunk(2).to_pylist() == [b'x', b'x'] + for i in range(2): + assert arr.chunk(i)[20].as_py() == b'' + assert arr.chunk(i)[21].as_py() is None + assert arr.chunk(i)[22].as_py() == v2 + assert arr.chunk(2).to_pylist() == [b'three', b'three'] @pytest.mark.large_memory @@ -1949,6 +1962,7 @@ def test_auto_chunking_list_of_binary(): assert arr.num_chunks == 2 assert len(arr.chunk(0)) == 2**21 - 1 assert len(arr.chunk(1)) == 2 + assert arr.chunk(1).to_pylist() == [['x' * 1024]] * 2 @pytest.mark.slow @@ -1967,6 +1981,7 @@ def test_auto_chunking_list_like(): assert arr.num_chunks == 2 assert len(arr.chunk(0)) == 7 assert len(arr.chunk(1)) == 1 + assert arr.chunk(1)[0].as_py() == list(item) @pytest.mark.slow @@ -2008,7 +2023,6 @@ def test_nested_auto_chunking(ty, char): assert arr.num_chunks == 2 assert len(arr.chunk(0)) == 21 assert len(arr.chunk(1)) == 1 - assert arr.chunk(1)[0].as_py() == { 'bool': True, 'integer': 1, From 6c709e2671bb7247fe7d4a009f0365aceb79f92e Mon 
Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Fri, 25 Sep 2020 15:13:22 +0200
Subject: [PATCH 76/80] please gcc 4.8

---
 cpp/src/arrow/python/common.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index 557a7ff24da..8560fa2d6f4 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -194,19 +194,19 @@ struct PyBytesView {
   static Result<PyBytesView> FromString(PyObject* obj, bool check_utf8 = false) {
     PyBytesView self;
     ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8));
-    return self;
+    return std::move(self);
   }
 
   static Result<PyBytesView> FromUnicode(PyObject* obj) {
     PyBytesView self;
     ARROW_RETURN_NOT_OK(self.ParseUnicode(obj));
-    return self;
+    return std::move(self);
   }
 
   static Result<PyBytesView> FromBinary(PyObject* obj) {
     PyBytesView self;
     ARROW_RETURN_NOT_OK(self.ParseBinary(obj));
-    return self;
+    return std::move(self);
   }
 
   // View the given Python object as string-like, i.e. str or (utf8) bytes

From b551d1253f0f5da499f2dfb040279c28781774f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Fri, 25 Sep 2020 15:34:42 +0200
Subject: [PATCH 77/80] some inline notes describing performance improvements

---
 cpp/src/arrow/python/python_to_arrow.cc | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 319810e00e1..055d38e491a 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -451,6 +451,8 @@ class PyPrimitiveConverter<
     is_time_type<T>::value>> : public PrimitiveConverter<T, PyConverter> {
  public:
   Status Append(PyObject* value) override {
+    // Since the required space has already been allocated in the Extend functions we
+    // can rely on the Unsafe builder API which improves the performance.
     if (PyValue::IsNull(this->options_, value)) {
       this->primitive_builder_->UnsafeAppendNull();
     } else {
@@ -495,6 +497,9 @@ class PyPrimitiveConverter<T, enable_if_binary<T>>
     } else {
       ARROW_RETURN_NOT_OK(
           PyValue::Convert(this->primitive_type_, this->options_, value, view_));
+      // Since we don't know the varying length input size in advance, we need to
+      // reserve space in the value builder one by one. ReserveData raises CapacityError
+      // if the value would not fit into the array.
       ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
       this->primitive_builder_->UnsafeAppend(view_.bytes, view_.size);
     }
@@ -502,6 +507,8 @@ class PyPrimitiveConverter<T, enable_if_binary<T>>
   }
 
  protected:
+  // Create a single instance of PyBytesView here to prevent unnecessary object
+  // creation/destruction. This significantly improves the conversion performance.
   PyBytesView view_;
 };
 
@@ -1009,8 +1016,10 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
   ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
                                             options.type, options, pool)));
   if (converter->may_overflow()) {
+    // The converter hierarchy contains binary- or list-like builders which can overflow
+    // depending on the input values. Wrap the converter with a chunker which detects
+    // the overflow and automatically creates new chunks.
     ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter)));
-    // Convert values
     if (mask != nullptr && mask != Py_None) {
       RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size));
     } else {
@@ -1018,7 +1027,8 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
     }
     return chunked_converter->ToChunkedArray();
   } else {
-    // Convert values
+    // If the converter can't overflow, spare the capacity error checking on the
+    // hot path; this improves the performance roughly by ~10% for primitive types.
     if (mask != nullptr && mask != Py_None) {
       RETURN_NOT_OK(ExtendMasked(converter.get(), seq, mask, size));
     } else {

From 806e4c01022ecfa822a3a04a7e1752fbbccf6ee2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Fri, 25 Sep 2020 16:44:33 +0200
Subject: [PATCH 78/80] try to please compilers

---
 cpp/src/arrow/python/python_to_arrow.cc | 2 +-
 cpp/src/arrow/util/converter.h          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 055d38e491a..9b1d2447efa 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -593,7 +593,7 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
     } else {
       ARROW_RETURN_NOT_OK(
           PyValue::Convert(this->value_type_, this->options_, value, view_));
-      return this->value_builder_->Append(view_.bytes, view_.size);
+      return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
     }
   }
 
diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h
index 1184821b7e8..bf559150d2c 100644
--- a/cpp/src/arrow/util/converter.h
+++ b/cpp/src/arrow/util/converter.h
@@ -273,7 +273,7 @@ class Chunker {
       return converter_->AppendNull();
     }
     ++length_;
-    return std::move(status);
+    return status;
   }
 
   Status Append(InputType value) {
@@ -283,7 +283,7 @@ class Chunker {
       return Append(value);
     }
     ++length_;
-    return std::move(status);
+    return status;
   }
 
   Status FinishChunk() {

From b527a0e534d1c25422b571eb9d0deb24af2a63e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Fri, 25 Sep 2020 16:48:29 +0200
Subject: [PATCH 79/80] fix msvc issues

---
 cpp/src/arrow/python/python_to_arrow.cc | 5 ++++-
 cpp/src/arrow/util/converter.h          | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 9b1d2447efa..79d6dc2c3e6 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -491,6 +491,8 @@ template <typename T>
 class PyPrimitiveConverter<T, enable_if_binary<T>>
     : public PrimitiveConverter<T, PyConverter> {
  public:
+  using OffsetType = typename T::offset_type;
+
   Status Append(PyObject* value) override {
     if (PyValue::IsNull(this->options_, value)) {
       this->primitive_builder_->UnsafeAppendNull();
     } else {
@@ -501,7 +503,8 @@ class PyPrimitiveConverter<T, enable_if_binary<T>>
       // reserve space in the value builder one by one. ReserveData raises CapacityError
      // if the value would not fit into the array.
       ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
-      this->primitive_builder_->UnsafeAppend(view_.bytes, view_.size);
+      this->primitive_builder_->UnsafeAppend(view_.bytes,
+                                             static_cast<OffsetType>(view_.size));
     }
     return Status::OK();
   }
 
diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h
index bf559150d2c..d60d8d056fa 100644
--- a/cpp/src/arrow/util/converter.h
+++ b/cpp/src/arrow/util/converter.h
@@ -209,7 +209,7 @@ struct MakeConverterImpl {
     switch (t.value_type()->id()) {
 #define DICTIONARY_CASE(TYPE)                      \
   case TYPE::type_id:                              \
-    out = make_unique<                             \
+    out = internal::make_unique<                   \
         typename ConverterTrait::template dictionary_type>(); \
     break;
       DICTIONARY_CASE(BooleanType);
@@ -317,7 +317,7 @@ class Chunker {
 
 template <typename T>
 static Result<std::unique_ptr<Chunker<T>>> MakeChunker(std::unique_ptr<T> converter) {
-  return make_unique<Chunker<T>>(std::move(converter));
+  return internal::make_unique<Chunker<T>>(std::move(converter));
 }
 
 }  // namespace internal

From 2213c2543f0918fa2395f50bc2efd2701cdde152 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?=
Date: Fri, 25 Sep 2020 17:04:02 +0200
Subject: [PATCH 80/80] cast to offset type when appending to string builders

---
 cpp/src/arrow/python/python_to_arrow.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 79d6dc2c3e6..252577eef01 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -539,6 +539,8 @@ template <typename T>
 class PyPrimitiveConverter<T, enable_if_string<T>>
     : public PrimitiveConverter<T, PyConverter> {
  public:
+  using OffsetType = typename T::offset_type;
+
   Status Append(PyObject* value) override {
     if (PyValue::IsNull(this->options_, value)) {
       this->primitive_builder_->UnsafeAppendNull();
@@ -550,7 +552,8 @@ class PyPrimitiveConverter<T, enable_if_string<T>>
       observed_binary_ = true;
     }
     ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
-    this->primitive_builder_->UnsafeAppend(view_.bytes, view_.size);
+    this->primitive_builder_->UnsafeAppend(view_.bytes,
+                                           static_cast<OffsetType>(view_.size));
     }
     return Status::OK();
   }
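
For reference, the user-visible effect of the chunking path exercised by these patches can be sketched with a short pyarrow snippet. This is an illustration only (it mirrors the large_memory tests above, the values are made up, and running it needs several gigabytes of RAM); the exact chunk boundaries depend on the roughly 2 GB offset limit of binary arrays:

    import pyarrow as pa

    # Three 1 GiB values exceed the offset limit of a single binary array, so the
    # converter finishes a chunk on CapacityError and keeps appending to a new one.
    big = b"x" * (1 << 30)
    arr = pa.array([big, None, big, big], type=pa.binary())

    assert isinstance(arr, pa.ChunkedArray)
    assert arr.num_chunks >= 2
    assert sum(len(chunk) for chunk in arr.chunks) == 4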