diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index b136bec9709..b2d9f1cb5a3 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -388,36 +388,36 @@ class PyValue { } }; -template -Status Extend(T* converter, PyObject* values, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(converter->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) { - return converter->Append(item); - }); -} - -// Convert and append a sequence of values masked with a numpy array -template -Status ExtendMasked(T* converter, PyObject* values, PyObject* mask, int64_t size) { - /// Ensure we've allocated enough space - RETURN_NOT_OK(converter->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return converter->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return converter->Append(item); // perhaps use AppendValue instead? - } - }); -} - // The base Converter class is a mixin with predefined behavior and constructors. -using PyConverter = Converter; +class PyConverter : public Converter { + public: + // Iterate over the input values and defer the conversion to the Append method + Status Extend(PyObject* values, int64_t size) override { + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(values, [this](PyObject* item, bool* /* unused */) { + return this->Append(item); + }); + } + + // Convert and append a sequence of values masked with a numpy array + Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size) override { + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return this->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return this->Append(item); // perhaps use AppendValue instead? + } + }); + } +}; template class PyPrimitiveConverter; @@ -669,7 +669,7 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); - return Extend(this->value_converter_.get(), value, size); + return this->value_converter_->Extend(value, size); } Status AppendNdarray(PyObject* value) { @@ -684,12 +684,12 @@ class PyListConverter : public ListConverter { switch (value_type->id()) { // If the value type does not match the expected NumPy dtype, then fall through // to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ - case Type::TYPE_ID: { \ - if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ - return Extend(this->value_converter_.get(), value, size); \ - } \ - return AppendNdarrayTyped(ndarray); \ +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return this->value_converter_->Extend(value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ } LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL) LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8) @@ -707,7 +707,7 @@ class PyListConverter : public ListConverter { LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA) #undef LIST_FAST_CASE default: { - return Extend(this->value_converter_.get(), value, size); + return this->value_converter_->Extend(value, size); } } } @@ -1041,18 +1041,18 @@ Result> ConvertPySequence(PyObject* obj, PyObject* // the overflow and automatically creates new chunks. ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter))); if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); + RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); } else { - RETURN_NOT_OK(Extend(chunked_converter.get(), seq, size)); + RETURN_NOT_OK(chunked_converter->Extend(seq, size)); } return chunked_converter->ToChunkedArray(); } else { // If the converter can't overflow spare the capacity error checking on the hot-path, // this improves the performance roughly by ~10% for primitive types. if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(ExtendMasked(converter.get(), seq, mask, size)); + RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); } else { - RETURN_NOT_OK(Extend(converter.get(), seq, size)); + RETURN_NOT_OK(converter->Extend(seq, size)); } return converter->ToChunkedArray(); } diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index e18f6e350d7..2c40a48726b 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -52,7 +52,15 @@ class Converter { return Init(pool); } - virtual Status Append(InputType value) = 0; + virtual Status Append(InputType value) { return Status::NotImplemented("Append"); } + + virtual Status Extend(InputType values, int64_t size) { + return Status::NotImplemented("Extend"); + } + + virtual Status ExtendMasked(InputType values, InputType mask, int64_t size) { + return Status::NotImplemented("ExtendMasked"); + } const std::shared_ptr& builder() const { return builder_; } @@ -294,6 +302,34 @@ class Chunker { return status; } + // we could get bit smarter here since the whole batch of appendable values + // will be rejected if a capacity error is raised + Status Extend(InputType values, int64_t size) { + auto status = converter_->Extend(values, size); + if (ARROW_PREDICT_FALSE(status.IsCapacityError())) { + if (converter_->builder()->length() == 0) { + return status; + } + ARROW_RETURN_NOT_OK(FinishChunk()); + return Extend(values, size); + } + length_ += size; + return status; + } + + Status ExtendMasked(InputType values, InputType mask, int64_t size) { + auto status = converter_->ExtendMasked(values, mask, size); + if (ARROW_PREDICT_FALSE(status.IsCapacityError())) { + if (converter_->builder()->length() == 0) { + return status; + } + ARROW_RETURN_NOT_OK(FinishChunk()); + return ExtendMasked(values, mask, size); + } + length_ += size; + return status; + } + Status FinishChunk() { ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_)); chunks_.push_back(chunk);