diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 0781dd4a2df..900e8d2b38f 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -161,7 +161,13 @@ struct ScalarFromArraySlotImpl { } if (array_.IsNull(index_)) { - return MakeNullScalar(array_.type()); + auto null = MakeNullScalar(array_.type()); + if (is_dictionary(array_.type()->id())) { + auto& dict_null = checked_cast(*null); + const auto& dict_array = checked_cast(array_); + dict_null.value.dictionary = dict_array.dictionary(); + } + return null; } RETURN_NOT_OK(VisitArrayInline(array_, this)); diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 9c2cd888692..af732eab068 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -570,6 +570,26 @@ class TestStringBuilder : public TestBuilder { ASSERT_EQ(reps * 40, result_->value_data()->size()); } + void TestOverflowCheck() { + auto max_size = builder_->memory_limit(); + + ASSERT_OK(builder_->ValidateOverflow(1)); + ASSERT_OK(builder_->ValidateOverflow(max_size)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size + 1)); + + ASSERT_OK(builder_->Append("bb")); + ASSERT_OK(builder_->ValidateOverflow(max_size - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1)); + + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->ValidateOverflow(max_size - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1)); + + ASSERT_OK(builder_->Append("ccc")); + ASSERT_OK(builder_->ValidateOverflow(max_size - 5)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 4)); + } + void TestZeroLength() { // All buffers are null Done(); @@ -602,6 +622,8 @@ TYPED_TEST(TestStringBuilder, TestCapacityReserve) { this->TestCapacityReserve() TYPED_TEST(TestStringBuilder, TestZeroLength) { this->TestZeroLength(); } +TYPED_TEST(TestStringBuilder, TestOverflowCheck) { this->TestOverflowCheck(); } + // ---------------------------------------------------------------------- // ChunkedBinaryBuilder tests diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index df0eb522cf4..0dff0f48b00 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -467,6 +467,32 @@ class TestListArray : public TestBuilder { AssertArraysEqual(*result_, *expected); } + void TestOverflowCheck() { + Int16Builder* vb = checked_cast(builder_->value_builder()); + auto max_elements = builder_->maximum_elements(); + + ASSERT_OK(builder_->ValidateOverflow(1)); + ASSERT_OK(builder_->ValidateOverflow(max_elements)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); + + ASSERT_OK(builder_->Append()); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); + + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); + + ASSERT_OK(builder_->Append()); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->ValidateOverflow(max_elements - 5)); + ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 4)); + } + protected: std::shared_ptr value_type_; @@ -508,6 +534,12 @@ TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } +#ifndef ARROW_LARGE_MEMORY_TESTS +TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck(); } +#else +TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } +#endif + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 6f015dda3e1..b92cc285894 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -99,6 +99,12 @@ Status ArrayBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } +Result> ArrayBuilder::Finish() { + std::shared_ptr out; + RETURN_NOT_OK(Finish(&out)); + return out; +} + void ArrayBuilder::Reset() { capacity_ = length_ = null_count_ = 0; null_bitmap_builder_.Reset(); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 33b1f9b3e66..d73681756ba 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -56,6 +56,8 @@ class ARROW_EXPORT ArrayBuilder { /// skip shared pointers and just return a raw pointer ArrayBuilder* child(int i) { return children_[i].get(); } + const std::shared_ptr& child_builder(int i) const { return children_[i]; } + int num_children() const { return static_cast(children_.size()); } virtual int64_t length() const { return length_; } @@ -118,6 +120,13 @@ class ARROW_EXPORT ArrayBuilder { /// \return Status Status Finish(std::shared_ptr* out); + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \return The finalized Array object + Result> Finish(); + /// \brief Return the type of the built Array virtual std::shared_ptr type() const = 0; diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 593b533a19c..21993ce5493 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -78,9 +78,7 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast(num_bytes)); @@ -232,6 +230,16 @@ class BaseBinaryBuilder : public ArrayBuilder { value_data_builder_.Reset(); } + Status ValidateOverflow(int64_t new_bytes) { + auto new_size = value_data_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + Status Resize(int64_t capacity) override { // XXX Why is this check necessary? There is no reason to disallow, say, // binary arrays with more than 2**31 empty or null values. @@ -249,12 +257,8 @@ class BaseBinaryBuilder : public ArrayBuilder { /// \brief Ensures there is enough allocated capacity to append the indicated /// number of bytes to the value data buffer without additional allocations Status ReserveData(int64_t elements) { - const int64_t size = value_data_length() + elements; - ARROW_RETURN_IF(size > memory_limit(), - Status::CapacityError("Cannot reserve capacity larger than ", - memory_limit(), " bytes")); - return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements) - : Status::OK(); + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return value_data_builder_.Reserve(elements); } Status FinishInternal(std::shared_ptr* out) override { @@ -317,16 +321,9 @@ class BaseBinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - Status AppendOverflow(int64_t num_bytes) { - return Status::CapacityError("array cannot contain more than ", memory_limit(), - " bytes, have ", num_bytes); - } - Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) { - return AppendOverflow(num_bytes); - } + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); return offsets_builder_.Append(static_cast(num_bytes)); } @@ -462,6 +459,23 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); } + Status ValidateOverflow(int64_t new_bytes) const { + auto new_size = byte_builder_.length() + new_bytes; + if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { + return Status::CapacityError("array cannot contain more than ", memory_limit(), + " bytes, have ", new_size); + } else { + return Status::OK(); + } + } + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements) { + ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); + return byte_builder_.Reserve(elements); + } + void Reset() override; Status Resize(int64_t capacity) override; Status FinishInternal(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index b8af62fab14..c3786a8fab4 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -54,6 +54,18 @@ MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr& ke : MapBuilder(pool, key_builder, item_builder, map(key_builder->type(), item_builder->type(), keys_sorted)) {} +MapBuilder::MapBuilder(MemoryPool* pool, + const std::shared_ptr& struct_builder, + const std::shared_ptr& type) + : ArrayBuilder(pool) { + auto map_type = internal::checked_cast(type.get()); + keys_sorted_ = map_type->keys_sorted(); + key_builder_ = struct_builder->child_builder(0); + item_builder_ = struct_builder->child_builder(1); + list_builder_ = + std::make_shared(pool, struct_builder, struct_builder->type()); +} + Status MapBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(list_builder_->Resize(capacity)); capacity_ = list_builder_->capacity(); @@ -170,6 +182,19 @@ Status FixedSizeListBuilder::AppendNulls(int64_t length) { return value_builder_->AppendNulls(list_size_ * length); } +Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) { + auto new_length = value_builder_->length() + new_elements; + if (new_elements != list_size_) { + return Status::Invalid("Length of item not correct: expected ", list_size_, + " but got array of size ", new_elements); + } + if (new_length > maximum_elements()) { + return Status::CapacityError("array cannot contain more than ", maximum_elements(), + " elements, have ", new_elements); + } + return Status::OK(); +} + Status FixedSizeListBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity)); return ArrayBuilder::Resize(capacity); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index cd6fadfcc2f..b8948403acc 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -100,7 +100,7 @@ class BaseListBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); UnsafeAppendToBitmap(length, false); const int64_t num_values = value_builder_->length(); for (int64_t i = 0; i < length; ++i) { @@ -131,6 +131,16 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status ValidateOverflow(int64_t new_elements) const { + auto new_length = value_builder_->length() + new_elements; + if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { + return Status::CapacityError("List array cannot contain more than ", + maximum_elements(), " elements, have ", new_elements); + } else { + return Status::OK(); + } + } + ArrayBuilder* value_builder() const { return value_builder_.get(); } // Cannot make this a static attribute because of linking issues @@ -147,17 +157,8 @@ class BaseListBuilder : public ArrayBuilder { std::shared_ptr value_builder_; std::shared_ptr value_field_; - Status CheckNextOffset() const { - const int64_t num_values = value_builder_->length(); - ARROW_RETURN_IF( - num_values > maximum_elements(), - Status::CapacityError("List array cannot contain more than ", maximum_elements(), - " child elements,", " have ", num_values)); - return Status::OK(); - } - Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(CheckNextOffset()); + ARROW_RETURN_NOT_OK(ValidateOverflow(0)); const int64_t num_values = value_builder_->length(); return offsets_builder_.Append(static_cast(num_values)); } @@ -227,6 +228,9 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, const std::shared_ptr& item_builder, bool keys_sorted = false); + MapBuilder(MemoryPool* pool, const std::shared_ptr& item_builder, + const std::shared_ptr& type); + Status Resize(int64_t capacity) override; void Reset() override; Status FinishInternal(std::shared_ptr* out) override; @@ -276,6 +280,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { return map(key_builder_->type(), item_builder_->type(), keys_sorted_); } + Status ValidateOverflow(int64_t new_elements) { + return list_builder_->ValidateOverflow(new_elements); + } + protected: inline Status AdjustStructBuilderLength(); @@ -343,12 +351,19 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { /// automatically. Status AppendNulls(int64_t length) final; + Status ValidateOverflow(int64_t new_elements); + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); } + // Cannot make this a static attribute because of linking issues + static constexpr int64_t maximum_elements() { + return std::numeric_limits::max() - 1; + } + protected: std::shared_ptr value_field_; const int32_t list_size_; diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e6b1baa5879..cc907fb6b8a 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -31,6 +31,9 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {} + explicit NullBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool()) + : NullBuilder(pool) {} /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 52a3f334d4e..8560fa2d6f4 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -23,6 +23,7 @@ #include "arrow/buffer.h" #include "arrow/python/pyarrow.h" #include "arrow/python/visibility.h" +#include "arrow/result.h" #include "arrow/util/macros.h" namespace arrow { @@ -188,84 +189,80 @@ class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { struct PyBytesView { const char* bytes; Py_ssize_t size; + bool is_utf8; - PyBytesView() : bytes(NULLPTR), size(0), ref(NULLPTR) {} - - // View the given Python object as binary-like, i.e. bytes - Status FromBinary(PyObject* obj) { return FromBinary(obj, "a bytes object"); } - - Status FromString(PyObject* obj) { - bool ignored = false; - return FromString(obj, false, &ignored); + static Result FromString(PyObject* obj, bool check_utf8 = false) { + PyBytesView self; + ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8)); + return std::move(self); } - Status FromString(PyObject* obj, bool* is_utf8) { - return FromString(obj, true, is_utf8); + static Result FromUnicode(PyObject* obj) { + PyBytesView self; + ARROW_RETURN_NOT_OK(self.ParseUnicode(obj)); + return std::move(self); } - Status FromUnicode(PyObject* obj) { - Py_ssize_t size; - // The utf-8 representation is cached on the unicode object - const char* data = PyUnicode_AsUTF8AndSize(obj, &size); - RETURN_IF_PYERROR(); - this->bytes = data; - this->size = size; - this->ref.reset(); - return Status::OK(); + static Result FromBinary(PyObject* obj) { + PyBytesView self; + ARROW_RETURN_NOT_OK(self.ParseBinary(obj)); + return std::move(self); } - protected: - PyBytesView(const char* b, Py_ssize_t s, PyObject* obj = NULLPTR) - : bytes(b), size(s), ref(obj) {} - // View the given Python object as string-like, i.e. str or (utf8) bytes - Status FromString(PyObject* obj, bool check_utf8, bool* is_utf8) { + Status ParseString(PyObject* obj, bool check_utf8 = false) { if (PyUnicode_Check(obj)) { - *is_utf8 = true; - return FromUnicode(obj); + return ParseUnicode(obj); } else { - ARROW_RETURN_NOT_OK(FromBinary(obj, "a string or bytes object")); + ARROW_RETURN_NOT_OK(ParseBinary(obj)); if (check_utf8) { // Check the bytes are utf8 utf-8 OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size)); if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - *is_utf8 = true; + is_utf8 = true; } else { - *is_utf8 = false; PyErr_Clear(); + is_utf8 = false; } - } else { - *is_utf8 = false; } return Status::OK(); } } - Status FromBinary(PyObject* obj, const char* expected_msg) { + // View the given Python object as unicode string + Status ParseUnicode(PyObject* obj) { + // The utf-8 representation is cached on the unicode object + bytes = PyUnicode_AsUTF8AndSize(obj, &size); + RETURN_IF_PYERROR(); + is_utf8 = true; + return Status::OK(); + } + + // View the given Python object as binary-like, i.e. bytes + Status ParseBinary(PyObject* obj) { if (PyBytes_Check(obj)) { - this->bytes = PyBytes_AS_STRING(obj); - this->size = PyBytes_GET_SIZE(obj); - this->ref.reset(); - return Status::OK(); + bytes = PyBytes_AS_STRING(obj); + size = PyBytes_GET_SIZE(obj); + is_utf8 = false; } else if (PyByteArray_Check(obj)) { - this->bytes = PyByteArray_AS_STRING(obj); - this->size = PyByteArray_GET_SIZE(obj); - this->ref.reset(); - return Status::OK(); + bytes = PyByteArray_AS_STRING(obj); + size = PyByteArray_GET_SIZE(obj); + is_utf8 = false; } else if (PyMemoryView_Check(obj)) { - PyObject* contig_view = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); + PyObject* ref = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C'); RETURN_IF_PYERROR(); - this->ref.reset(contig_view); - Py_buffer* buf = PyMemoryView_GET_BUFFER(contig_view); - this->bytes = reinterpret_cast(buf->buf); - this->size = buf->len; - return Status::OK(); + Py_buffer* buffer = PyMemoryView_GET_BUFFER(ref); + bytes = reinterpret_cast(buffer->buf); + size = buffer->len; + is_utf8 = false; } else { - return Status::TypeError("Expected ", expected_msg, ", got a '", - Py_TYPE(obj)->tp_name, "' object"); + return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name, + "' object"); } + return Status::OK(); } + protected: OwnedRef ref; }; diff --git a/cpp/src/arrow/python/datetime.cc b/cpp/src/arrow/python/datetime.cc index 4eeab7f5a69..07df5e76b45 100644 --- a/cpp/src/arrow/python/datetime.cc +++ b/cpp/src/arrow/python/datetime.cc @@ -419,6 +419,15 @@ Result TzinfoToString(PyObject* tzinfo) { return PyTZInfo_utcoffset_hhmm(tzinfo); } + // try to look up zone attribute + if (PyObject_HasAttrString(tzinfo, "zone")) { + OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone")); + RETURN_IF_PYERROR(); + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result)); + return result; + } + // attempt to call tzinfo.tzname(None) OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); RETURN_IF_PYERROR(); diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index d1ce2c2f797..a75a887693c 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -620,39 +620,23 @@ class TypeInferrer { }; // Non-exhaustive type inference -Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, - std::shared_ptr* out_type) { +Result> InferArrowType(PyObject* obj, PyObject* mask, + bool pandas_null_sentinels) { if (pandas_null_sentinels) { // ARROW-842: If pandas is not installed then null checks will be less // comprehensive, but that is okay. internal::InitPandasStaticData(); } + std::shared_ptr out_type; TypeInferrer inferrer(pandas_null_sentinels); RETURN_NOT_OK(inferrer.VisitSequence(obj, mask)); - RETURN_NOT_OK(inferrer.GetType(out_type)); - if (*out_type == nullptr) { + RETURN_NOT_OK(inferrer.GetType(&out_type)); + if (out_type == nullptr) { return Status::TypeError("Unable to determine data type"); + } else { + return std::move(out_type); } - - return Status::OK(); -} - -Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, - int64_t* size, std::shared_ptr* out_type) { - if (!PySequence_Check(obj)) { - return Status::TypeError("Object is not a sequence"); - } - *size = static_cast(PySequence_Size(obj)); - - // For 0-length sequences, refuse to guess - if (*size == 0) { - *out_type = null(); - return Status::OK(); - } - RETURN_NOT_OK(InferArrowType(obj, mask, pandas_null_sentinels, out_type)); - - return Status::OK(); } ARROW_PYTHON_EXPORT diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h index 74d1b78161c..eff18362934 100644 --- a/cpp/src/arrow/python/inference.h +++ b/cpp/src/arrow/python/inference.h @@ -44,15 +44,9 @@ namespace py { /// \param[in] mask an optional mask where True values are null. May /// be nullptr /// \param[in] pandas_null_sentinels use pandas's null value markers -/// \param[out] out_type the inferred type ARROW_PYTHON_EXPORT -arrow::Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, - std::shared_ptr* out_type); - -ARROW_PYTHON_EXPORT -arrow::Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask, - bool pandas_null_sentinels, int64_t* size, - std::shared_ptr* out_type); +Result> InferArrowType(PyObject* obj, PyObject* mask, + bool pandas_null_sentinels); /// Checks whether the passed Python object is a boolean scalar ARROW_PYTHON_EXPORT diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index af608dfc360..2847c4aa6b5 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -314,11 +314,11 @@ Status NumPyConverter::Convert() { PyConversionOptions py_options; py_options.type = type_; py_options.from_pandas = from_pandas_; - std::shared_ptr res; - RETURN_NOT_OK(ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options, - &res)); - out_arrays_ = res->chunks(); + ARROW_ASSIGN_OR_RAISE( + auto chunked_array, + ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options, pool_)); + out_arrays_ = chunked_array->chunks(); return Status::OK(); } diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index b21c16af50a..80bda384bde 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -329,8 +329,7 @@ TEST(BuiltinConversionTest, TestMixedTypeFails) { ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); - std::shared_ptr arr; - ASSERT_RAISES(TypeError, ConvertPySequence(list, {}, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {})); } TEST_F(DecimalTest, FromPythonDecimalRescaleNotTruncateable) { @@ -422,17 +421,18 @@ TEST_F(DecimalTest, TestNoneAndNaN) { ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2)); ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3)); - std::shared_ptr arr, arr_from_pandas; PyConversionOptions options; - ASSERT_RAISES(TypeError, ConvertPySequence(list, options, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); options.from_pandas = true; - ASSERT_OK(ConvertPySequence(list, options, &arr_from_pandas)); - auto c0 = arr_from_pandas->chunk(0); - ASSERT_TRUE(c0->IsValid(0)); - ASSERT_TRUE(c0->IsNull(1)); - ASSERT_TRUE(c0->IsNull(2)); - ASSERT_TRUE(c0->IsNull(3)); + ASSERT_OK_AND_ASSIGN(auto chunked, ConvertPySequence(list, nullptr, options)); + ASSERT_EQ(chunked->num_chunks(), 1); + + auto arr = chunked->chunk(0); + ASSERT_TRUE(arr->IsValid(0)); + ASSERT_TRUE(arr->IsNull(1)); + ASSERT_TRUE(arr->IsNull(2)); + ASSERT_TRUE(arr->IsNull(3)); } TEST_F(DecimalTest, TestMixedPrecisionAndScale) { @@ -451,8 +451,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScale) { ASSERT_EQ(0, result); } - std::shared_ptr arr; - ASSERT_OK(ConvertPySequence(list, {}, &arr)); + ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, {})) const auto& type = checked_cast(*arr->type()); int32_t expected_precision = 9; @@ -476,9 +475,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) { ASSERT_EQ(PyList_SetItem(list, 0, value1), 0); ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); - std::shared_ptr arr; - ASSERT_OK(ConvertPySequence(list, {}, &arr)); - + ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, {})); const auto& type = checked_cast(*arr->type()); ASSERT_EQ(3, type.precision()); ASSERT_EQ(3, type.scale()); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 849c474ded3..252577eef01 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -35,6 +35,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/converter.h" #include "arrow/util/decimal.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" @@ -46,48 +47,60 @@ #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" +#include "arrow/visitor_inline.h" namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -namespace py { - -// ---------------------------------------------------------------------- -// NullCoding +using internal::Converter; +using internal::DictionaryConverter; +using internal::ListConverter; +using internal::PrimitiveConverter; +using internal::StructConverter; -enum class NullCoding : char { NONE_ONLY, PANDAS_SENTINELS }; +using internal::MakeChunker; +using internal::MakeConverter; -template -struct NullChecker {}; +namespace py { -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return obj == Py_None; } -}; +// Utility for converting single python objects to their intermediate C representations +// which can be fed to the typed builders +class PyValue { + public: + // Type aliases for shorter signature definitions + using I = PyObject*; + using O = PyConversionOptions; + + // Used for null checking before actually converting the values + static bool IsNull(const O& options, I obj) { + if (options.from_pandas) { + return internal::PandasObjectIsNull(obj); + } else { + return obj == Py_None; + } + } -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } -}; + // Used for post-conversion numpy NaT sentinel checking + static bool IsNaT(const TimestampType*, int64_t value) { + return internal::npy_traits::isnull(value); + } -// ---------------------------------------------------------------------- -// ValueConverters -// -// Typed conversion logic for single python objects are encapsulated in -// ValueConverter structs using SFINAE for specialization. -// -// The FromPython medthod is responsible to convert the python object to the -// C++ value counterpart which can be directly appended to the ArrayBuilder or -// Scalar can be constructed from. + // Used for post-conversion numpy NaT sentinel checking + static bool IsNaT(const DurationType*, int64_t value) { + return internal::npy_traits::isnull(value); + } -template -struct ValueConverter {}; + static Result Convert(const NullType*, const O&, I obj) { + if (obj == Py_None) { + return nullptr; + } else { + return Status::Invalid("Invalid null value"); + } + } -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const BooleanType*, const O&, I obj) { if (obj == Py_True) { return true; } else if (obj == Py_False) { @@ -98,38 +111,28 @@ struct ValueConverter { return internal::InvalidValue(obj, "tried to convert to boolean"); } } -}; -template -struct ValueConverter> { - using ValueType = typename Type::c_type; - - static inline Result FromPython(PyObject* obj) { - ValueType value; - arrow::Status s_ = internal::CIntFromPython(obj, &value); - if (!s_.ok() && !internal::PyIntScalar_Check(obj)) { + template + static enable_if_integer> Convert(const T*, const O&, + I obj) { + typename T::c_type value; + auto status = internal::CIntFromPython(obj, &value); + if (ARROW_PREDICT_TRUE(status.ok())) { + return value; + } else if (!internal::PyIntScalar_Check(obj)) { return internal::InvalidValue(obj, "tried to convert to int"); } else { - RETURN_NOT_OK(s_); + return status; } - return value; } -}; - -template <> -struct ValueConverter { - using ValueType = typename HalfFloatType::c_type; - static inline Result FromPython(PyObject* obj) { - ValueType value; + static Result Convert(const HalfFloatType*, const O&, I obj) { + uint16_t value; RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const FloatType*, const O&, I obj) { float value; if (internal::PyFloatScalar_Check(obj)) { value = static_cast(PyFloat_AsDouble(obj)); @@ -141,11 +144,8 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const DoubleType*, const O&, I obj) { double value; if (PyFloat_Check(obj)) { value = PyFloat_AS_DOUBLE(obj); @@ -160,11 +160,14 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const Decimal128Type* type, const O&, I obj) { + Decimal128 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + + static Result Convert(const Date32Type*, const O&, I obj) { int32_t value; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -175,16 +178,14 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj) { + static Result Convert(const Date64Type*, const O&, I obj) { int64_t value; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); value = internal::PyDateTime_to_ms(pydate); // Truncate any intraday milliseconds + // TODO: introduce an option for this value -= value % 86400000LL; } else if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -195,16 +196,11 @@ struct ValueConverter { } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool /*ignore_timezone*/) { + static Result Convert(const Time32Type* type, const O&, I obj) { int32_t value; if (PyTime_Check(obj)) { - // TODO(kszucs): consider to raise if a timezone aware time object is encountered - switch (unit) { + switch (type->unit()) { case TimeUnit::SECOND: value = static_cast(internal::PyTime_to_s(obj)); break; @@ -215,21 +211,15 @@ struct ValueConverter { return Status::UnknownError("Invalid time unit"); } } else { - // TODO(kszucs): validate maximum value? RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool /*ignore_timezone=*/) { + static Result Convert(const Time64Type* type, const O&, I obj) { int64_t value; if (PyTime_Check(obj)) { - // TODO(kszucs): consider to raise if a timezone aware time object is encountered - switch (unit) { + switch (type->unit()) { case TimeUnit::MICRO: value = internal::PyTime_to_us(obj); break; @@ -240,25 +230,21 @@ struct ValueConverter { return Status::UnknownError("Invalid time unit"); } } else { - // TODO(kszucs): validate maximum value? RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); } return value; } -}; -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool ignore_timezone) { - int64_t value; + static Result Convert(const TimestampType* type, const O& options, I obj) { + int64_t value, offset; if (PyDateTime_Check(obj)) { - ARROW_ASSIGN_OR_RAISE(int64_t offset, internal::PyDateTime_utcoffset_s(obj)); - if (ignore_timezone) { + if (ARROW_PREDICT_FALSE(options.ignore_timezone)) { offset = 0; + } else { + ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj)); } auto dt = reinterpret_cast(obj); - switch (unit) { + switch (type->unit()) { case TimeUnit::SECOND: value = internal::PyDateTime_to_s(dt) - offset; break; @@ -282,38 +268,26 @@ struct ValueConverter { default: return Status::UnknownError("Invalid time unit"); } + } else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(*type)) { + return Status::NotImplemented("Expected np.datetime64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result FromNumpy(PyObject* obj, TimeUnit::type unit) { - // validate that the numpy scalar has np.datetime64 dtype - std::shared_ptr type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != TimestampType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.datetime64 but got: ", type->ToString()); - } - // validate that the time units are matching - if (unit != checked_cast(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.datetime64 objects with differing unit"); - } - // convert the numpy value - return reinterpret_cast(obj)->obval; - } -}; - -template <> -struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit, - bool /*ignore_timezone*/) { + static Result Convert(const DurationType* type, const O&, I obj) { int64_t value; if (PyDelta_Check(obj)) { auto dt = reinterpret_cast(obj); - switch (unit) { + switch (type->unit()) { case TimeUnit::SECOND: value = internal::PyDelta_to_s(dt); break; @@ -329,985 +303,655 @@ struct ValueConverter { default: return Status::UnknownError("Invalid time unit"); } + } else if (PyArray_CheckAnyScalarExact(obj)) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr numpy_type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type)); + if (!numpy_type->Equals(*type)) { + return Status::NotImplemented("Expected np.timedelta64 but got: ", + numpy_type->ToString()); + } + return reinterpret_cast(obj)->obval; } else { RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); } return value; } - static inline Result FromNumpy(PyObject* obj, TimeUnit::type unit) { - // validate that the numpy scalar has np.timedelta64 dtype - std::shared_ptr type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != DurationType::type_id) { - // TODO(kszucs): the message should highlight the received numpy dtype - return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); - } - // validate that the time units are matching - if (unit != checked_cast(*type).unit()) { - return Status::NotImplemented( - "Cannot convert NumPy np.timedelta64 objects with differing unit"); - } - // convert the numpy value - return reinterpret_cast(obj)->obval; - } -}; - -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return std::move(view); - } -}; + // The binary-like intermediate representation is PyBytesView because it keeps temporary + // python objects alive (non-contiguous memoryview) and stores whether the original + // object was unicode encoded or not, which is used for unicode -> bytes coersion if + // there is a non-unicode object observed. -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj) { - // strict conversion, force output to be unicode / utf8 and validate that - // any binary values are utf8 - bool is_utf8 = false; - PyBytesView view; - - RETURN_NOT_OK(view.FromString(obj, &is_utf8)); - if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); - } - return std::move(view); + static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) { + return view.ParseString(obj); } - static inline Result FromPython(PyObject* obj, bool* is_utf8) { - PyBytesView view; - - // Non-strict conversion; keep track of whether values are unicode or bytes - if (PyUnicode_Check(obj)) { - *is_utf8 = true; - RETURN_NOT_OK(view.FromUnicode(obj)); - } else { - // If not unicode or bytes, FromBinary will error - *is_utf8 = false; - RETURN_NOT_OK(view.FromBinary(obj)); - } - return std::move(view); - } -}; - -template -struct ValueConverter> { - static inline Result FromPython(PyObject* obj, int32_t byte_width) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - if (ARROW_PREDICT_FALSE(view.size != byte_width)) { + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, + PyBytesView& view) { + ARROW_RETURN_NOT_OK(view.ParseString(obj)); + if (view.size != type->byte_width()) { std::stringstream ss; - ss << "expected to be length " << byte_width << " was " << view.size; + ss << "expected to be length " << type->byte_width() << " was " << view.size; return internal::InvalidValue(obj, ss.str()); } else { - return std::move(view); + return Status::OK(); } } -}; - -// ---------------------------------------------------------------------- -// Sequence converter base and CRTP "middle" subclasses - -class SeqConverter; - -// Forward-declare converter factory -Status GetConverter(const std::shared_ptr& type, bool from_pandas, - bool strict_conversions, bool ignore_timezone, - std::unique_ptr* out); - -// Marshal Python sequence (list, tuple, etc.) to Arrow array -class SeqConverter { - public: - virtual ~SeqConverter() = default; - - // Initialize the sequence converter with an ArrayBuilder created - // externally. The reason for this interface is that we have - // arrow::MakeBuilder which also creates child builders for nested types, so - // we have to pass in the child builders to child SeqConverter in the case of - // converting Python objects to Arrow nested types - virtual Status Init(ArrayBuilder* builder) = 0; - - // Append a single null value to the builder - virtual Status AppendNull() = 0; - // Append a valid value - virtual Status AppendValue(PyObject* seq) = 0; - - // Append a single python object handling Null values - virtual Status Append(PyObject* seq) = 0; - - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status Extend(PyObject* seq, int64_t size) = 0; - - // Append the contents of a Python sequence to the underlying builder, - // virtual version - virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; - - virtual Status Close() { - if (chunks_.size() == 0 || builder_->length() > 0) { - std::shared_ptr last_chunk; - RETURN_NOT_OK(builder_->Finish(&last_chunk)); - chunks_.emplace_back(std::move(last_chunk)); + template + static enable_if_string Convert(const T*, const O& options, I obj, + PyBytesView& view) { + if (options.strict) { + // Strict conversion, force output to be unicode / utf8 and validate that + // any binary values are utf8 + ARROW_RETURN_NOT_OK(view.ParseString(obj, true)); + if (!view.is_utf8) { + return internal::InvalidValue(obj, "was not a utf8 string"); + } + return Status::OK(); + } else { + // Non-strict conversion; keep track of whether values are unicode or bytes + return view.ParseString(obj); } - return Status::OK(); } - virtual Status GetResult(std::shared_ptr* out) { - // Still some accumulated data in the builder. If there are no chunks, we - // always call Finish to deal with the edge case where a size-0 sequence - // was converted with a specific output type, like array([], type=t) - RETURN_NOT_OK(Close()); - *out = std::make_shared(this->chunks_, builder_->type()); - return Status::OK(); + static Result Convert(const DataType* type, const O&, I obj) { + return Status::NotImplemented("PyValue::Convert is not implemented for type ", type); } - - ArrayBuilder* builder() const { return builder_; } - - int num_chunks() const { return static_cast(chunks_.size()); } - - protected: - ArrayBuilder* builder_; - bool unfinished_builder_; - std::vector> chunks_; }; -template -class TypedConverter : public SeqConverter { - public: - using BuilderType = typename TypeTraits::BuilderType; - - Status Init(ArrayBuilder* builder) override { - builder_ = builder; - DCHECK_NE(builder_, nullptr); - typed_builder_ = checked_cast(builder); - return Status::OK(); - } - - // Append a missing item (default implementation) - Status AppendNull() override { return this->typed_builder_->AppendNull(); } - - // Append null if the obj is None or pandas null otherwise the valid value - Status Append(PyObject* obj) override { - return NullChecker::Check(obj) ? AppendNull() : AppendValue(obj); - } - - Status Extend(PyObject* obj, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence( - obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); - } - - Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } +template +Status Extend(T* converter, PyObject* values, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(values, [converter](PyObject* item, bool* /* unused */) { + return converter->Append(item); + }); +} - protected: - BuilderType* typed_builder_; -}; +// Convert and append a sequence of values masked with a numpy array +template +Status ExtendMasked(T* converter, PyObject* values, PyObject* mask, int64_t size) { + /// Ensure we've allocated enough space + RETURN_NOT_OK(converter->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, [converter](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return converter->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return converter->Append(item); // perhaps use AppendValue instead? + } + }); +} -// ---------------------------------------------------------------------- -// Sequence converter for null type +// The base Converter class is a mixin with predefined behavior and constructors. +using PyConverter = Converter; -template -class NullConverter : public TypedConverter { - public: - Status AppendValue(PyObject* obj) override { - return internal::InvalidValue(obj, "converting to null type"); - } -}; +template +class PyPrimitiveConverter; -// ---------------------------------------------------------------------- -// Sequence converter template for primitive (integer and floating point bool) types +template +class PyListConverter; -template -class PrimitiveConverter : public TypedConverter { - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); - return this->typed_builder_->Append(value); - } -}; +template +class PyDictionaryConverter; -// ---------------------------------------------------------------------- -// Sequence converters for temporal types +class PyStructConverter; -template -class TimeConverter : public TypedConverter { - public: - explicit TimeConverter(TimeUnit::type unit, bool ignore_timezone) - : unit_(unit), ignore_timezone_(ignore_timezone) {} - - // TODO(kszucs): support numpy values for date and time converters - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto value, - ValueConverter::FromPython(obj, unit_, ignore_timezone_)); - return this->typed_builder_->Append(value); - } +template +struct PyConverterTrait; - protected: - TimeUnit::type unit_; - bool ignore_timezone_; +template +struct PyConverterTrait< + T, enable_if_t::value && !is_interval_type::value && + !is_extension_type::value>> { + using type = PyPrimitiveConverter; }; -// TODO(kszucs): move it to the type_traits template -struct NumpyType {}; +struct PyConverterTrait> { + using type = PyListConverter; +}; template <> -struct NumpyType { - static inline bool isnull(int64_t v) { - return internal::npy_traits::isnull(v); - } +struct PyConverterTrait { + using type = PyStructConverter; }; template <> -struct NumpyType { - static inline bool isnull(int64_t v) { - return internal::npy_traits::isnull(v); - } +struct PyConverterTrait { + template + using dictionary_type = PyDictionaryConverter; }; -template -class TemporalConverter : public TimeConverter { +template +class PyPrimitiveConverter> + : public PrimitiveConverter { public: - using TimeConverter::TimeConverter; - - Status AppendValue(PyObject* obj) override { - int64_t value; - if (PyArray_CheckAnyScalarExact(obj)) { - // convert np.datetime64 / np.timedelta64 depending on Type - ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromNumpy(obj, this->unit_)); - if (NumpyType::isnull(value)) { - // checks numpy NaT sentinel after conversion - return this->typed_builder_->AppendNull(); - } + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); } else { ARROW_ASSIGN_OR_RAISE( - value, - ValueConverter::FromPython( - obj, this->unit_, TimeConverter::ignore_timezone_)); + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); } - return this->typed_builder_->Append(value); } }; -// ---------------------------------------------------------------------- -// Sequence converters for Binary, FixedSizeBinary, String - -template -class BinaryLikeConverter : public TypedConverter { +template +class PyPrimitiveConverter< + T, enable_if_t::value || is_number_type::value || + is_decimal_type::value || is_date_type::value || + is_time_type::value>> : public PrimitiveConverter { public: - using BuilderType = typename TypeTraits::BuilderType; - - inline Status AutoChunk(Py_ssize_t size) { - // did we reach the builder size limit? - if (ARROW_PREDICT_FALSE(this->typed_builder_->value_data_length() + size > - BuilderType::memory_limit())) { - // builder would be full, so need to add a new chunk - std::shared_ptr chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); + Status Append(PyObject* value) override { + // Since the required space has been already allocated in the Extend functions we can + // rely on the Unsafe builder API which improves the performance. + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + this->primitive_builder_->UnsafeAppend(converted); } return Status::OK(); } +}; - Status AppendString(const PyBytesView& view) { - // check that the value fits in the datatype - if (view.size > BuilderType::memory_limit()) { - return Status::Invalid("string too large for datatype"); +template +class PyPrimitiveConverter< + T, enable_if_t::value || is_duration_type::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + // Numpy NaT sentinels can be checked after the conversion + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + this->primitive_builder_->UnsafeAppend(converted); + } } - DCHECK_GE(view.size, 0); - - // create a new chunk if the value would overflow the builder - RETURN_NOT_OK(AutoChunk(view.size)); + return Status::OK(); + } +}; - // now we can safely append the value to the builder - RETURN_NOT_OK( - this->typed_builder_->Append(::arrow::util::string_view(view.bytes, view.size))); +template +class PyPrimitiveConverter> + : public PrimitiveConverter { + public: + using OffsetType = typename T::offset_type; + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + // Since we don't know the varying length input size in advance, we need to + // reserve space in the value builder one by one. ReserveData raises CapacityError + // if the value would not fit into the array. + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes, + static_cast(view_.size)); + } return Status::OK(); } protected: // Create a single instance of PyBytesView here to prevent unnecessary object - // creation/destruction - PyBytesView string_view_; + // creation/destruction. This significantly improves the conversion performance. + PyBytesView view_; }; -template -class BinaryConverter : public BinaryLikeConverter { - public: - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE(auto view, ValueConverter::FromPython(obj)); - return this->AppendString(view); - } -}; - -template -class FixedSizeBinaryConverter - : public BinaryLikeConverter { +template +class PyPrimitiveConverter::value>> + : public PrimitiveConverter { public: - explicit FixedSizeBinaryConverter(int32_t byte_width) : byte_width_(byte_width) {} - - Status AppendValue(PyObject* obj) override { - ARROW_ASSIGN_OR_RAISE( - this->string_view_, - ValueConverter::FromPython(obj, byte_width_)); - return this->AppendString(this->string_view_); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes); + } + return Status::OK(); } protected: - int32_t byte_width_; + PyBytesView view_; }; -// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, -// otherwise we allow but return results as BinaryArray -template -class StringConverter : public BinaryLikeConverter { +template +class PyPrimitiveConverter> + : public PrimitiveConverter { public: - StringConverter() : binary_count_(0) {} + using OffsetType = typename T::offset_type; - Status AppendValue(PyObject* obj) override { - if (Strict) { - // raise if the object is not unicode or not an utf-8 encoded bytes - ARROW_ASSIGN_OR_RAISE(this->string_view_, ValueConverter::FromPython(obj)); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); } else { - // keep track of whether values are unicode or bytes; if any bytes are - // observe, the result will be bytes - bool is_utf8; - ARROW_ASSIGN_OR_RAISE(this->string_view_, - ValueConverter::FromPython(obj, &is_utf8)); - if (!is_utf8) { - ++binary_count_; + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + if (!view_.is_utf8) { + // observed binary value + observed_binary_ = true; } + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes, + static_cast(view_.size)); } - return this->AppendString(this->string_view_); + return Status::OK(); } - Status GetResult(std::shared_ptr* out) override { - RETURN_NOT_OK(SeqConverter::GetResult(out)); - - // If we saw any non-unicode, cast results to BinaryArray - if (binary_count_) { - // We should have bailed out earlier - DCHECK(!Strict); - auto binary_type = TypeTraits::type_singleton(); - return (*out)->View(binary_type).Value(out); + Result> ToArray() override { + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); + if (observed_binary_) { + // if we saw any non-unicode, cast results to BinaryArray + auto binary_type = TypeTraits::type_singleton(); + return array->View(binary_type); + } else { + return array; } - return Status::OK(); } protected: - int64_t binary_count_; + PyBytesView view_; + bool observed_binary_ = false; }; -// ---------------------------------------------------------------------- -// Convert lists (NumPy arrays containing lists or ndarrays as values) - -// If the value type does not match the expected NumPy dtype, then fall through -// to a slower PySequence-based path -#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ - case Type::TYPE: { \ - if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \ - return value_converter_->Extend(obj, value_length); \ - } \ - return AppendNdarrayTypedItem(arr); \ - } - -// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise -#define LIST_SLOW_CASE(TYPE) \ - case Type::TYPE: { \ - return value_converter_->Extend(obj, value_length); \ +template +class PyDictionaryConverter> + : public DictionaryConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(converted); + } } +}; -// Base class for ListConverter and FixedSizeListConverter (to have both work with CRTP) -template -class BaseListConverter : public TypedConverter { +template +class PyDictionaryConverter> + : public DictionaryConverter { public: - using BuilderType = typename TypeTraits::BuilderType; - - explicit BaseListConverter(bool from_pandas, bool strict_conversions, - bool ignore_timezone) - : from_pandas_(from_pandas), - strict_conversions_(strict_conversions), - ignore_timezone_(ignore_timezone) {} - - Status Init(ArrayBuilder* builder) override { - this->builder_ = builder; - this->typed_builder_ = checked_cast(builder); - - this->value_type_ = checked_cast(*builder->type()).value_type(); - RETURN_NOT_OK(GetConverter(value_type_, from_pandas_, strict_conversions_, - ignore_timezone_, &value_converter_)); - return this->value_converter_->Init(this->typed_builder_->value_builder()); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->value_type_, this->options_, value, view_)); + return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + } } - template - Status AppendNdarrayTypedItem(PyArrayObject* arr) { - using traits = internal::npy_traits; - using T = typename traits::value_type; - using ValueBuilderType = typename TypeTraits::BuilderType; + protected: + PyBytesView view_; +}; + +template +class PyListConverter : public ListConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->list_builder_->AppendNull(); + } - const bool null_sentinels_possible = - // Always treat Numpy's NaT as null - NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || - // Observing pandas's null sentinels - (from_pandas_ && traits::supports_nulls); + RETURN_NOT_OK(this->list_builder_->Append()); + if (PyArray_Check(value)) { + RETURN_NOT_OK(AppendNdarray(value)); + } else if (PySequence_Check(value)) { + RETURN_NOT_OK(AppendSequence(value)); + } else { + return internal::InvalidType( + value, "was not a sequence or recognized null for conversion to list type"); + } - auto child_builder = checked_cast(value_converter_->builder()); + return ValidateBuilder(this->list_type_); + } - // TODO(wesm): Vector append when not strided - Ndarray1DIndexer values(arr); - if (null_sentinels_possible) { - for (int64_t i = 0; i < values.size(); ++i) { - if (traits::isnull(values[i])) { - RETURN_NOT_OK(child_builder->AppendNull()); - } else { - RETURN_NOT_OK(child_builder->Append(values[i])); - } - } + protected: + Status ValidateBuilder(const MapType*) { + if (this->list_builder_->key_builder()->null_count() > 0) { + return Status::Invalid("Invalid Map: key field can not contain null values"); } else { - for (int64_t i = 0; i < values.size(); ++i) { - RETURN_NOT_OK(child_builder->Append(values[i])); - } + return Status::OK(); } - return Status::OK(); } - Status AppendNdarrayItem(PyObject* obj) { - PyArrayObject* arr = reinterpret_cast(obj); + Status ValidateBuilder(const BaseListType*) { return Status::OK(); } + + Status AppendSequence(PyObject* value) { + int64_t size = static_cast(PySequence_Size(value)); + RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); + return Extend(this->value_converter_.get(), value, size); + } - if (PyArray_NDIM(arr) != 1) { + Status AppendNdarray(PyObject* value) { + PyArrayObject* ndarray = reinterpret_cast(value); + if (PyArray_NDIM(ndarray) != 1) { return Status::Invalid("Can only convert 1-dimensional array values"); } + const int64_t size = PyArray_SIZE(ndarray); + RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); - const int64_t value_length = PyArray_SIZE(arr); - - switch (value_type_->id()) { + const auto value_type = this->value_converter_->builder()->type(); + switch (value_type->id()) { +// If the value type does not match the expected NumPy dtype, then fall through +// to a slower PySequence-based path +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return Extend(this->value_converter_.get(), value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ + } +// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise +#define LIST_SLOW_CASE(TYPE_ID) \ + case Type::TYPE_ID: { \ + return Extend(this->value_converter_.get(), value, size); \ + } LIST_SLOW_CASE(NA) - LIST_FAST_CASE(UINT8, NPY_UINT8, UInt8Type) - LIST_FAST_CASE(INT8, NPY_INT8, Int8Type) - LIST_FAST_CASE(UINT16, NPY_UINT16, UInt16Type) - LIST_FAST_CASE(INT16, NPY_INT16, Int16Type) - LIST_FAST_CASE(UINT32, NPY_UINT32, UInt32Type) - LIST_FAST_CASE(INT32, NPY_INT32, Int32Type) - LIST_FAST_CASE(UINT64, NPY_UINT64, UInt64Type) - LIST_FAST_CASE(INT64, NPY_INT64, Int64Type) + LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8) + LIST_FAST_CASE(INT8, Int8Type, NPY_INT8) + LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16) + LIST_FAST_CASE(INT16, Int16Type, NPY_INT16) + LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32) + LIST_FAST_CASE(INT32, Int32Type, NPY_INT32) + LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64) + LIST_FAST_CASE(INT64, Int64Type, NPY_INT64) + LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16) + LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT) + LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE) + LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME) + LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA) LIST_SLOW_CASE(DATE32) LIST_SLOW_CASE(DATE64) LIST_SLOW_CASE(TIME32) LIST_SLOW_CASE(TIME64) - LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType) - LIST_FAST_CASE(DURATION, NPY_TIMEDELTA, DurationType) - LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType) - LIST_FAST_CASE(FLOAT, NPY_FLOAT, FloatType) - LIST_FAST_CASE(DOUBLE, NPY_DOUBLE, DoubleType) LIST_SLOW_CASE(BINARY) LIST_SLOW_CASE(FIXED_SIZE_BINARY) LIST_SLOW_CASE(STRING) +#undef LIST_FAST_CASE +#undef LIST_SLOW_CASE case Type::LIST: { - if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) { + if (PyArray_DESCR(ndarray)->type_num != NPY_OBJECT) { return Status::Invalid( - "Can only convert list types from NumPy object " - "array input"); + "Can only convert list types from NumPy object array input"); } - return internal::VisitSequence(obj, [this](PyObject* item, bool*) { - return value_converter_->Append(item); - }); + return Extend(this->value_converter_.get(), value, /*reserved=*/0); } default: { - return Status::TypeError("Unknown list item type: ", value_type_->ToString()); + return Status::TypeError("Unknown list item type: ", value_type->ToString()); } } } - Status AppendValue(PyObject* obj) override { - RETURN_NOT_OK(this->typed_builder_->Append()); - if (PyArray_Check(obj)) { - return AppendNdarrayItem(obj); - } - if (!PySequence_Check(obj)) { - return internal::InvalidType(obj, - "was not a sequence or recognized null" - " for conversion to list type"); - } - int64_t list_size = static_cast(PySequence_Size(obj)); - return value_converter_->Extend(obj, list_size); - } - - Status GetResult(std::shared_ptr* out) override { - // TODO: Improved handling of chunked children - if (value_converter_->num_chunks() > 0) { - return Status::Invalid("List child type ", - value_converter_->builder()->type()->ToString(), - " overflowed the capacity of a single chunk"); - } - return SeqConverter::GetResult(out); - } - - protected: - std::shared_ptr value_type_; - std::unique_ptr value_converter_; - const bool from_pandas_; - const bool strict_conversions_; - const bool ignore_timezone_; -}; - -template -class ListConverter : public BaseListConverter { - public: - using BASE = BaseListConverter; - using BASE::BASE; -}; + template + Status AppendNdarrayTyped(PyArrayObject* ndarray) { + // no need to go through the conversion + using NumpyTrait = internal::npy_traits; + using NumpyType = typename NumpyTrait::value_type; + using ValueBuilderType = typename TypeTraits::BuilderType; -template -class FixedSizeListConverter : public BaseListConverter { - public: - using BASE = BaseListConverter; - using BASE::BASE; + const bool null_sentinels_possible = + // Always treat Numpy's NaT as null + NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || + // Observing pandas's null sentinels + (this->options_.from_pandas && NumpyTrait::supports_nulls); - Status Init(ArrayBuilder* builder) override { - RETURN_NOT_OK(BASE::Init(builder)); - list_size_ = checked_pointer_cast(builder->type())->list_size(); - return Status::OK(); - } + auto value_builder = + checked_cast(this->value_converter_->builder().get()); - Status AppendValue(PyObject* obj) override { - // the same as BaseListConverter but with additional length checks - RETURN_NOT_OK(this->typed_builder_->Append()); - if (PyArray_Check(obj)) { - int64_t list_size = static_cast(PyArray_Size(obj)); - if (list_size != list_size_) { - return Status::Invalid("Length of item not correct: expected ", list_size_, - " but got array of size ", list_size); + // TODO(wesm): Vector append when not strided + Ndarray1DIndexer values(ndarray); + if (null_sentinels_possible) { + for (int64_t i = 0; i < values.size(); ++i) { + if (NumpyTrait::isnull(values[i])) { + RETURN_NOT_OK(value_builder->AppendNull()); + } else { + RETURN_NOT_OK(value_builder->Append(values[i])); + } + } + } else { + for (int64_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(value_builder->Append(values[i])); } - return this->AppendNdarrayItem(obj); - } - if (!PySequence_Check(obj)) { - return internal::InvalidType(obj, - "was not a sequence or recognized null" - " for conversion to list type"); - } - int64_t list_size = static_cast(PySequence_Size(obj)); - if (list_size != list_size_) { - return Status::Invalid("Length of item not correct: expected ", list_size_, - " but got list of size ", list_size); } - return this->value_converter_->Extend(obj, list_size); + return Status::OK(); } - - protected: - int64_t list_size_; }; -// ---------------------------------------------------------------------- -// Convert maps - -// Define a MapConverter as a ListConverter that uses MapBuilder.value_builder -// to append struct of key/value pairs -template -class MapConverter : public BaseListConverter { +class PyStructConverter : public StructConverter { public: - using BASE = BaseListConverter; - - explicit MapConverter(bool from_pandas, bool strict_conversions, bool ignore_timezone) - : BASE(from_pandas, strict_conversions, ignore_timezone), key_builder_(nullptr) {} - - Status Append(PyObject* obj) override { - RETURN_NOT_OK(BASE::Append(obj)); - return VerifyLastStructAppended(); - } - - Status Extend(PyObject* seq, int64_t size) override { - RETURN_NOT_OK(BASE::Extend(seq, size)); - return VerifyLastStructAppended(); - } - - Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) override { - RETURN_NOT_OK(BASE::ExtendMasked(seq, mask, size)); - return VerifyLastStructAppended(); - } - - protected: - Status VerifyLastStructAppended() { - // The struct_builder may not have field_builders initialized in constructor, so - // assign key_builder lazily - if (key_builder_ == nullptr) { - auto struct_builder = - checked_cast(BASE::value_converter_->builder()); - key_builder_ = struct_builder->field_builder(0); + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->struct_builder_->AppendNull(); } - if (key_builder_->null_count() > 0) { - return Status::Invalid("Invalid Map: key field can not contain null values"); + switch (input_kind_) { + case InputKind::DICT: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendDict(value); + case InputKind::TUPLE: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendTuple(value); + case InputKind::ITEMS: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendItems(value); + default: + RETURN_NOT_OK(InferInputKind(value)); + return Append(value); } - return Status::OK(); } - private: - ArrayBuilder* key_builder_; -}; - -// ---------------------------------------------------------------------- -// Convert structs + protected: + Status Init(MemoryPool* pool) override { + RETURN_NOT_OK((StructConverter::Init(pool))); -template -class StructConverter : public TypedConverter { - public: - explicit StructConverter(bool from_pandas, bool strict_conversions, - bool ignore_timezone) - : from_pandas_(from_pandas), - strict_conversions_(strict_conversions), - ignore_timezone_(ignore_timezone) {} - - Status Init(ArrayBuilder* builder) override { - this->builder_ = builder; - this->typed_builder_ = checked_cast(builder); - auto struct_type = checked_pointer_cast(builder->type()); - - num_fields_ = this->typed_builder_->num_fields(); - DCHECK_EQ(num_fields_, struct_type->num_fields()); - - field_name_bytes_list_.reset(PyList_New(num_fields_)); - field_name_unicode_list_.reset(PyList_New(num_fields_)); + // Store the field names as a PyObjects for dict matching + num_fields_ = this->struct_type_->num_fields(); + bytes_field_names_.reset(PyList_New(num_fields_)); + unicode_field_names_.reset(PyList_New(num_fields_)); RETURN_IF_PYERROR(); - // Initialize the child converters and field names for (int i = 0; i < num_fields_; i++) { - const std::string& field_name(struct_type->field(i)->name()); - std::shared_ptr field_type(struct_type->field(i)->type()); - - std::unique_ptr value_converter; - RETURN_NOT_OK(GetConverter(field_type, from_pandas_, strict_conversions_, - ignore_timezone_, &value_converter)); - RETURN_NOT_OK(value_converter->Init(this->typed_builder_->field_builder(i))); - value_converters_.push_back(std::move(value_converter)); - - // Store the field name as a PyObject, for dict matching - PyObject* bytesobj = - PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); - PyObject* unicodeobj = + const auto& field_name = this->struct_type_->field(i)->name(); + PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); + PyObject* unicode = PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size()); RETURN_IF_PYERROR(); - PyList_SET_ITEM(field_name_bytes_list_.obj(), i, bytesobj); - PyList_SET_ITEM(field_name_unicode_list_.obj(), i, unicodeobj); + PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes); + PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode); } - return Status::OK(); } - Status AppendValue(PyObject* obj) override { - RETURN_NOT_OK(this->typed_builder_->Append()); - // Note heterogeneous sequences are not allowed - if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) { - if (PyDict_Check(obj)) { - source_kind_ = SourceKind::DICTS; - } else if (PyTuple_Check(obj)) { - source_kind_ = SourceKind::TUPLES; - } - } - if (PyDict_Check(obj) && source_kind_ == SourceKind::DICTS) { - return AppendDictItem(obj); - } else if (PyTuple_Check(obj) && source_kind_ == SourceKind::TUPLES) { - return AppendTupleItem(obj); + Status InferInputKind(PyObject* value) { + // Infer input object's type, note that heterogeneous sequences are not allowed + if (PyDict_Check(value)) { + input_kind_ = InputKind::DICT; + } else if (PyTuple_Check(value)) { + input_kind_ = InputKind::TUPLE; + } else if (PySequence_Check(value)) { + input_kind_ = InputKind::ITEMS; } else { - return internal::InvalidType(obj, - "was not a dict, tuple, or recognized null value" - " for conversion to struct type"); + return internal::InvalidType(value, + "was not a dict, tuple, or recognized null value " + "for conversion to struct type"); } + return Status::OK(); } - // Append a missing item - Status AppendNull() override { return this->typed_builder_->AppendNull(); } + Status InferKeyKind(PyObject* items) { + for (int i = 0; i < PySequence_Length(items); i++) { + // retrieve the key from the passed key-value pairs + ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); - protected: - Status AppendDictItem(PyObject* obj) { - if (dict_key_kind_ == DictKeyKind::UNICODE) { - return AppendDictItemWithUnicodeKeys(obj); - } - if (dict_key_kind_ == DictKeyKind::BYTES) { - return AppendDictItemWithBytesKeys(obj); - } - for (int i = 0; i < num_fields_; i++) { - PyObject* nameobj = PyList_GET_ITEM(field_name_unicode_list_.obj(), i); - PyObject* valueobj = PyDict_GetItem(obj, nameobj); - if (valueobj != NULL) { - dict_key_kind_ = DictKeyKind::UNICODE; - return AppendDictItemWithUnicodeKeys(obj); - } + // check key exists between the unicode field names + bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first); RETURN_IF_PYERROR(); - // Unicode key not present, perhaps bytes key is? - nameobj = PyList_GET_ITEM(field_name_bytes_list_.obj(), i); - valueobj = PyDict_GetItem(obj, nameobj); - if (valueobj != NULL) { - dict_key_kind_ = DictKeyKind::BYTES; - return AppendDictItemWithBytesKeys(obj); + if (do_contain) { + key_kind_ = KeyKind::UNICODE; + return Status::OK(); } + + // check key exists between the bytes field names + do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first); RETURN_IF_PYERROR(); - } - // If we come here, it means all keys are absent - for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(value_converters_[i]->Append(Py_None)); + if (do_contain) { + key_kind_ = KeyKind::BYTES; + return Status::OK(); + } } return Status::OK(); } - Status AppendDictItemWithBytesKeys(PyObject* obj) { - return AppendDictItem(obj, field_name_bytes_list_.obj()); - } - - Status AppendDictItemWithUnicodeKeys(PyObject* obj) { - return AppendDictItem(obj, field_name_unicode_list_.obj()); - } - - Status AppendDictItem(PyObject* obj, PyObject* field_name_list) { - // NOTE we're ignoring any extraneous dict items + Status AppendEmpty() { for (int i = 0; i < num_fields_; i++) { - PyObject* nameobj = PyList_GET_ITEM(field_name_list, i); // borrowed - PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed - if (valueobj == NULL) { - RETURN_IF_PYERROR(); - } - RETURN_NOT_OK(value_converters_[i]->Append(valueobj ? valueobj : Py_None)); + RETURN_NOT_OK(this->children_[i]->Append(Py_None)); } return Status::OK(); } - Status AppendTupleItem(PyObject* obj) { - if (PyTuple_GET_SIZE(obj) != num_fields_) { + Status AppendTuple(PyObject* tuple) { + if (!PyTuple_Check(tuple)) { + return internal::InvalidType(tuple, "was expecting a tuple"); + } + if (PyTuple_GET_SIZE(tuple) != num_fields_) { return Status::Invalid("Tuple size must be equal to number of struct fields"); } for (int i = 0; i < num_fields_; i++) { - PyObject* valueobj = PyTuple_GET_ITEM(obj, i); - RETURN_NOT_OK(value_converters_[i]->Append(valueobj)); + PyObject* value = PyTuple_GET_ITEM(tuple, i); + RETURN_NOT_OK(this->children_[i]->Append(value)); } return Status::OK(); } - std::vector> value_converters_; - OwnedRef field_name_unicode_list_; - OwnedRef field_name_bytes_list_; - int num_fields_; - // Whether we're converting from a sequence of dicts or tuples - enum class SourceKind { UNKNOWN, DICTS, TUPLES } source_kind_ = SourceKind::UNKNOWN; - enum class DictKeyKind { - UNKNOWN, - BYTES, - UNICODE - } dict_key_kind_ = DictKeyKind::UNKNOWN; - bool from_pandas_; - bool strict_conversions_; - bool ignore_timezone_; -}; - -template -class DecimalConverter : public TypedConverter { - public: - using BASE = TypedConverter; - - Status Init(ArrayBuilder* builder) override { - RETURN_NOT_OK(BASE::Init(builder)); - decimal_type_ = checked_pointer_cast(this->typed_builder_->type()); - return Status::OK(); + Status AppendDict(PyObject* dict) { + if (!PyDict_Check(dict)) { + return internal::InvalidType(dict, "was expecting a dict"); + } + switch (key_kind_) { + case KeyKind::UNICODE: + return AppendDict(dict, unicode_field_names_.obj()); + case KeyKind::BYTES: + return AppendDict(dict, bytes_field_names_.obj()); + default: + RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict))); + if (key_kind_ == KeyKind::UNKNOWN) { + // was unable to infer the type which means that all keys are absent + return AppendEmpty(); + } else { + return AppendDict(dict); + } + } } - Status AppendValue(PyObject* obj) override { - Decimal128 value; - RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *decimal_type_, &value)); - return this->typed_builder_->Append(value); + Status AppendItems(PyObject* items) { + if (!PySequence_Check(items)) { + return internal::InvalidType(items, "was expecting a sequence of key-value items"); + } + switch (key_kind_) { + case KeyKind::UNICODE: + return AppendItems(items, unicode_field_names_.obj()); + case KeyKind::BYTES: + return AppendItems(items, bytes_field_names_.obj()); + default: + RETURN_NOT_OK(InferKeyKind(items)); + if (key_kind_ == KeyKind::UNKNOWN) { + // was unable to infer the type which means that all keys are absent + return AppendEmpty(); + } else { + return AppendItems(items); + } + } } - private: - std::shared_ptr decimal_type_; -}; - -#define PRIMITIVE(TYPE_ENUM, TYPE) \ - case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new PrimitiveConverter); \ - break; - -#define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE_CLASS) \ - case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new TYPE_CLASS); \ - break; - -// Dynamic constructor for sequence converters -template -Status GetConverterFlat(const std::shared_ptr& type, bool strict_conversions, - bool ignore_timezone, std::unique_ptr* out) { - switch (type->id()) { - SIMPLE_CONVERTER_CASE(NA, NullConverter); - PRIMITIVE(BOOL, BooleanType); - PRIMITIVE(INT8, Int8Type); - PRIMITIVE(INT16, Int16Type); - PRIMITIVE(INT32, Int32Type); - PRIMITIVE(INT64, Int64Type); - PRIMITIVE(UINT8, UInt8Type); - PRIMITIVE(UINT16, UInt16Type); - PRIMITIVE(UINT32, UInt32Type); - PRIMITIVE(UINT64, UInt64Type); - PRIMITIVE(HALF_FLOAT, HalfFloatType); - PRIMITIVE(FLOAT, FloatType); - PRIMITIVE(DOUBLE, DoubleType); - PRIMITIVE(DATE32, Date32Type); - PRIMITIVE(DATE64, Date64Type); - SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); - case Type::BINARY: - *out = - std::unique_ptr(new BinaryConverter()); - break; - case Type::LARGE_BINARY: - *out = std::unique_ptr( - new BinaryConverter()); - break; - case Type::FIXED_SIZE_BINARY: - *out = std::unique_ptr(new FixedSizeBinaryConverter( - checked_cast(*type).byte_width())); - break; - case Type::STRING: - if (strict_conversions) { - *out = std::unique_ptr( - new StringConverter()); - } else { - *out = std::unique_ptr( - new StringConverter()); - } - break; - case Type::LARGE_STRING: - if (strict_conversions) { - *out = std::unique_ptr( - new StringConverter()); - } else { - *out = std::unique_ptr( - new StringConverter()); + Status AppendDict(PyObject* dict, PyObject* field_names) { + // NOTE we're ignoring any extraneous dict items + for (int i = 0; i < num_fields_; i++) { + PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed + PyObject* value = PyDict_GetItem(dict, name); // borrowed + if (value == NULL) { + RETURN_IF_PYERROR(); } - break; - case Type::TIME32: { - auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TimeConverter(unit, ignore_timezone)); - break; + RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None)); } - case Type::TIME64: { - auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TimeConverter(unit, ignore_timezone)); - break; - } - case Type::TIMESTAMP: { - auto unit = checked_cast(*type).unit(); - *out = std::unique_ptr( - new TemporalConverter(unit, ignore_timezone)); - break; - } - case Type::DURATION: { - auto unit = checked_cast(*type).unit(); - *out = - std::unique_ptr(new TemporalConverter( - unit, /*ignore_timezone=*/false)); - break; - } - default: - return Status::NotImplemented("Sequence converter for type ", type->ToString(), - " not implemented"); + return Status::OK(); } - return Status::OK(); -} -Status GetConverter(const std::shared_ptr& type, bool from_pandas, - bool strict_conversions, bool ignore_timezone, - std::unique_ptr* out) { - if (from_pandas) { - // ARROW-842: If pandas is not installed then null checks will be less - // comprehensive, but that is okay. - internal::InitPandasStaticData(); + Result> GetKeyValuePair(PyObject* seq, int index) { + PyObject* pair = PySequence_GetItem(seq, index); + RETURN_IF_PYERROR(); + if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) { + return internal::InvalidType(pair, "was expecting tuple of (key, value) pair"); + } + PyObject* key = PyTuple_GetItem(pair, 0); + RETURN_IF_PYERROR(); + PyObject* value = PyTuple_GetItem(pair, 1); + RETURN_IF_PYERROR(); + return std::make_pair(key, value); } - switch (type->id()) { - case Type::LIST: - if (from_pandas) { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::LARGE_LIST: - if (from_pandas) { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr( - new ListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::MAP: - if (from_pandas) { - *out = - std::unique_ptr(new MapConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr(new MapConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::FIXED_SIZE_LIST: - if (from_pandas) { - *out = std::unique_ptr( - new FixedSizeListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } else { - *out = std::unique_ptr( - new FixedSizeListConverter( - from_pandas, strict_conversions, ignore_timezone)); - } - return Status::OK(); - case Type::STRUCT: - if (from_pandas) { - *out = std::unique_ptr( - new StructConverter( - from_pandas, strict_conversions, ignore_timezone)); + Status AppendItems(PyObject* items, PyObject* field_names) { + auto length = static_cast(PySequence_Size(items)); + RETURN_IF_PYERROR(); + + // append the values for the defined fields + for (int i = 0; i < std::min(num_fields_, length); i++) { + // retrieve the key-value pair + ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); + + // validate that the key and the field name are equal + PyObject* name = PyList_GET_ITEM(field_names, i); + bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ); + RETURN_IF_PYERROR(); + + // finally append to the respective child builder + if (are_equal) { + RETURN_NOT_OK(this->children_[i]->Append(pair.second)); } else { - *out = std::unique_ptr(new StructConverter( - from_pandas, strict_conversions, ignore_timezone)); + ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first)); + ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name)); + return Status::Invalid("The expected field name is `", name_view.bytes, "` but `", + key_view.bytes, "` was given"); } - return Status::OK(); - default: - break; - } - - if (from_pandas) { - RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, - ignore_timezone, out)); - } else { - RETURN_NOT_OK(GetConverterFlat(type, strict_conversions, - ignore_timezone, out)); + } + // insert null values for missing fields + for (int i = length; i < num_fields_; i++) { + RETURN_NOT_OK(this->children_[i]->AppendNull()); + } + return Status::OK(); } - return Status::OK(); -} -// ---------------------------------------------------------------------- + // Whether we're converting from a sequence of dicts or tuples or list of pairs + enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN; + // Whether the input dictionary keys' type is python bytes or unicode + enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN; + // Store the field names as a PyObjects for dict matching + OwnedRef bytes_field_names_; + OwnedRef unicode_field_names_; + // Store the number of fields for later reuse + int num_fields_; +}; // Convert *obj* to a sequence if necessary // Fill *size* to its length. If >= 0 on entry, *size* is an upper size @@ -1352,64 +996,52 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } -Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr* out) { +Result> ConvertPySequence(PyObject* obj, PyObject* mask, + PyConversionOptions options, + MemoryPool* pool) { PyAcquireGIL lock; PyObject* seq; OwnedRef tmp_seq_nanny; - std::shared_ptr real_type; - int64_t size = options.size; - RETURN_NOT_OK(ConvertToSequenceAndInferSize(sequence_source, &seq, &size)); + RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); tmp_seq_nanny.reset(seq); // In some cases, type inference may be "loose", like strings. If the user // passed pa.string(), then we will error if we encounter any non-UTF8 // value. If not, then we will allow the result to be a BinaryArray - bool strict_conversions = false; - if (options.type == nullptr) { - RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); - if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { - const auto& ts_type = checked_cast(*real_type); - real_type = timestamp(ts_type.unit()); - } + ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); + options.strict = false; } else { - real_type = options.type; - strict_conversions = true; + options.strict = true; } DCHECK_GE(size, 0); - // Create the sequence converter, initialize with the builder - std::unique_ptr converter; - RETURN_NOT_OK(GetConverter(real_type, options.from_pandas, strict_conversions, - options.ignore_timezone, &converter)); - - // Create ArrayBuilder for type, then pass into the SeqConverter - // instance. The reason this is created here rather than in GetConverter is - // because of nested types (child SeqConverter objects need the child - // builders created by MakeBuilder) - std::unique_ptr type_builder; - RETURN_NOT_OK(MakeBuilder(options.pool, real_type, &type_builder)); - RETURN_NOT_OK(converter->Init(type_builder.get())); - - // Convert values - if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); + ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( + options.type, options, pool))); + if (converter->may_overflow()) { + // The converter hierarchy contains binary- or list-like builders which can overflow + // depending on the input values. Wrap the converter with a chunker which detects + // the overflow and automatically creates new chunks. + ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter))); + if (mask != nullptr && mask != Py_None) { + RETURN_NOT_OK(ExtendMasked(chunked_converter.get(), seq, mask, size)); + } else { + RETURN_NOT_OK(Extend(chunked_converter.get(), seq, size)); + } + return chunked_converter->ToChunkedArray(); } else { - RETURN_NOT_OK(converter->Extend(seq, size)); + // If the converter can't overflow spare the capacity error checking on the hot-path, + // this improves the performance roughly by ~10 for primitive types. + if (mask != nullptr && mask != Py_None) { + RETURN_NOT_OK(ExtendMasked(converter.get(), seq, mask, size)); + } else { + RETURN_NOT_OK(Extend(converter.get(), seq, size)); + } + return converter->ToChunkedArray(); } - - // Retrieve result. Conversion may yield one or more array values - return converter->GetResult(out); -} - -Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, - std::shared_ptr* out) { - return ConvertPySequence(obj, nullptr, options, out); } } // namespace py diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index 5108e752e8f..d167996ba8d 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -39,20 +39,17 @@ class Status; namespace py { struct PyConversionOptions { - PyConversionOptions() : type(NULLPTR), size(-1), pool(NULLPTR), from_pandas(false) {} + PyConversionOptions() = default; PyConversionOptions(const std::shared_ptr& type, int64_t size, MemoryPool* pool, bool from_pandas) - : type(type), size(size), pool(default_memory_pool()), from_pandas(from_pandas) {} + : type(type), size(size), from_pandas(from_pandas) {} // Set to null if to be inferred std::shared_ptr type; - // Default is -1: infer from data - int64_t size; - - // Memory pool to use for allocations - MemoryPool* pool; + // Default is -1, which indicates the size should the same as the input sequence + int64_t size = -1; bool from_pandas = false; @@ -60,6 +57,8 @@ struct PyConversionOptions { /// timezone bugs (see ARROW-9528). Should be removed /// after Arrow 2.0 release. bool ignore_timezone = false; + + bool strict = false; }; /// \brief Convert sequence (list, generator, NumPy array with dtype object) of @@ -69,16 +68,13 @@ struct PyConversionOptions { /// values in the sequence are null (true) or not null (false). This parameter /// may be null /// \param[in] options various conversion options -/// \param[out] out a ChunkedArray containing one or more chunks -/// \return Status +/// \param[in] pool MemoryPool to use for allocations +/// \return Result ChunkedArray ARROW_PYTHON_EXPORT -Status ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr* out); - -ARROW_PYTHON_EXPORT -Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, - std::shared_ptr* out); +Result> ConvertPySequence( + PyObject* obj, PyObject* mask, PyConversionOptions options, + MemoryPool* pool = default_memory_pool()); } // namespace py + } // namespace arrow diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index cefa97abeea..7b91c24f63c 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -37,6 +37,7 @@ #include "arrow/ipc/writer.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/result.h" #include "arrow/tensor.h" #include "arrow/util/logging.h" @@ -482,8 +483,7 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size)); RETURN_NOT_OK(builder->AppendBytes(data, size)); } else if (PyUnicode_Check(elem)) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(elem)); + ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromUnicode(elem)); int32_t size = -1; RETURN_NOT_OK(internal::CastSize(view.size, &size)); RETURN_NOT_OK(builder->AppendString(view.bytes, size)); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 4a007dd8782..946d3bfe44f 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -428,8 +428,8 @@ struct ARROW_EXPORT DictionaryScalar : public Scalar { explicit DictionaryScalar(std::shared_ptr type); - DictionaryScalar(ValueType value, std::shared_ptr type) - : Scalar(std::move(type), true), value(std::move(value)) {} + DictionaryScalar(ValueType value, std::shared_ptr type, bool is_valid = true) + : Scalar(std::move(type), is_valid), value(std::move(value)) {} Result> GetEncodedValue() const; }; diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 4171164899d..dc8708f689e 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -627,6 +627,8 @@ TEST(TestDictionaryScalar, Basics) { gamma.dictionary = dict; auto scalar_null = MakeNullScalar(ty); + checked_cast(*scalar_null).value.dictionary = dict; + auto scalar_alpha = DictionaryScalar(alpha, ty); auto scalar_gamma = DictionaryScalar(gamma, ty); @@ -654,6 +656,12 @@ TEST(TestDictionaryScalar, Basics) { ASSERT_TRUE(first->Equals(scalar_gamma)); ASSERT_TRUE(second->Equals(scalar_alpha)); ASSERT_TRUE(last->Equals(scalar_null)); + + auto first_dict_scalar = checked_cast(*first); + ASSERT_TRUE(first_dict_scalar.value.dictionary->Equals(arr.dictionary())); + + auto second_dict_scalar = checked_cast(*second); + ASSERT_TRUE(second_dict_scalar.value.dictionary->Equals(arr.dictionary())); } } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 6592af39557..31f1c1bc7f8 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -254,7 +254,7 @@ ARROW_TESTING_EXPORT void TestInitialized(const Array& array); template void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr* out) { - ASSERT_OK(builder->Finish(out)); + ASSERT_OK_AND_ASSIGN(*out, builder->Finish()); AssertZeroPadded(**out); TestInitialized(**out); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index f95edb0c896..d2abe573cd5 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -530,12 +530,20 @@ using enable_if_base_binary = enable_if_t::value, R>; // Any binary excludes string from Base binary template -using is_any_binary_type = +using is_binary_type = std::integral_constant::value || std::is_same::value>; template -using enable_if_any_binary = enable_if_t::value, R>; +using enable_if_binary = enable_if_t::value, R>; + +template +using is_string_type = + std::integral_constant::value || + std::is_same::value>; + +template +using enable_if_string = enable_if_t::value, R>; template using is_string_like_type = @@ -544,6 +552,9 @@ using is_string_like_type = template using enable_if_string_like = enable_if_t::value, R>; +template +using enable_if_same = enable_if_t::value, R>; + // Note that this also includes DecimalType template using is_fixed_size_binary_type = std::is_base_of; @@ -574,6 +585,9 @@ using is_nested_type = std::is_base_of; template using enable_if_nested = enable_if_t::value, R>; +template +using enable_if_not_nested = enable_if_t::value, R>; + template using is_var_length_list_type = std::integral_constant::value || @@ -596,6 +610,15 @@ using is_fixed_size_list_type = std::is_same; template using enable_if_fixed_size_list = enable_if_t::value, R>; +template +using is_list_type = + std::integral_constant::value || + std::is_same::value || + std::is_same::valuae>; + +template +using enable_if_list_type = enable_if_t::value, R>; + template using is_list_like_type = std::integral_constant::value || @@ -654,6 +677,18 @@ using is_interval_type = std::is_base_of; template using enable_if_interval = enable_if_t::value, R>; +template +using is_dictionary_type = std::is_base_of; + +template +using enable_if_dictionary = enable_if_t::value, R>; + +template +using is_extension_type = std::is_base_of; + +template +using enable_if_extension = enable_if_t::value, R>; + // Attribute differentiation template @@ -670,8 +705,12 @@ template using enable_if_has_c_type = enable_if_t::value, R>; template -using has_string_view = std::integral_constant::value || - is_string_like_type::value>; +using has_string_view = + std::integral_constant::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value>; template using enable_if_has_string_view = enable_if_t::value, R>; diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h new file mode 100644 index 00000000000..d60d8d056fa --- /dev/null +++ b/cpp/src/arrow/util/converter.h @@ -0,0 +1,324 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/make_unique.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace internal { + +template class ConverterTrait> +static Result> MakeConverter( + std::shared_ptr type, typename BaseConverter::OptionsType options, + MemoryPool* pool); + +template +class Converter { + public: + using Self = Converter; + using InputType = Input; + using OptionsType = Options; + + virtual ~Converter() = default; + + Status Construct(std::shared_ptr type, OptionsType options, + MemoryPool* pool) { + type_ = std::move(type); + options_ = std::move(options); + return Init(pool); + } + + virtual Status Append(InputType value) = 0; + + const std::shared_ptr& builder() const { return builder_; } + + const std::shared_ptr& type() const { return type_; } + + OptionsType options() const { return options_; } + + bool may_overflow() const { return may_overflow_; } + + virtual Status Reserve(int64_t additional_capacity) { + return builder_->Reserve(additional_capacity); + } + + Status AppendNull() { return builder_->AppendNull(); } + + virtual Result> ToArray() { return builder_->Finish(); } + + virtual Result> ToArray(int64_t length) { + ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray()); + return arr->Slice(0, length); + } + + virtual Result> ToChunkedArray() { + ARROW_ASSIGN_OR_RAISE(auto array, ToArray()); + std::vector> chunks = {std::move(array)}; + return std::make_shared(chunks); + } + + protected: + virtual Status Init(MemoryPool* pool) { return Status::OK(); } + + std::shared_ptr type_; + std::shared_ptr builder_; + OptionsType options_; + bool may_overflow_ = false; +}; + +template +class PrimitiveConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + + protected: + Status Init(MemoryPool* pool) override { + this->builder_ = std::make_shared(this->type_, pool); + this->may_overflow_ = + is_base_binary_like(this->type_->id()) || is_fixed_size_binary(this->type_->id()); + primitive_type_ = checked_cast(this->type_.get()); + primitive_builder_ = checked_cast(this->builder_.get()); + return Status::OK(); + } + + const ArrowType* primitive_type_; + BuilderType* primitive_builder_; +}; + +template class ConverterTrait> +class ListConverter : public BaseConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; + using ConverterType = typename ConverterTrait::type; + + protected: + Status Init(MemoryPool* pool) override { + list_type_ = checked_cast(this->type_.get()); + ARROW_ASSIGN_OR_RAISE(value_converter_, + (MakeConverter( + list_type_->value_type(), this->options_, pool))); + this->builder_ = + std::make_shared(pool, value_converter_->builder(), this->type_); + list_builder_ = checked_cast(this->builder_.get()); + this->may_overflow_ = true; + return Status::OK(); + } + + const ArrowType* list_type_; + BuilderType* list_builder_; + std::unique_ptr value_converter_; +}; + +template class ConverterTrait> +class StructConverter : public BaseConverter { + public: + using ConverterType = typename ConverterTrait::type; + + Status Reserve(int64_t additional_capacity) override { + ARROW_RETURN_NOT_OK(this->builder_->Reserve(additional_capacity)); + for (const auto& child : children_) { + ARROW_RETURN_NOT_OK(child->Reserve(additional_capacity)); + } + return Status::OK(); + } + + protected: + Status Init(MemoryPool* pool) override { + std::unique_ptr child_converter; + std::vector> child_builders; + + struct_type_ = checked_cast(this->type_.get()); + for (const auto& field : struct_type_->fields()) { + ARROW_ASSIGN_OR_RAISE(child_converter, + (MakeConverter( + field->type(), this->options_, pool))); + this->may_overflow_ |= child_converter->may_overflow(); + child_builders.push_back(child_converter->builder()); + children_.push_back(std::move(child_converter)); + } + + this->builder_ = + std::make_shared(this->type_, pool, std::move(child_builders)); + struct_builder_ = checked_cast(this->builder_.get()); + + return Status::OK(); + } + + const StructType* struct_type_; + StructBuilder* struct_builder_; + std::vector> children_; +}; + +template +class DictionaryConverter : public BaseConverter { + public: + using BuilderType = DictionaryBuilder; + + protected: + Status Init(MemoryPool* pool) override { + std::unique_ptr builder; + ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder)); + this->builder_ = std::move(builder); + this->may_overflow_ = false; + dict_type_ = checked_cast(this->type_.get()); + value_type_ = checked_cast(dict_type_->value_type().get()); + value_builder_ = checked_cast(this->builder_.get()); + return Status::OK(); + } + + const DictionaryType* dict_type_; + const ValueType* value_type_; + BuilderType* value_builder_; +}; + +template class ConverterTrait> +struct MakeConverterImpl { + template ::type> + Status Visit(const T&) { + out.reset(new ConverterType()); + return out->Construct(std::move(type), std::move(options), pool); + } + + Status Visit(const DictionaryType& t) { + switch (t.value_type()->id()) { +#define DICTIONARY_CASE(TYPE) \ + case TYPE::type_id: \ + out = internal::make_unique< \ + typename ConverterTrait::template dictionary_type>(); \ + break; + DICTIONARY_CASE(BooleanType); + DICTIONARY_CASE(Int8Type); + DICTIONARY_CASE(Int16Type); + DICTIONARY_CASE(Int32Type); + DICTIONARY_CASE(Int64Type); + DICTIONARY_CASE(UInt8Type); + DICTIONARY_CASE(UInt16Type); + DICTIONARY_CASE(UInt32Type); + DICTIONARY_CASE(UInt64Type); + DICTIONARY_CASE(FloatType); + DICTIONARY_CASE(DoubleType); + DICTIONARY_CASE(BinaryType); + DICTIONARY_CASE(StringType); + DICTIONARY_CASE(FixedSizeBinaryType); +#undef DICTIONARY_CASE + default: + return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(), + " not implemented"); + } + return out->Construct(std::move(type), std::move(options), pool); + } + + Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); } + + std::shared_ptr type; + typename BaseConverter::OptionsType options; + MemoryPool* pool; + std::unique_ptr out; +}; + +template class ConverterTrait> +static Result> MakeConverter( + std::shared_ptr type, typename BaseConverter::OptionsType options, + MemoryPool* pool) { + MakeConverterImpl visitor{ + std::move(type), std::move(options), pool, NULLPTR}; + ARROW_RETURN_NOT_OK(VisitTypeInline(*visitor.type, &visitor)); + return std::move(visitor.out); +} + +template +class Chunker { + public: + using InputType = typename Converter::InputType; + + explicit Chunker(std::unique_ptr converter) + : converter_(std::move(converter)) {} + + Status Reserve(int64_t additional_capacity) { + ARROW_RETURN_NOT_OK(converter_->Reserve(additional_capacity)); + reserved_ += additional_capacity; + return Status::OK(); + } + + Status AppendNull() { + auto status = converter_->AppendNull(); + if (ARROW_PREDICT_FALSE(status.IsCapacityError())) { + ARROW_RETURN_NOT_OK(FinishChunk()); + return converter_->AppendNull(); + } + ++length_; + return status; + } + + Status Append(InputType value) { + auto status = converter_->Append(value); + if (ARROW_PREDICT_FALSE(status.IsCapacityError())) { + ARROW_RETURN_NOT_OK(FinishChunk()); + return Append(value); + } + ++length_; + return status; + } + + Status FinishChunk() { + ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_)); + chunks_.push_back(chunk); + // reserve space for the remaining items, besides being an optimization it is also + // required if the converter's implementation relies on unsafe builder methods in + // conveter->Append() + auto remaining = reserved_ - length_; + Reset(); + return Reserve(remaining); + } + + Result> ToChunkedArray() { + ARROW_RETURN_NOT_OK(FinishChunk()); + return std::make_shared(chunks_); + } + + protected: + void Reset() { + converter_->builder()->Reset(); + length_ = 0; + reserved_ = 0; + } + + int64_t length_ = 0; + int64_t reserved_ = 0; + std::unique_ptr converter_; + std::vector> chunks_; +}; + +template +static Result>> MakeChunker(std::unique_ptr converter) { + return internal::make_unique>(std::move(converter)); +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 639421cefe9..f1c4b1e6318 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -842,6 +842,11 @@ struct HashTraits::value && using MemoTableType = BinaryMemoTable; }; +template <> +struct HashTraits { + using MemoTableType = BinaryMemoTable; +}; + template struct HashTraits::value>> { using MemoTableType = BinaryMemoTable; diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index a9f1be221f9..c5c06cec031 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -21,8 +21,10 @@ import warnings cdef _sequence_to_array(object sequence, object mask, object size, DataType type, CMemoryPool* pool, c_bool from_pandas): - cdef int64_t c_size - cdef PyConversionOptions options + cdef: + int64_t c_size + PyConversionOptions options + shared_ptr[CChunkedArray] chunked if type is not None: options.type = type.sp_type @@ -30,19 +32,18 @@ cdef _sequence_to_array(object sequence, object mask, object size, if size is not None: options.size = size - options.pool = pool options.from_pandas = from_pandas options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) - cdef shared_ptr[CChunkedArray] out - with nogil: - check_status(ConvertPySequence(sequence, mask, options, &out)) + chunked = GetResultValue( + ConvertPySequence(sequence, mask, options, pool) + ) - if out.get().num_chunks() == 1: - return pyarrow_wrap_array(out.get().chunk(0)) + if chunked.get().num_chunks() == 1: + return pyarrow_wrap_array(chunked.get().chunk(0)) else: - return pyarrow_wrap_chunked_array(out) + return pyarrow_wrap_chunked_array(chunked) cdef inline _is_array_like(obj): @@ -158,28 +159,52 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, Notes ----- Localized timestamps will currently be returned as UTC (pandas's native - representation). Timezone-naive data will be implicitly interpreted as + representation). Timezone-naive data will be implicitly interpreted as UTC. + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. + Examples -------- >>> import pandas as pd >>> import pyarrow as pa >>> pa.array(pd.Series([1, 2])) - + [ 1, 2 ] + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + >>> import numpy as np - >>> pa.array(pd.Series([1, 2]), np.array([0, 1], - ... dtype=bool)) - + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + [ 1, null ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) """ cdef: CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -460,7 +485,7 @@ def infer_type(values, mask=None, from_pandas=False): if mask is not None and not isinstance(mask, np.ndarray): mask = np.array(mask, dtype=bool) - check_status(InferArrowType(values, mask, use_pandas_sentinels, &out)) + out = GetResultValue(InferArrowType(values, mask, use_pandas_sentinels)) return pyarrow_wrap_data_type(out) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9a3451e4f82..d1c4110cb40 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -969,12 +969,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: vector[shared_ptr[CScalar]] value CResult[shared_ptr[CScalar]] field(CFieldRef ref) const - cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): - cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType": - shared_ptr[CScalar] index - shared_ptr[CArray] dictionary + cdef cppclass CDictionaryScalarIndexAndDictionary \ + "arrow::DictionaryScalar::ValueType": + shared_ptr[CScalar] index + shared_ptr[CArray] dictionary - CDictionaryValue value + cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar): + CDictionaryScalar(CDictionaryScalarIndexAndDictionary value, + shared_ptr[CDataType], c_bool is_valid) + CDictionaryScalarIndexAndDictionary value CResult[shared_ptr[CScalar]] GetEncodedValue() cdef cppclass CUnionScalar" arrow::UnionScalar"(CScalar): @@ -1746,9 +1749,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef extern from "arrow/python/api.h" namespace "arrow::py": # Requires GIL - CStatus InferArrowType(object obj, object mask, - c_bool pandas_null_sentinels, - shared_ptr[CDataType]* out_type) + CResult[shared_ptr[CDataType]] InferArrowType( + object obj, object mask, c_bool pandas_null_sentinels) cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: @@ -1764,12 +1766,13 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CMemoryPool* pool c_bool from_pandas c_bool ignore_timezone + c_bool strict # TODO Some functions below are not actually "nogil" - CStatus ConvertPySequence(object obj, object mask, - const PyConversionOptions& options, - shared_ptr[CChunkedArray]* out) + CResult[shared_ptr[CChunkedArray]] ConvertPySequence( + object obj, object mask, const PyConversionOptions& options, + CMemoryPool* pool) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index cc06ca6a2f9..3e72d060d69 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. - import collections @@ -687,6 +686,50 @@ cdef class DictionaryScalar(Scalar): Concrete class for dictionary-encoded scalars. """ + @classmethod + def _reconstruct(cls, type, is_valid, index, dictionary): + cdef: + CDictionaryScalarIndexAndDictionary value + shared_ptr[CDictionaryScalar] wrapped + DataType type_ + Scalar index_ + Array dictionary_ + + type_ = ensure_type(type, allow_none=False) + if not isinstance(type_, DictionaryType): + raise TypeError('Must pass a DictionaryType instance') + + if isinstance(index, Scalar): + if not index.type.equals(type.index_type): + raise TypeError("The Scalar value passed as index must have " + "identical type to the dictionary type's " + "index_type") + index_ = index + else: + index_ = scalar(index, type=type_.index_type) + + if isinstance(dictionary, Array): + if not dictionary.type.equals(type.value_type): + raise TypeError("The Array passed as dictionary must have " + "identical type to the dictionary type's " + "value_type") + dictionary_ = dictionary + else: + dictionary_ = array(dictionary, type=type_.value_type) + + value.index = pyarrow_unwrap_scalar(index_) + value.dictionary = pyarrow_unwrap_array(dictionary_) + + wrapped = make_shared[CDictionaryScalar]( + value, pyarrow_unwrap_data_type(type_), (is_valid) + ) + return Scalar.wrap( wrapped) + + def __reduce__(self): + return DictionaryScalar._reconstruct, ( + self.type, self.is_valid, self.index, self.dictionary + ) + @property def index(self): """ @@ -831,14 +874,15 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): shared_ptr[CArray] array shared_ptr[CChunkedArray] chunked bint is_pandas_object = False + CMemoryPool* pool type = ensure_type(type, allow_none=True) + pool = maybe_unbox_memory_pool(memory_pool) if _is_array_like(value): value = get_values(value, &is_pandas_object) options.size = 1 - options.pool = maybe_unbox_memory_pool(memory_pool) if type is not None: ty = ensure_type(type) @@ -851,9 +895,12 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): value = [value] with nogil: - check_status(ConvertPySequence(value, None, options, &chunked)) + chunked = GetResultValue(ConvertPySequence(value, None, options, pool)) + # get the first chunk assert chunked.get().num_chunks() == 1 array = chunked.get().chunk(0) + + # retrieve the scalar from the first position scalar = GetResultValue(array.get().GetScalar(0)) return Scalar.wrap(scalar) diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 088f29185bd..97e972d84d5 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import datetime + import pytz import hypothesis as h import hypothesis.strategies as st @@ -82,10 +84,16 @@ unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=tzst.timezones() ) -duration_types = st.sampled_from([ - pa.duration(unit) for unit in ['s', 'ms', 'us', 'ns']]) +duration_types = st.builds( + pa.duration, + st.sampled_from(['s', 'ms', 'us', 'ns']) +) temporal_types = st.one_of( - date_types, time_types, timestamp_types, duration_types) + date_types, + time_types, + timestamp_types, + duration_types +) primitive_types = st.one_of( null_type, @@ -101,9 +109,16 @@ metadata = st.dictionaries(st.text(), st.text()) -def fields(type_strategy=primitive_types): - return st.builds(pa.field, name=custom_text, type=type_strategy, - nullable=st.booleans(), metadata=metadata) +@st.composite +def fields(draw, type_strategy=primitive_types): + name = draw(custom_text) + typ = draw(type_strategy) + if pa.types.is_null(typ): + nullable = True + else: + nullable = draw(st.booleans()) + meta = draw(metadata) + return pa.field(name, type=typ, nullable=nullable, metadata=meta) def list_types(item_strategy=primitive_types): @@ -152,8 +167,10 @@ def schemas(type_strategy=primitive_types, max_fields=None): @st.composite def arrays(draw, type, size=None): if isinstance(type, st.SearchStrategy): - type = draw(type) - elif not isinstance(type, pa.DataType): + ty = draw(type) + elif isinstance(type, pa.DataType): + ty = type + else: raise TypeError('Type must be a pyarrow DataType') if isinstance(size, st.SearchStrategy): @@ -165,57 +182,67 @@ def arrays(draw, type, size=None): shape = (size,) - if pa.types.is_list(type) or pa.types.is_large_list(type): + if pa.types.is_list(ty) or pa.types.is_large_list(ty): offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20 offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero - values = draw(arrays(type.value_type, size=int(offsets.sum()))) - array_type = ( - pa.LargeListArray if pa.types.is_large_list(type) - else pa.ListArray) + values = draw(arrays(ty.value_type, size=int(offsets.sum()))) + if pa.types.is_large_list(ty): + array_type = pa.LargeListArray + else: + array_type = pa.ListArray return array_type.from_arrays(offsets, values) - if pa.types.is_struct(type): - h.assume(len(type) > 0) + if pa.types.is_struct(ty): + h.assume(len(ty) > 0) fields, child_arrays = [], [] - for field in type: + for field in ty: fields.append(field) child_arrays.append(draw(arrays(field.type, size=size))) return pa.StructArray.from_arrays(child_arrays, fields=fields) - if (pa.types.is_boolean(type) or pa.types.is_integer(type) or - pa.types.is_floating(type)): - values = npst.arrays(type.to_pandas_dtype(), shape=(size,)) + if (pa.types.is_boolean(ty) or pa.types.is_integer(ty) or + pa.types.is_floating(ty)): + values = npst.arrays(ty.to_pandas_dtype(), shape=(size,)) np_arr = draw(values) - if pa.types.is_floating(type): + if pa.types.is_floating(ty): # Workaround ARROW-4952: no easy way to assert array equality # in a NaN-tolerant way. np_arr[np.isnan(np_arr)] = -42.0 - return pa.array(np_arr, type=type) + return pa.array(np_arr, type=ty) - if pa.types.is_null(type): + if pa.types.is_null(ty): value = st.none() - elif pa.types.is_time(type): + elif pa.types.is_time(ty): value = st.times() - elif pa.types.is_date(type): + elif pa.types.is_date(ty): value = st.dates() - elif pa.types.is_timestamp(type): - tz = pytz.timezone(type.tz) if type.tz is not None else None - value = st.datetimes(timezones=st.just(tz)) - elif pa.types.is_duration(type): + elif pa.types.is_timestamp(ty): + min_int64 = -(2**63) + max_int64 = 2**63 - 1 + min_datetime = datetime.datetime.fromtimestamp(min_int64 / 10**9) + max_datetime = datetime.datetime.fromtimestamp(max_int64 / 10**9) + try: + offset_hours = int(ty.tz) + tz = pytz.FixedOffset(offset_hours * 60) + except ValueError: + tz = pytz.timezone(ty.tz) + value = st.datetimes(timezones=st.just(tz), min_value=min_datetime, + max_value=max_datetime) + elif pa.types.is_duration(ty): value = st.timedeltas() - elif pa.types.is_binary(type) or pa.types.is_large_binary(type): + elif pa.types.is_binary(ty) or pa.types.is_large_binary(ty): value = st.binary() - elif pa.types.is_string(type) or pa.types.is_large_string(type): + elif pa.types.is_string(ty) or pa.types.is_large_string(ty): value = st.text() - elif pa.types.is_decimal(type): + elif pa.types.is_decimal(ty): # TODO(kszucs): properly limit the precision # value = st.decimals(places=type.scale, allow_infinity=False) h.reject() else: - raise NotImplementedError(type) + raise NotImplementedError(ty) values = st.lists(value, min_size=size, max_size=size) - return pa.array(draw(values), type=type) + return pa.array(draw(values), type=ty) @st.composite diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index feb84dd1d4c..13a167fd311 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2513,9 +2513,20 @@ def test_numpy_binary_overflow_to_chunked(): @pytest.mark.large_memory def test_list_child_overflow_to_chunked(): - vals = [['x' * 1024]] * ((2 << 20) + 1) - with pytest.raises(ValueError, match="overflowed"): - pa.array(vals) + kilobyte_string = 'x' * 1024 + two_mega = 2**21 + + vals = [[kilobyte_string]] * (two_mega - 1) + arr = pa.array(vals) + assert isinstance(arr, pa.Array) + assert len(arr) == two_mega - 1 + + vals = [[kilobyte_string]] * two_mega + arr = pa.array(vals) + assert isinstance(arr, pa.ChunkedArray) + assert len(arr) == two_mega + assert len(arr.chunk(0)) == two_mega - 1 + assert len(arr.chunk(1)) == 1 def test_infer_type_masked(): diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index b8050f96468..f4894a4226a 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. -import pytest - -from pyarrow.pandas_compat import _pandas_api # noqa -import pyarrow as pa - import collections import datetime import decimal import itertools import math +import hypothesis as h import numpy as np import pytz +import pytest + +from pyarrow.pandas_compat import _pandas_api # noqa +import pyarrow as pa +import pyarrow.tests.strategies as past int_type_pairs = [ @@ -381,7 +382,7 @@ def test_sequence_custom_integers(seq): @parametrize_with_iterable_types def test_broken_integers(seq): data = [MyBrokenInt()] - with pytest.raises(pa.ArrowInvalid): + with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"): pa.array(seq(data), type=pa.int64()) @@ -887,6 +888,7 @@ def test_sequence_timestamp(): @pytest.mark.parametrize('timezone', [ None, 'UTC', + 'Etc/GMT-1', 'Europe/Budapest', ]) @pytest.mark.parametrize('unit', [ @@ -960,6 +962,38 @@ def expected_datetime_value(dt): assert arr[i].as_py() == expected_datetime_value(utcdata[i]) +@pytest.mark.parametrize('timezone', [ + None, + 'UTC', + 'Etc/GMT-1', + 'Europe/Budapest', +]) +def test_pyarrow_ignore_timezone_environment_variable(monkeypatch, timezone): + # note that any non-empty value will evaluate to true + monkeypatch.setenv("PYARROW_IGNORE_TIMEZONE", "1") + data = [ + datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive + pytz.utc.localize( + datetime.datetime(2008, 1, 5, 5, 0, 0, 1000) + ), + pytz.timezone('US/Eastern').localize( + datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) + ), + pytz.timezone('Europe/Moscow').localize( + datetime.datetime(2010, 8, 13, 5, 0, 0, 437699) + ), + ] + + expected = [dt.replace(tzinfo=None) for dt in data] + if timezone is not None: + tzinfo = pytz.timezone(timezone) + expected = [tzinfo.fromutc(dt) for dt in expected] + + ty = pa.timestamp('us', tz=timezone) + arr = pa.array(data, type=ty) + assert arr.to_pylist() == expected + + def test_sequence_timestamp_with_timezone_inference(): data = [ datetime.datetime(2007, 7, 13, 8, 23, 34, 123456), # naive @@ -1513,6 +1547,108 @@ def test_struct_from_tuples(): pa.array([tup], type=ty) +def test_struct_from_list_of_pairs(): + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) + data = [ + [('a', 5), ('b', 'foo'), ('c', True)], + [('a', 6), ('b', 'bar'), ('c', False)], + None + ] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == [ + {'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False}, + None + ] + + # test with duplicated field names + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('a', pa.string()), + pa.field('b', pa.bool_()) + ]) + data = [ + [('a', 5), ('a', 'foo'), ('b', True)], + [('a', 6), ('a', 'bar'), ('b', False)], + ] + arr = pa.array(data, type=ty) + with pytest.raises(KeyError): + # TODO(kszucs): ARROW-9997 + arr.to_pylist() + + # test with empty elements + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) + data = [ + [], + [('a', 5), ('b', 'foo'), ('c', True)], + [('a', 2), ('b', 'baz')], + [('a', 1), ('b', 'bar'), ('c', False), ('d', 'julia')], + ] + expected = [ + {'a': None, 'b': None, 'c': None}, + {'a': 5, 'b': 'foo', 'c': True}, + {'a': 2, 'b': 'baz', 'c': None}, + {'a': 1, 'b': 'bar', 'c': False}, + ] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == expected + + +def test_struct_from_list_of_pairs_errors(): + ty = pa.struct([ + pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_()) + ]) + + # test that it raises if the key doesn't match the expected field name + data = [ + [], + [('a', 5), ('c', True), ('b', None)], + ] + msg = "The expected field name is `b` but `c` was given" + with pytest.raises(ValueError, match=msg): + pa.array(data, type=ty) + + # test various errors both at the first position and after because of key + # type inference + template = ( + r"Could not convert {} with type {}: was expecting tuple of " + r"\(key, value\) pair" + ) + cases = [ + tuple(), # empty key-value pair + tuple('a',), # missing value + tuple('unknown-key',), # not known field name + 'string', # not a tuple + ] + for key_value_pair in cases: + msg = template.format( + str(key_value_pair).replace('(', r'\(').replace(')', r'\)'), + type(key_value_pair).__name__ + ) + + with pytest.raises(TypeError, match=msg): + pa.array([ + [key_value_pair], + [('a', 5), ('b', 'foo'), ('c', None)], + ], type=ty) + + with pytest.raises(TypeError, match=msg): + pa.array([ + [('a', 5), ('b', 'foo'), ('c', None)], + [key_value_pair], + ], type=ty) + + def test_struct_from_mixed_sequence(): # It is forbidden to mix dicts and tuples when initializing a struct array ty = pa.struct([pa.field('a', pa.int32()), @@ -1530,6 +1666,7 @@ def test_struct_from_dicts_inference(): pa.field('c', pa.bool_())]) data = [{'a': 5, 'b': 'foo', 'c': True}, {'a': 6, 'b': 'bar', 'c': False}] + arr = pa.array(data) check_struct_type(arr.type, expected_type) assert arr.to_pylist() == data @@ -1543,6 +1680,7 @@ def test_struct_from_dicts_inference(): None, {'a': None, 'b': None, 'c': None}, {'a': None, 'b': 'bar', 'c': None}] + arr = pa.array(data) data_as_ndarray = np.empty(len(data), dtype=object) data_as_ndarray[:] = data @@ -1561,6 +1699,7 @@ def test_struct_from_dicts_inference(): {'a': {'aa': None, 'ab': False}, 'b': None}, {'a': None, 'b': 'bar'}] arr = pa.array(data) + assert arr.to_pylist() == data # Edge cases @@ -1664,3 +1803,228 @@ def test_map_from_tuples(): for entry in [[(5,)], [()], [('5', 'foo', True)]]: with pytest.raises(ValueError, match="(?i)tuple size"): pa.array([entry], type=pa.map_('i4', 'i4')) + + +def test_dictionary_from_boolean(): + typ = pa.dictionary(pa.int8(), value_type=pa.bool_()) + a = pa.array([False, False, True, False, True], type=typ) + assert isinstance(a.type, pa.DictionaryType) + assert a.type.equals(typ) + + expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8()) + expected_dictionary = pa.array([False, True], type=pa.bool_()) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + +@pytest.mark.parametrize('value_type', [ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.float32(), + pa.float64(), +]) +def test_dictionary_from_integers(value_type): + typ = pa.dictionary(pa.int8(), value_type=value_type) + a = pa.array([1, 2, 1, 1, 2, 3], type=typ) + assert isinstance(a.type, pa.DictionaryType) + assert a.type.equals(typ) + + expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8()) + expected_dictionary = pa.array([1, 2, 3], type=value_type) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + +@pytest.mark.parametrize('input_index_type', [ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64() +]) +def test_dictionary_index_type(input_index_type): + # dictionary array is constructed using adaptive index type builder, + # but the input index type is considered as the minimal width type to use + + typ = pa.dictionary(input_index_type, value_type=pa.int64()) + arr = pa.array(range(10), type=typ) + assert arr.type.equals(typ) + + +def test_dictionary_is_always_adaptive(): + # dictionary array is constructed using adaptive index type builder, + # meaning that the output index type may be wider than the given index type + # since it depends on the input data + typ = pa.dictionary(pa.int8(), value_type=pa.int64()) + + a = pa.array(range(2**7), type=typ) + expected = pa.dictionary(pa.int8(), pa.int64()) + assert a.type.equals(expected) + + a = pa.array(range(2**7 + 1), type=typ) + expected = pa.dictionary(pa.int16(), pa.int64()) + assert a.type.equals(expected) + + +def test_dictionary_from_strings(): + for value_type in [pa.binary(), pa.string()]: + typ = pa.dictionary(pa.int8(), value_type) + a = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=typ) + + assert isinstance(a.type, pa.DictionaryType) + + expected_indices = pa.array([0, 1, 2, 1, 2, 3], type=pa.int8()) + expected_dictionary = pa.array(["", "a", "bb", "ccc"], type=value_type) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + # fixed size binary type + typ = pa.dictionary(pa.int8(), pa.binary(3)) + a = pa.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=typ) + assert isinstance(a.type, pa.DictionaryType) + + expected_indices = pa.array([0, 0, 1, 2, 1], type=pa.int8()) + expected_dictionary = pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3)) + assert a.indices.equals(expected_indices) + assert a.dictionary.equals(expected_dictionary) + + +def _has_unique_field_names(ty): + if isinstance(ty, pa.StructType): + field_names = [field.name for field in ty] + return len(set(field_names)) == len(field_names) + else: + return True + + +@h.given(past.all_arrays) +def test_array_to_pylist_roundtrip(arr): + # TODO(kszucs): ARROW-9997 + h.assume(_has_unique_field_names(arr.type)) + seq = arr.to_pylist() + restored = pa.array(seq, type=arr.type) + assert restored.equals(arr) + + +@pytest.mark.large_memory +def test_auto_chunking_binary_like(): + # single chunk + v1 = b'x' * 100000000 + v2 = b'x' * 147483646 + + # single chunk + one_chunk_data = [v1] * 20 + [b'', None, v2] + arr = pa.array(one_chunk_data, type=pa.binary()) + assert isinstance(arr, pa.Array) + assert len(arr) == 23 + assert arr[20].as_py() == b'' + assert arr[21].as_py() is None + assert arr[22].as_py() == v2 + + # two chunks + two_chunk_data = one_chunk_data + [b'two'] + arr = pa.array(two_chunk_data, type=pa.binary()) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 23 + assert len(arr.chunk(1)) == 1 + assert arr.chunk(0)[20].as_py() == b'' + assert arr.chunk(0)[21].as_py() is None + assert arr.chunk(0)[22].as_py() == v2 + assert arr.chunk(1).to_pylist() == [b'two'] + + # three chunks + three_chunk_data = one_chunk_data * 2 + [b'three', b'three'] + arr = pa.array(three_chunk_data, type=pa.binary()) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 3 + assert len(arr.chunk(0)) == 23 + assert len(arr.chunk(1)) == 23 + assert len(arr.chunk(2)) == 2 + for i in range(2): + assert arr.chunk(i)[20].as_py() == b'' + assert arr.chunk(i)[21].as_py() is None + assert arr.chunk(i)[22].as_py() == v2 + assert arr.chunk(2).to_pylist() == [b'three', b'three'] + + +@pytest.mark.large_memory +def test_auto_chunking_list_of_binary(): + # ARROW-6281 + vals = [['x' * 1024]] * ((2 << 20) + 1) + arr = pa.array(vals) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 2**21 - 1 + assert len(arr.chunk(1)) == 2 + assert arr.chunk(1).to_pylist() == [['x' * 1024]] * 2 + + +@pytest.mark.slow +@pytest.mark.large_memory +def test_auto_chunking_list_like(): + item = np.ones((2**28,), dtype='uint8') + data = [item] * (2**3 - 1) + arr = pa.array(data, type=pa.list_(pa.uint8())) + assert isinstance(arr, pa.Array) + assert len(arr) == 7 + + item = np.ones((2**28,), dtype='uint8') + data = [item] * 2**3 + arr = pa.array(data, type=pa.list_(pa.uint8())) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 7 + assert len(arr.chunk(1)) == 1 + assert arr.chunk(1)[0].as_py() == list(item) + + +@pytest.mark.slow +@pytest.mark.large_memory +def test_auto_chunking_map_type(): + # takes ~20 minutes locally + ty = pa.map_(pa.int8(), pa.int8()) + item = [(1, 1)] * 2**28 + data = [item] * 2**3 + arr = pa.array(data, type=ty) + assert isinstance(arr, pa.ChunkedArray) + assert len(arr.chunk(0)) == 7 + assert len(arr.chunk(1)) == 1 + + +@pytest.mark.large_memory +@pytest.mark.parametrize(('ty', 'char'), [ + (pa.string(), 'x'), + (pa.binary(), b'x'), +]) +def test_nested_auto_chunking(ty, char): + v1 = char * 100000000 + v2 = char * 147483646 + + struct_type = pa.struct([ + pa.field('bool', pa.bool_()), + pa.field('integer', pa.int64()), + pa.field('string-like', ty), + ]) + + data = [{'bool': True, 'integer': 1, 'string-like': v1}] * 20 + data.append({'bool': True, 'integer': 1, 'string-like': v2}) + arr = pa.array(data, type=struct_type) + assert isinstance(arr, pa.Array) + + data.append({'bool': True, 'integer': 1, 'string-like': char}) + arr = pa.array(data, type=struct_type) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 21 + assert len(arr.chunk(1)) == 1 + assert arr.chunk(1)[0].as_py() == { + 'bool': True, + 'integer': 1, + 'string-like': char + } diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 03407521c12..5de84b30432 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1497,6 +1497,22 @@ def test_bytes_exceed_2gb(self): table = pa.Table.from_pandas(df) assert table[0].num_chunks == 2 + @pytest.mark.large_memory + @pytest.mark.parametrize('char', ['x', b'x']) + def test_auto_chunking_pandas_series_of_strings(self, char): + # ARROW-2367 + v1 = char * 100000000 + v2 = char * 147483646 + + df = pd.DataFrame({ + 'strings': [[v1]] * 20 + [[v2]] + [[b'x']] + }) + arr = pa.array(df['strings'], from_pandas=True) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + assert len(arr.chunk(0)) == 21 + assert len(arr.chunk(1)) == 1 + def test_fixed_size_bytes(self): values = [b'foo', None, bytearray(b'bar'), None, None, b'hey'] df = pd.DataFrame({'strings': values}) @@ -2087,6 +2103,22 @@ def test_large_binary_list(self): s, pd.Series([["aa", "bb"], None, ["cc"], []]), check_names=False) + @pytest.mark.slow + @pytest.mark.large_memory + def test_auto_chunking_on_list_overflow(self): + # ARROW-9976 + n = 2**24 + df = pd.DataFrame.from_dict({ + "a": list(np.zeros((n, 2**7), dtype='uint8')), + "b": range(n) + }) + table = pa.Table.from_pandas(df) + + column_a = table[0] + assert column_a.num_chunks == 2 + assert len(column_a.chunk(0)) == 2**24 - 1 + assert len(column_a.chunk(1)) == 1 + class TestConvertStructTypes: """ diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 091ae38e6e4..fa48ad8b5f2 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -531,27 +531,28 @@ def test_map(): def test_dictionary(): - indices = [2, 1, 2, 0] - dictionary = ['foo', 'bar', 'baz'] + indices = pa.array([2, None, 1, 2, 0, None]) + dictionary = pa.array(['foo', 'bar', 'baz']) arr = pa.DictionaryArray.from_arrays(indices, dictionary) - expected = ['baz', 'bar', 'baz', 'foo'] + expected = ['baz', None, 'bar', 'baz', 'foo', None] + assert arr.to_pylist() == expected for j, (i, v) in enumerate(zip(indices, expected)): s = arr[j] assert s.as_py() == v assert s.value.as_py() == v - assert s.index.as_py() == i - assert s.dictionary.to_pylist() == dictionary + assert s.index.equals(i) + assert s.dictionary.equals(dictionary) with pytest.warns(FutureWarning): - assert s.index_value.as_py() == i + assert s.index_value.equals(i) with pytest.warns(FutureWarning): assert s.dictionary_value.as_py() == v - with pytest.raises(pa.ArrowNotImplementedError): - pickle.loads(pickle.dumps(s)) + restored = pickle.loads(pickle.dumps(s)) + assert restored.equals(s) def test_union(): diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index c52751e91ac..9ac2f01686f 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -25,6 +25,7 @@ import pytz import hypothesis as h import hypothesis.strategies as st +import hypothesis.extra.pytz as tzst import weakref import numpy as np @@ -258,6 +259,9 @@ def test_is_primitive(): @pytest.mark.parametrize(('tz', 'expected'), [ (pytz.utc, 'UTC'), (pytz.timezone('Europe/Paris'), 'Europe/Paris'), + # StaticTzInfo.tzname returns with '-09' so we need to infer the timezone's + # name from the tzinfo.zone attribute + (pytz.timezone('Etc/GMT-9'), 'Etc/GMT-9'), (pytz.FixedOffset(180), '+03:00'), (datetime.timezone.utc, '+00:00'), (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') @@ -280,6 +284,13 @@ def test_tzinfo_to_string_errors(): pa.lib.tzinfo_to_string(tz) +@h.given(tzst.timezones()) +def test_pytz_timezone_roundtrip(tz): + timezone_string = pa.lib.tzinfo_to_string(tz) + timezone_tzinfo = pa.lib.string_to_tzinfo(timezone_string) + assert timezone_tzinfo == tz + + def test_convert_custom_tzinfo_objects_to_string(): class CorrectTimezone1(datetime.tzinfo): """