From 00cab9a533deb1c892846fe6cfd425c477bc753d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 24 Apr 2018 16:55:07 +0200 Subject: [PATCH 1/3] ARROW-2499: [C++] Refactor Python iteration --- cpp/src/arrow/python/CMakeLists.txt | 1 + cpp/src/arrow/python/builtin_convert.cc | 47 ++------- cpp/src/arrow/python/common.h | 1 + cpp/src/arrow/python/iterators.h | 103 +++++++++++++++++++ cpp/src/arrow/python/numpy-internal.h | 1 + cpp/src/arrow/python/numpy_to_arrow.cc | 71 ++----------- cpp/src/arrow/python/python_to_arrow.cc | 17 ++- python/pyarrow/tests/test_convert_builtin.py | 56 +++++++--- 8 files changed, 174 insertions(+), 123 deletions(-) create mode 100644 cpp/src/arrow/python/iterators.h diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index f6c92a79bd6..7b80b0704bb 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -109,6 +109,7 @@ install(FILES helpers.h init.h io.h + iterators.h numpy_convert.h numpy_interop.h numpy_to_arrow.h diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 740c89666a8..cb35d3d45f7 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -36,6 +36,7 @@ #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" +#include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/util/datetime.h" @@ -76,33 +77,7 @@ class TypeInferrer { // Infer value type from a sequence of values Status VisitSequence(PyObject* obj) { - // Loop through a sequence - if (PyArray_Check(obj)) { - Py_ssize_t size = PySequence_Size(obj); - OwnedRef value_ref; - - for (Py_ssize_t i = 0; i < size; ++i) { - auto array = reinterpret_cast(obj); - auto ptr = reinterpret_cast(PyArray_GETPTR1(array, i)); - - value_ref.reset(PyArray_GETITEM(array, ptr)); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(Visit(value_ref.obj())); - } - } else if (PySequence_Check(obj)) { - OwnedRef seq_ref(PySequence_Fast(obj, "Object is not a sequence or iterable")); - RETURN_IF_PYERROR(); - PyObject* seq = seq_ref.obj(); - - Py_ssize_t size = PySequence_Fast_GET_SIZE(seq); - for (Py_ssize_t i = 0; i < size; ++i) { - PyObject* value = PySequence_Fast_GET_ITEM(seq, i); - RETURN_NOT_OK(Visit(value)); - } - } else { - return Status::TypeError("Object is not a sequence or iterable"); - } - return Status::OK(); + return internal::VisitSequence(obj, [this](PyObject* value) { return Visit(value); }); } Status Visit(PyObject* obj) { @@ -139,6 +114,8 @@ class TypeInferrer { return Status::Invalid(ss.str()); } } else if (PyList_Check(obj) || PyArray_Check(obj)) { + // XXX This code path is used for non-object arrays, which leads + // to wasteful creation and inspection of temporary Python objects. return VisitList(obj); } else if (PyDict_Check(obj)) { return VisitDict(obj); @@ -397,16 +374,9 @@ class TypedConverterVisitor : public TypedConverter { /// Ensure we've allocated enough space RETURN_NOT_OK(this->typed_builder_->Reserve(size)); // Iterate over the items adding each one - if (PySequence_Check(obj)) { - auto self = static_cast(this); - for (int64_t i = 0; i < size; ++i) { - OwnedRef ref(PySequence_GetItem(obj, i)); - RETURN_NOT_OK(self->AppendSingle(ref.obj())); - } - } else { - return Status::TypeError("Object is not a sequence"); - } - return Status::OK(); + auto self = static_cast(this); + auto visit = [self](PyObject* item) { return self->AppendSingle(item); }; + return internal::VisitSequence(obj, visit); } // Append a missing item (default implementation) @@ -591,6 +561,9 @@ class ListConverter : public TypedConverterVisitor { Status AppendItem(PyObject* obj) { RETURN_NOT_OK(typed_builder_->Append()); const auto list_size = static_cast(PySequence_Size(obj)); + if (ARROW_PREDICT_FALSE(list_size == -1)) { + RETURN_IF_PYERROR(); + } return value_converter_->AppendMultiple(obj, list_size); } diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 76aee16e159..a61d067e44f 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -35,6 +35,7 @@ class MemoryPool; namespace py { +// TODO: inline the successful case ARROW_EXPORT Status CheckPyError(StatusCode code = StatusCode::UnknownError); ARROW_EXPORT Status PassPyError(); diff --git a/cpp/src/arrow/python/iterators.h b/cpp/src/arrow/python/iterators.h new file mode 100644 index 00000000000..efada6e179d --- /dev/null +++ b/cpp/src/arrow/python/iterators.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_ITERATORS_H +#define ARROW_PYTHON_ITERATORS_H + +#include + +#include "arrow/python/common.h" +#include "arrow/python/numpy-internal.h" + +namespace arrow { +namespace py { +namespace internal { + +// Visit the Python sequence, calling the given callable on each element. +// If the callable returns a non-OK status, iteration stops and the status is returned. +template +inline Status VisitSequence(PyObject* obj, UnaryFunction&& func) { + if (PyArray_Check(obj)) { + PyArrayObject* arr_obj = reinterpret_cast(obj); + if (PyArray_NDIM(arr_obj) != 1) { + return Status::Invalid("Only 1D arrays accepted"); + } + + if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) { + // It's an array object, we can fetch object pointers directly + const Ndarray1DIndexer objects(arr_obj); + for (int64_t i = 0; i < objects.size(); ++i) { + RETURN_NOT_OK(func(objects[i])); + } + return Status::OK(); + } + // It's a non-object array, fall back on regular sequence access. + // (note PyArray_GETITEM() is slightly different: it returns standard + // Python types, not Numpy scalar types) + // This code path is inefficient: callers should implement dedicated + // logic for non-object arrays. + } + if (PySequence_Check(obj)) { + if (PyList_Check(obj) || PyTuple_Check(obj)) { + // Use fast item access + const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj); + for (Py_ssize_t i = 0; i < size; ++i) { + PyObject* value = PySequence_Fast_GET_ITEM(obj, i); + RETURN_NOT_OK(func(value)); + } + } else { + // Regular sequence: avoid making a potentially large copy + const Py_ssize_t size = PySequence_Size(obj); + RETURN_IF_PYERROR(); + for (Py_ssize_t i = 0; i < size; ++i) { + OwnedRef value_ref(PySequence_ITEM(obj, i)); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(func(value_ref.obj())); + } + } + } else { + return Status::TypeError("Object is not a sequence"); + } + return Status::OK(); +} + +// Like IterateSequence, but accepts any generic iterable (including +// non-restartable iterators, e.g. generators). +template +inline Status VisitIterable(PyObject* obj, UnaryFunction&& func) { + if (PySequence_Check(obj)) { + // Numpy arrays fall here as well + return VisitSequence(obj, std::forward(func)); + } + // Fall back on the iterator protocol + OwnedRef iter_ref(PyObject_GetIter(obj)); + PyObject* iter = iter_ref.obj(); + RETURN_IF_PYERROR(); + PyObject* value; + while ((value = PyIter_Next(iter))) { + OwnedRef value_ref(value); + RETURN_NOT_OK(func(value_ref.obj())); + } + RETURN_IF_PYERROR(); // __next__() might have raised + return Status::OK(); +} + +} // namespace internal +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_ITERATORS_H diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 7672861d447..72a5d6b10be 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -57,6 +57,7 @@ class Ndarray1DIndexer { T* begin() const { return data(); } T* end() const { return begin() + size() * stride_; } + // XXX should be `stride_ != 1` bool is_strided() const { return stride_ == 1; } T& operator[](size_type index) { return data_[index * stride_]; } diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index f5e093a2050..646fb74e035 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -51,6 +51,7 @@ #include "arrow/python/config.h" #include "arrow/python/decimal.h" #include "arrow/python/helpers.h" +#include "arrow/python/iterators.h" #include "arrow/python/numpy-internal.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" @@ -1171,70 +1172,20 @@ Status NumPyConverter::ConvertObjects() { } } -template -Status LoopPySequence(PyObject* sequence, T func) { - if (PySequence_Check(sequence)) { - OwnedRef ref; - Py_ssize_t size = PySequence_Size(sequence); - if (PyArray_Check(sequence)) { - auto array = reinterpret_cast(sequence); - Ndarray1DIndexer objects(array); - for (int64_t i = 0; i < size; ++i) { - RETURN_NOT_OK(func(objects[i])); - } - } else { - for (int64_t i = 0; i < size; ++i) { - ref.reset(PySequence_GetItem(sequence, i)); - RETURN_NOT_OK(func(ref.obj())); - } - } - } else if (PyObject_HasAttrString(sequence, "__iter__")) { - OwnedRef iter(PyObject_GetIter(sequence)); - PyObject* item; - while ((item = PyIter_Next(iter.obj()))) { - OwnedRef ref(item); - RETURN_NOT_OK(func(ref.obj())); - } - } else { - return Status::TypeError("Object is not a sequence or iterable"); - } - - return Status::OK(); -} - -template +// Like VisitIterable, but the function takes a second boolean argument +// deducted from `have_mask` and `mask_values` +template Status LoopPySequenceWithMasks(PyObject* sequence, const Ndarray1DIndexer& mask_values, - bool have_mask, T func) { - if (PySequence_Check(sequence)) { - OwnedRef ref; - Py_ssize_t size = PySequence_Size(sequence); - if (PyArray_Check(sequence)) { - auto array = reinterpret_cast(sequence); - Ndarray1DIndexer objects(array); - for (int64_t i = 0; i < size; ++i) { - RETURN_NOT_OK(func(objects[i], have_mask && mask_values[i])); - } - } else { - for (int64_t i = 0; i < size; ++i) { - ref.reset(PySequence_GetItem(sequence, i)); - RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i])); - } - } - } else if (PyObject_HasAttrString(sequence, "__iter__")) { - OwnedRef iter(PyObject_GetIter(sequence)); - PyObject* item; + bool have_mask, BinaryFunction&& func) { + if (have_mask) { int64_t i = 0; - while ((item = PyIter_Next(iter.obj()))) { - OwnedRef ref(item); - RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i])); - i++; - } + auto visit = [&](PyObject* obj) { return func(obj, mask_values[i++] != 0); }; + return internal::VisitIterable(sequence, visit); } else { - return Status::TypeError("Object is not a sequence or iterable"); + auto visit = [&](PyObject* obj) { return func(obj, false); }; + return internal::VisitIterable(sequence, visit); } - - return Status::OK(); } template @@ -1473,7 +1424,7 @@ Status NumPyConverter::ConvertLists(const std::shared_ptr& type, } }; - return LoopPySequence(list, foreach_item); + return internal::VisitIterable(list, foreach_item); } default: { std::stringstream ss; diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 279ce1f2753..64cf2b4c168 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -40,6 +40,7 @@ #include "arrow/python/common.h" #include "arrow/python/helpers.h" +#include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/platform.h" #include "arrow/python/util/datetime.h" @@ -577,17 +578,11 @@ Status SerializeSequences(PyObject* context, std::vector sequences, SequenceBuilder builder(nullptr); std::vector sublists, subtuples, subdicts, subsets; for (const auto& sequence : sequences) { - OwnedRef iterator(PyObject_GetIter(sequence)); - RETURN_IF_PYERROR(); - OwnedRef item; - while (true) { - item.reset(PyIter_Next(iterator.obj())); - if (!item.obj()) { - break; - } - RETURN_NOT_OK(Append(context, item.obj(), &builder, &sublists, &subtuples, - &subdicts, &subsets, blobs_out)); - } + auto visit = [&](PyObject* obj) { + return Append(context, obj, &builder, &sublists, &subtuples, &subdicts, &subsets, + blobs_out); + }; + RETURN_NOT_OK(internal::VisitIterable(sequence, visit)); } std::shared_ptr list; if (sublists.size() > 0) { diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 7fb43015e3f..8a8e5542ee2 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -21,6 +21,7 @@ from pyarrow.compat import unittest, u # noqa import pyarrow as pa +import collections import datetime import decimal import itertools @@ -118,12 +119,22 @@ def _as_tuple(xs): return tuple(xs) +def _as_deque(xs): + # deque is a sequence while neither tuple nor list + return collections.deque(xs) + + def _as_dict_values(xs): + # a dict values object is not a sequence, just a regular iterable dct = {k: v for k, v in enumerate(xs)} return six.viewvalues(dct) -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +parametrize_with_iterable_types = pytest.mark.parametrize( + "seq", [_as_list, _as_tuple, _as_deque, _as_dict_values]) + + +@parametrize_with_iterable_types def test_sequence_types(seq): arr1 = pa.array(seq([1, 2, 3])) arr2 = pa.array([1, 2, 3]) @@ -131,7 +142,7 @@ def test_sequence_types(seq): assert arr1.equals(arr2) -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types def test_sequence_boolean(seq): expected = [True, None, False, None] arr = pa.array(seq(expected)) @@ -141,7 +152,7 @@ def test_sequence_boolean(seq): assert arr.to_pylist() == expected -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types def test_sequence_numpy_boolean(seq): expected = [np.bool(True), None, np.bool(False), None] arr = pa.array(seq(expected)) @@ -151,7 +162,7 @@ def test_sequence_numpy_boolean(seq): assert arr.to_pylist() == expected -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types def test_empty_list(seq): arr = pa.array(seq([])) assert len(arr) == 0 @@ -160,16 +171,30 @@ def test_empty_list(seq): assert arr.to_pylist() == [] -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types def test_nested_lists(seq): - arr = pa.array(seq([[], [1, 2], None])) + data = [[], [1, 2], None] + arr = pa.array(seq(data)) assert len(arr) == 3 assert arr.null_count == 1 assert arr.type == pa.list_(pa.int64()) - assert arr.to_pylist() == [[], [1, 2], None] + assert arr.to_pylist() == data + # With explicit type + arr = pa.array(seq(data), type=pa.list_(pa.int32())) + assert len(arr) == 3 + assert arr.null_count == 1 + assert arr.type == pa.list_(pa.int32()) + assert arr.to_pylist() == data + + +@parametrize_with_iterable_types +def test_list_with_non_list(seq): + # List types don't accept non-sequences + with pytest.raises(pa.ArrowTypeError): + pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64())) -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types def test_nested_arrays(seq): arr = pa.array(seq([np.array([], dtype=int), np.array([1, 2]), None])) assert len(arr) == 3 @@ -178,15 +203,16 @@ def test_nested_arrays(seq): assert arr.to_pylist() == [[], [1, 2], None] -def test_sequence_all_none(): - arr = pa.array([None, None]) +@parametrize_with_iterable_types +def test_sequence_all_none(seq): + arr = pa.array(seq([None, None])) assert len(arr) == 2 assert arr.null_count == 2 assert arr.type == pa.null() assert arr.to_pylist() == [None, None] -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) def test_sequence_integer(seq, np_scalar_pa_type): np_scalar, pa_type = np_scalar_pa_type @@ -199,7 +225,7 @@ def test_sequence_integer(seq, np_scalar_pa_type): assert arr.to_pylist() == expected -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types def test_sequence_integer_inferred(seq): expected = [1, None, 3, None] arr = pa.array(seq(expected)) @@ -209,7 +235,7 @@ def test_sequence_integer_inferred(seq): assert arr.to_pylist() == expected -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) def test_sequence_numpy_integer(seq, np_scalar_pa_type): np_scalar, pa_type = np_scalar_pa_type @@ -223,7 +249,7 @@ def test_sequence_numpy_integer(seq, np_scalar_pa_type): assert arr.to_pylist() == expected -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type): np_scalar, pa_type = np_scalar_pa_type @@ -282,7 +308,7 @@ def test_sequence_double(): assert arr.to_pylist() == data -@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values]) +@parametrize_with_iterable_types @pytest.mark.parametrize("np_scalar", [np.float16, np.float32, np.float64]) def test_sequence_numpy_double(seq, np_scalar): data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, None] From 91c5af164ef5f7c00163c07e3d365a0ba023b6b8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 26 Apr 2018 17:31:28 +0200 Subject: [PATCH 2/3] Add TODO for performance issue --- cpp/src/arrow/python/builtin_convert.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index cb35d3d45f7..88674d095c9 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -114,8 +114,8 @@ class TypeInferrer { return Status::Invalid(ss.str()); } } else if (PyList_Check(obj) || PyArray_Check(obj)) { - // XXX This code path is used for non-object arrays, which leads - // to wasteful creation and inspection of temporary Python objects. + // TODO(ARROW-2514): This code path is used for non-object arrays, which + // leads to wasteful creation and inspection of temporary Python objects. return VisitList(obj); } else if (PyDict_Check(obj)) { return VisitDict(obj); From ac31c6c03c3443bb294b1a148bc017bf3a693677 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 1 May 2018 13:21:52 +0200 Subject: [PATCH 3/3] Fix Ndarray1DIndexer::is_strided (unused) --- cpp/src/arrow/python/numpy-internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 72a5d6b10be..12319aae9da 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -57,8 +57,7 @@ class Ndarray1DIndexer { T* begin() const { return data(); } T* end() const { return begin() + size() * stride_; } - // XXX should be `stride_ != 1` - bool is_strided() const { return stride_ == 1; } + bool is_strided() const { return stride_ != 1; } T& operator[](size_type index) { return data_[index * stride_]; } T& operator[](size_type index) const { return data_[index * stride_]; }