Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ install(FILES
helpers.h
init.h
io.h
iterators.h
numpy_convert.h
numpy_interop.h
numpy_to_arrow.h
Expand Down
47 changes: 10 additions & 37 deletions cpp/src/arrow/python/builtin_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/python/iterators.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/util/datetime.h"

Expand Down Expand Up @@ -76,33 +77,7 @@ class TypeInferrer {

// Infer value type from a sequence of values
Status VisitSequence(PyObject* obj) {
// Loop through a sequence
if (PyArray_Check(obj)) {
Py_ssize_t size = PySequence_Size(obj);
OwnedRef value_ref;

for (Py_ssize_t i = 0; i < size; ++i) {
auto array = reinterpret_cast<PyArrayObject*>(obj);
auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i));

value_ref.reset(PyArray_GETITEM(array, ptr));
RETURN_IF_PYERROR();
RETURN_NOT_OK(Visit(value_ref.obj()));
}
} else if (PySequence_Check(obj)) {
OwnedRef seq_ref(PySequence_Fast(obj, "Object is not a sequence or iterable"));
RETURN_IF_PYERROR();
PyObject* seq = seq_ref.obj();

Py_ssize_t size = PySequence_Fast_GET_SIZE(seq);
for (Py_ssize_t i = 0; i < size; ++i) {
PyObject* value = PySequence_Fast_GET_ITEM(seq, i);
RETURN_NOT_OK(Visit(value));
}
} else {
return Status::TypeError("Object is not a sequence or iterable");
}
return Status::OK();
return internal::VisitSequence(obj, [this](PyObject* value) { return Visit(value); });
}

Status Visit(PyObject* obj) {
Expand Down Expand Up @@ -139,6 +114,8 @@ class TypeInferrer {
return Status::Invalid(ss.str());
}
} else if (PyList_Check(obj) || PyArray_Check(obj)) {
// TODO(ARROW-2514): This code path is used for non-object arrays, which
// leads to wasteful creation and inspection of temporary Python objects.
return VisitList(obj);
} else if (PyDict_Check(obj)) {
return VisitDict(obj);
Expand Down Expand Up @@ -397,16 +374,9 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
/// Ensure we've allocated enough space
RETURN_NOT_OK(this->typed_builder_->Reserve(size));
// Iterate over the items adding each one
if (PySequence_Check(obj)) {
auto self = static_cast<Derived*>(this);
for (int64_t i = 0; i < size; ++i) {
OwnedRef ref(PySequence_GetItem(obj, i));
RETURN_NOT_OK(self->AppendSingle(ref.obj()));
}
} else {
return Status::TypeError("Object is not a sequence");
}
return Status::OK();
auto self = static_cast<Derived*>(this);
auto visit = [self](PyObject* item) { return self->AppendSingle(item); };
return internal::VisitSequence(obj, visit);
}

// Append a missing item (default implementation)
Expand Down Expand Up @@ -591,6 +561,9 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> {
Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(typed_builder_->Append());
const auto list_size = static_cast<int64_t>(PySequence_Size(obj));
if (ARROW_PREDICT_FALSE(list_size == -1)) {
RETURN_IF_PYERROR();
}
return value_converter_->AppendMultiple(obj, list_size);
}

Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/python/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class MemoryPool;

namespace py {

// TODO: inline the successful case
ARROW_EXPORT Status CheckPyError(StatusCode code = StatusCode::UnknownError);

ARROW_EXPORT Status PassPyError();
Expand Down
103 changes: 103 additions & 0 deletions cpp/src/arrow/python/iterators.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_PYTHON_ITERATORS_H
#define ARROW_PYTHON_ITERATORS_H

#include <utility>

#include "arrow/python/common.h"
#include "arrow/python/numpy-internal.h"

namespace arrow {
namespace py {
namespace internal {

// Visit the Python sequence, calling the given callable on each element.
// If the callable returns a non-OK status, iteration stops and the status is returned.
template <class UnaryFunction>
inline Status VisitSequence(PyObject* obj, UnaryFunction&& func) {
if (PyArray_Check(obj)) {
PyArrayObject* arr_obj = reinterpret_cast<PyArrayObject*>(obj);
if (PyArray_NDIM(arr_obj) != 1) {
return Status::Invalid("Only 1D arrays accepted");
}

if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) {
// It's an array object, we can fetch object pointers directly
const Ndarray1DIndexer<PyObject*> objects(arr_obj);
for (int64_t i = 0; i < objects.size(); ++i) {
RETURN_NOT_OK(func(objects[i]));
}
return Status::OK();
}
// It's a non-object array, fall back on regular sequence access.
// (note PyArray_GETITEM() is slightly different: it returns standard
// Python types, not Numpy scalar types)
// This code path is inefficient: callers should implement dedicated
// logic for non-object arrays.
}
if (PySequence_Check(obj)) {
if (PyList_Check(obj) || PyTuple_Check(obj)) {
// Use fast item access
const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
for (Py_ssize_t i = 0; i < size; ++i) {
PyObject* value = PySequence_Fast_GET_ITEM(obj, i);
RETURN_NOT_OK(func(value));
}
} else {
// Regular sequence: avoid making a potentially large copy
const Py_ssize_t size = PySequence_Size(obj);
RETURN_IF_PYERROR();
for (Py_ssize_t i = 0; i < size; ++i) {
OwnedRef value_ref(PySequence_ITEM(obj, i));
RETURN_IF_PYERROR();
RETURN_NOT_OK(func(value_ref.obj()));
}
}
} else {
return Status::TypeError("Object is not a sequence");
}
return Status::OK();
}

// Like IterateSequence, but accepts any generic iterable (including
// non-restartable iterators, e.g. generators).
template <class UnaryFunction>
inline Status VisitIterable(PyObject* obj, UnaryFunction&& func) {
if (PySequence_Check(obj)) {
// Numpy arrays fall here as well
return VisitSequence(obj, std::forward<UnaryFunction>(func));
}
// Fall back on the iterator protocol
OwnedRef iter_ref(PyObject_GetIter(obj));
PyObject* iter = iter_ref.obj();
RETURN_IF_PYERROR();
PyObject* value;
while ((value = PyIter_Next(iter))) {
OwnedRef value_ref(value);
RETURN_NOT_OK(func(value_ref.obj()));
}
RETURN_IF_PYERROR(); // __next__() might have raised
return Status::OK();
}

} // namespace internal
} // namespace py
} // namespace arrow

#endif // ARROW_PYTHON_ITERATORS_H
2 changes: 1 addition & 1 deletion cpp/src/arrow/python/numpy-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class Ndarray1DIndexer {
T* begin() const { return data(); }
T* end() const { return begin() + size() * stride_; }

bool is_strided() const { return stride_ == 1; }
bool is_strided() const { return stride_ != 1; }

T& operator[](size_type index) { return data_[index * stride_]; }
T& operator[](size_type index) const { return data_[index * stride_]; }
Expand Down
71 changes: 11 additions & 60 deletions cpp/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "arrow/python/config.h"
#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/python/iterators.h"
#include "arrow/python/numpy-internal.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/type_traits.h"
Expand Down Expand Up @@ -1171,70 +1172,20 @@ Status NumPyConverter::ConvertObjects() {
}
}

template <typename T>
Status LoopPySequence(PyObject* sequence, T func) {
if (PySequence_Check(sequence)) {
OwnedRef ref;
Py_ssize_t size = PySequence_Size(sequence);
if (PyArray_Check(sequence)) {
auto array = reinterpret_cast<PyArrayObject*>(sequence);
Ndarray1DIndexer<PyObject*> objects(array);
for (int64_t i = 0; i < size; ++i) {
RETURN_NOT_OK(func(objects[i]));
}
} else {
for (int64_t i = 0; i < size; ++i) {
ref.reset(PySequence_GetItem(sequence, i));
RETURN_NOT_OK(func(ref.obj()));
}
}
} else if (PyObject_HasAttrString(sequence, "__iter__")) {
OwnedRef iter(PyObject_GetIter(sequence));
PyObject* item;
while ((item = PyIter_Next(iter.obj()))) {
OwnedRef ref(item);
RETURN_NOT_OK(func(ref.obj()));
}
} else {
return Status::TypeError("Object is not a sequence or iterable");
}

return Status::OK();
}

template <typename T>
// Like VisitIterable, but the function takes a second boolean argument
// deducted from `have_mask` and `mask_values`
template <class BinaryFunction>
Status LoopPySequenceWithMasks(PyObject* sequence,
const Ndarray1DIndexer<uint8_t>& mask_values,
bool have_mask, T func) {
if (PySequence_Check(sequence)) {
OwnedRef ref;
Py_ssize_t size = PySequence_Size(sequence);
if (PyArray_Check(sequence)) {
auto array = reinterpret_cast<PyArrayObject*>(sequence);
Ndarray1DIndexer<PyObject*> objects(array);
for (int64_t i = 0; i < size; ++i) {
RETURN_NOT_OK(func(objects[i], have_mask && mask_values[i]));
}
} else {
for (int64_t i = 0; i < size; ++i) {
ref.reset(PySequence_GetItem(sequence, i));
RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
}
}
} else if (PyObject_HasAttrString(sequence, "__iter__")) {
OwnedRef iter(PyObject_GetIter(sequence));
PyObject* item;
bool have_mask, BinaryFunction&& func) {
if (have_mask) {
int64_t i = 0;
while ((item = PyIter_Next(iter.obj()))) {
OwnedRef ref(item);
RETURN_NOT_OK(func(ref.obj(), have_mask && mask_values[i]));
i++;
}
auto visit = [&](PyObject* obj) { return func(obj, mask_values[i++] != 0); };
return internal::VisitIterable(sequence, visit);
} else {
return Status::TypeError("Object is not a sequence or iterable");
auto visit = [&](PyObject* obj) { return func(obj, false); };
return internal::VisitIterable(sequence, visit);
}

return Status::OK();
}

template <int ITEM_TYPE, typename ArrowType>
Expand Down Expand Up @@ -1473,7 +1424,7 @@ Status NumPyConverter::ConvertLists(const std::shared_ptr<DataType>& type,
}
};

return LoopPySequence(list, foreach_item);
return internal::VisitIterable(list, foreach_item);
}
default: {
std::stringstream ss;
Expand Down
17 changes: 6 additions & 11 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

#include "arrow/python/common.h"
#include "arrow/python/helpers.h"
#include "arrow/python/iterators.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/platform.h"
#include "arrow/python/util/datetime.h"
Expand Down Expand Up @@ -577,17 +578,11 @@ Status SerializeSequences(PyObject* context, std::vector<PyObject*> sequences,
SequenceBuilder builder(nullptr);
std::vector<PyObject*> sublists, subtuples, subdicts, subsets;
for (const auto& sequence : sequences) {
OwnedRef iterator(PyObject_GetIter(sequence));
RETURN_IF_PYERROR();
OwnedRef item;
while (true) {
item.reset(PyIter_Next(iterator.obj()));
if (!item.obj()) {
break;
}
RETURN_NOT_OK(Append(context, item.obj(), &builder, &sublists, &subtuples,
&subdicts, &subsets, blobs_out));
}
auto visit = [&](PyObject* obj) {
return Append(context, obj, &builder, &sublists, &subtuples, &subdicts, &subsets,
blobs_out);
};
RETURN_NOT_OK(internal::VisitIterable(sequence, visit));
}
std::shared_ptr<Array> list;
if (sublists.size() > 0) {
Expand Down
Loading