From 5c0fa0b57d190249d0e33e0dc0141ad2b1c98062 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 24 Apr 2017 23:37:16 -0700 Subject: [PATCH 01/29] Start adding iterable support --- cpp/src/arrow/python/builtin_convert.cc | 47 +++++++++++++++++++------ python/pyarrow/_array.pyx | 2 +- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 137937c0946..3064c9abb6e 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -231,11 +231,24 @@ class SeqVisitor { }; Status InferArrowSize(PyObject* obj, int64_t* size) { - *size = static_cast(PySequence_Size(obj)); + if (PySequence_Check(obj)) { + *size = static_cast(PySequence_Size(obj)); + } else if (PyObject_HasAttrString(obj, "__iter__")) { + PyObject* iter = PyObject_GetIter(obj); + *size = 0; + PyObject* item; + while (item = PyIter_Next(iterator)) { + *size += 1; + Py_DECREF(item); + } + Py_DECREF(iter); + } else { + return Status::TypeError("Object is not a sequence or iterable"); + } if (PyErr_Occurred()) { // Not a sequence PyErr_Clear(); - return Status::TypeError("Object is not a sequence"); + return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } @@ -287,21 +300,35 @@ class TypedConverter : public SeqConverter { BuilderType* typed_builder_; }; -class BoolConverter : public TypedConverter { +template +class TypedConverterVisitor : public TypedConverter { public: Status AppendData(PyObject* seq) override { int64_t size = static_cast(PySequence_Size(seq)); RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); - if (item.obj() == Py_None) { - typed_builder_->AppendNull(); + status = appendItem(item); + if (status != Status::OK()) { + return status; + } + } + } + + appendItem(OwnedRef &item); +}; + +class BoolConverter : public TypedConverterVisitor { + public: + Status appendItem(OwnedRef &item) override { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + if (item.obj() == Py_True) { + typed_builder_->Append(true); } else { - if (item.obj() == Py_True) { - typed_builder_->Append(true); - } else { - typed_builder_->Append(false); - } + typed_builder_->Append(false); } } return Status::OK(); diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index 1c571ba153d..be6c2e1d16f 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -914,7 +914,7 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None): Parameters ---------- - sequence : sequence-like object of Python objects + sequence : sequence or iterable-like object of Python objects type : pyarrow.DataType, optional If not passed, will be inferred from the data memory_pool : pyarrow.MemoryPool, optional From 77c935b9ac289b400f3df5d8b74ce06bc5a866d9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 01:17:28 -0700 Subject: [PATCH 02/29] Revert accidently java change --- .../java/org/apache/arrow/vector/stream/MessageSerializer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java index ec7e0f2ffb1..227423dfeb2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java @@ -33,7 +33,6 @@ import org.apache.arrow.flatbuf.MetadataVersion; import org.apache.arrow.flatbuf.RecordBatch; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.file.ArrowBlock; import org.apache.arrow.vector.file.ReadChannel; import org.apache.arrow.vector.file.WriteChannel; import org.apache.arrow.vector.schema.ArrowBuffer; From 76e08ca566b4dc24e1879750060e24ecc044572b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 01:17:42 -0700 Subject: [PATCH 03/29] Part of the way along adding iterable support --- cpp/src/arrow/python/builtin_convert.cc | 59 +++++++++++-------------- cpp/src/arrow/python/builtin_convert.h | 3 +- cpp/src/arrow/python/pandas_convert.cc | 4 +- 3 files changed, 29 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 3064c9abb6e..c1d6f226eba 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -237,7 +237,7 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { PyObject* iter = PyObject_GetIter(obj); *size = 0; PyObject* item; - while (item = PyIter_Next(iterator)) { + while ((item = PyIter_Next(iter))) { *size += 1; Py_DECREF(item); } @@ -281,7 +281,7 @@ class SeqConverter { return Status::OK(); } - virtual Status AppendData(PyObject* seq) = 0; + virtual Status AppendData(PyObject* seq, int64_t size) = 0; protected: std::shared_ptr builder_; @@ -303,25 +303,21 @@ class TypedConverter : public SeqConverter { template class TypedConverterVisitor : public TypedConverter { public: - Status AppendData(PyObject* seq) override { - int64_t size = static_cast(PySequence_Size(seq)); - RETURN_NOT_OK(typed_builder_->Reserve(size)); + Status AppendData(PyObject* seq, int64_t size) override { + RETURN_NOT_OK(this->typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); - status = appendItem(item); - if (status != Status::OK()) { - return status; - } + RETURN_NOT_OK(appendItem(item)); } + return Status::OK(); } - appendItem(OwnedRef &item); + virtual Status appendItem(OwnedRef &item) = 0; }; class BoolConverter : public TypedConverterVisitor { public: - Status appendItem(OwnedRef &item) override { - OwnedRef item(PySequence_GetItem(seq, i)); + inline Status appendItem(OwnedRef &item) override final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -337,9 +333,8 @@ class BoolConverter : public TypedConverterVisitor { class Int64Converter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { + Status AppendData(PyObject* seq, int64_t size) override { int64_t val; - int64_t size = static_cast(PySequence_Size(seq)); RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); @@ -357,8 +352,7 @@ class Int64Converter : public TypedConverter { class DateConverter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { - int64_t size = static_cast(PySequence_Size(seq)); + Status AppendData(PyObject* seq, int64_t size) override { RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); @@ -375,8 +369,7 @@ class DateConverter : public TypedConverter { class TimestampConverter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { - int64_t size = static_cast(PySequence_Size(seq)); + Status AppendData(PyObject* seq, int64_t size) override { RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); @@ -409,9 +402,8 @@ class TimestampConverter : public TypedConverter { class DoubleConverter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { + Status AppendData(PyObject* seq, int64_t size) override { double val; - int64_t size = static_cast(PySequence_Size(seq)); RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); @@ -429,13 +421,12 @@ class DoubleConverter : public TypedConverter { class BytesConverter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { + Status AppendData(PyObject* seq, int64_t size) override { PyObject* item; PyObject* bytes_obj; OwnedRef tmp; const char* bytes; Py_ssize_t length; - Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { item = PySequence_GetItem(seq, i); OwnedRef holder(item); @@ -463,13 +454,12 @@ class BytesConverter : public TypedConverter { class FixedWidthBytesConverter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { + Status AppendData(PyObject* seq, int64_t size) override { PyObject* item; PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = std::dynamic_pointer_cast( typed_builder_->type())->byte_width(); - Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { item = PySequence_GetItem(seq, i); OwnedRef holder(item); @@ -497,13 +487,12 @@ class FixedWidthBytesConverter : public TypedConverter { class UTF8Converter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { + Status AppendData(PyObject* seq, int64_t size) override { PyObject* item; PyObject* bytes_obj; OwnedRef tmp; const char* bytes; Py_ssize_t length; - Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { item = PySequence_GetItem(seq, i); OwnedRef holder(item); @@ -531,15 +520,17 @@ class ListConverter : public TypedConverter { public: Status Init(const std::shared_ptr& builder) override; - Status AppendData(PyObject* seq) override { - Py_ssize_t size = PySequence_Size(seq); + Status AppendData(PyObject* seq, int64_t size) override { for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { typed_builder_->Append(); - RETURN_NOT_OK(value_converter_->AppendData(item.obj())); + PyObject* item_obj = item.obj(); + int64_t list_size = + static_cast(PySequence_Size(item_obj)); + RETURN_NOT_OK(value_converter_->AppendData(item_obj, list_size)); } } return Status::OK(); @@ -559,9 +550,8 @@ class ListConverter : public TypedConverter { class DecimalConverter : public TypedConverter { public: - Status AppendData(PyObject* seq) override { + Status AppendData(PyObject* seq, int64_t size) override { /// Ensure we've allocated enough space - Py_ssize_t size = PySequence_Size(seq); RETURN_NOT_OK(typed_builder_->Reserve(size)); /// Can the compiler figure out that the case statement below isn't necessary @@ -642,7 +632,8 @@ Status ListConverter::Init(const std::shared_ptr& builder) { } Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, - const std::shared_ptr& builder) { + const std::shared_ptr& builder, + int64_t size) { PyDateTime_IMPORT; std::shared_ptr converter = GetConverter(type); if (converter == nullptr) { @@ -652,7 +643,7 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, } converter->Init(builder); - return converter->AppendData(obj); + return converter->AppendData(obj, size); } Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr* out) { @@ -673,7 +664,7 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr // Give the sequence converter an array builder std::shared_ptr builder; RETURN_NOT_OK(MakeBuilder(pool, type, &builder)); - RETURN_NOT_OK(AppendPySequence(obj, type, builder)); + RETURN_NOT_OK(AppendPySequence(obj, type, builder, size)); return builder->Finish(out); } diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h index a6180d496a9..7f42c334cd7 100644 --- a/cpp/src/arrow/python/builtin_convert.h +++ b/cpp/src/arrow/python/builtin_convert.h @@ -44,7 +44,8 @@ ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size); ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, - const std::shared_ptr& builder); + const std::shared_ptr& builder, + int64_t size); // Type and size inference ARROW_EXPORT diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 636a3fd15c0..5523b102108 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -873,7 +873,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr ss << inferred_type->ToString() << " cannot be converted to " << type->ToString(); return Status::TypeError(ss.str()); } - RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder)); + RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder, size)); } else { return Status::TypeError("Unsupported Python type for list items"); } @@ -922,7 +922,7 @@ inline Status PandasConverter::ConvertTypedLists( ss << inferred_type->ToString() << " cannot be converted to STRING."; return Status::TypeError(ss.str()); } - RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder)); + RETURN_NOT_OK(AppendPySequence(objects[i], inferred_type, value_builder, size)); } else { return Status::TypeError("Unsupported Python type for list items"); } From 15cdfe347c6720dea83cced97034f9cdb984c41b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 01:23:19 -0700 Subject: [PATCH 04/29] Move more of the convertors to the visitor version --- cpp/src/arrow/python/builtin_convert.cc | 38 ++++++++++--------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index c1d6f226eba..e86284b47fe 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -331,37 +331,29 @@ class BoolConverter : public TypedConverterVisitor { } }; -class Int64Converter : public TypedConverter { +class Int64Converter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { + inline Status appendItem(OwnedRef &item) override final { int64_t val; - RETURN_NOT_OK(typed_builder_->Reserve(size)); - for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(seq, i)); - if (item.obj() == Py_None) { - typed_builder_->AppendNull(); - } else { - val = static_cast(PyLong_AsLongLong(item.obj())); - RETURN_IF_PYERROR(); - typed_builder_->Append(val); - } + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + val = static_cast(PyLong_AsLongLong(item.obj())); + RETURN_IF_PYERROR(); + typed_builder_->Append(val); } return Status::OK(); } }; -class DateConverter : public TypedConverter { +class DateConverter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { - RETURN_NOT_OK(typed_builder_->Reserve(size)); - for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(seq, i)); - if (item.obj() == Py_None) { - typed_builder_->AppendNull(); - } else { - PyDateTime_Date* pydate = reinterpret_cast(item.obj()); - typed_builder_->Append(PyDate_to_ms(pydate)); - } + inline Status appendItem(OwnedRef &item) override final { + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + PyDateTime_Date* pydate = reinterpret_cast(item.obj()); + typed_builder_->Append(PyDate_to_ms(pydate)); } return Status::OK(); } From a1bf4bd13208a9433c9b3c46675767a9d34ab152 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 09:51:56 -0700 Subject: [PATCH 05/29] Move over timestamp and byte converters --- cpp/src/arrow/python/builtin_convert.cc | 151 +++++++++++------------- 1 file changed, 66 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index e86284b47fe..e11e0522b26 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -359,120 +359,101 @@ class DateConverter : public TypedConverterVisitor { } }; -class TimestampConverter : public TypedConverter { +class TimestampConverter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { - RETURN_NOT_OK(typed_builder_->Reserve(size)); - for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(seq, i)); - if (item.obj() == Py_None) { - typed_builder_->AppendNull(); - } else { - PyDateTime_DateTime* pydatetime = - reinterpret_cast(item.obj()); - struct tm datetime = {0}; - datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900; - datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1; - datetime.tm_mday = PyDateTime_GET_DAY(pydatetime); - datetime.tm_hour = PyDateTime_DATE_GET_HOUR(pydatetime); - datetime.tm_min = PyDateTime_DATE_GET_MINUTE(pydatetime); - datetime.tm_sec = PyDateTime_DATE_GET_SECOND(pydatetime); - int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); - RETURN_IF_PYERROR(); - struct tm epoch = {0}; - epoch.tm_year = 70; - epoch.tm_mday = 1; - // Microseconds since the epoch - int64_t val = static_cast( - lrint(difftime(mktime(&datetime), mktime(&epoch))) * 1000000 + us); - typed_builder_->Append(val); - } + inline Status appendItem(OwnedRef &item) override final { + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + PyDateTime_DateTime* pydatetime = + reinterpret_cast(item.obj()); + struct tm datetime = {0}; + datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900; + datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1; + datetime.tm_mday = PyDateTime_GET_DAY(pydatetime); + datetime.tm_hour = PyDateTime_DATE_GET_HOUR(pydatetime); + datetime.tm_min = PyDateTime_DATE_GET_MINUTE(pydatetime); + datetime.tm_sec = PyDateTime_DATE_GET_SECOND(pydatetime); + int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); + RETURN_IF_PYERROR(); + struct tm epoch = {0}; + epoch.tm_year = 70; + epoch.tm_mday = 1; + // Microseconds since the epoch + int64_t val = static_cast( + lrint(difftime(mktime(&datetime), mktime(&epoch))) * 1000000 + us); + typed_builder_->Append(val); } return Status::OK(); } }; -class DoubleConverter : public TypedConverter { +class DoubleConverter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { + inline Status appendItem(OwnedRef &item) override final { double val; - RETURN_NOT_OK(typed_builder_->Reserve(size)); - for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(seq, i)); - if (item.obj() == Py_None) { - typed_builder_->AppendNull(); - } else { - val = PyFloat_AsDouble(item.obj()); - RETURN_IF_PYERROR(); - typed_builder_->Append(val); - } + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + val = PyFloat_AsDouble(item.obj()); + RETURN_IF_PYERROR(); + typed_builder_->Append(val); } return Status::OK(); } }; -class BytesConverter : public TypedConverter { +class BytesConverter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { - PyObject* item; + inline Status appendItem(OwnedRef &item) override final { PyObject* bytes_obj; - OwnedRef tmp; const char* bytes; Py_ssize_t length; - for (int64_t i = 0; i < size; ++i) { - item = PySequence_GetItem(seq, i); - OwnedRef holder(item); + OwnedRef tmp; - if (item == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - continue; - } else if (PyUnicode_Check(item)) { - tmp.reset(PyUnicode_AsUTF8String(item)); - RETURN_IF_PYERROR(); - bytes_obj = tmp.obj(); - } else if (PyBytes_Check(item)) { - bytes_obj = item; - } else { - return InvalidConversion(item, "bytes"); - } - // No error checking - length = PyBytes_GET_SIZE(bytes_obj); - bytes = PyBytes_AS_STRING(bytes_obj); - RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast(length))); + if (item.obj() == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + return Status::OK(); + } else if (PyUnicode_Check(item.obj())) { + tmp.reset(PyUnicode_AsUTF8String(item.obj())); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + } else if (PyBytes_Check(item.obj())) { + bytes_obj = item.obj(); + } else { + return InvalidConversion(item.obj(), "bytes"); } + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast(length))); return Status::OK(); } }; -class FixedWidthBytesConverter : public TypedConverter { +class FixedWidthBytesConverter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { - PyObject* item; + inline Status appendItem(OwnedRef &item) override final { PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = std::dynamic_pointer_cast( typed_builder_->type())->byte_width(); - for (int64_t i = 0; i < size; ++i) { - item = PySequence_GetItem(seq, i); - OwnedRef holder(item); - - if (item == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - continue; - } else if (PyUnicode_Check(item)) { - tmp.reset(PyUnicode_AsUTF8String(item)); - RETURN_IF_PYERROR(); - bytes_obj = tmp.obj(); - } else if (PyBytes_Check(item)) { - bytes_obj = item; - } else { - return InvalidConversion(item, "bytes"); - } - // No error checking - RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length)); - RETURN_NOT_OK(typed_builder_->Append( - reinterpret_cast(PyBytes_AS_STRING(bytes_obj)))); + if (item.obj() == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + return Status::OK(); + } else if (PyUnicode_Check(item.obj())) { + tmp.reset(PyUnicode_AsUTF8String(item.obj())); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + } else if (PyBytes_Check(item.obj())) { + bytes_obj = item.obj(); + } else { + return InvalidConversion(item.obj(), "bytes"); } + // No error checking + RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length)); + RETURN_NOT_OK(typed_builder_->Append( + reinterpret_cast(PyBytes_AS_STRING(bytes_obj)))); return Status::OK(); } }; From 48b08aa5d10ab5b80bb6ae264cbf4a9f46b5c3cb Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 09:59:10 -0700 Subject: [PATCH 06/29] Switch remaining converters --- cpp/src/arrow/python/builtin_convert.cc | 96 +++++++++++-------------- 1 file changed, 40 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index e11e0522b26..9057f5dcf03 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -304,7 +304,9 @@ template class TypedConverterVisitor : public TypedConverter { public: Status AppendData(PyObject* seq, int64_t size) override { + /// Ensure we've allocated enough space RETURN_NOT_OK(this->typed_builder_->Reserve(size)); + // Iterate over the items adding each one for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); RETURN_NOT_OK(appendItem(item)); @@ -458,53 +460,45 @@ class FixedWidthBytesConverter : public TypedConverterVisitor { +class UTF8Converter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { - PyObject* item; + inline Status appendItem(OwnedRef &item) override final { PyObject* bytes_obj; OwnedRef tmp; const char* bytes; Py_ssize_t length; - for (int64_t i = 0; i < size; ++i) { - item = PySequence_GetItem(seq, i); - OwnedRef holder(item); - - if (item == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - continue; - } else if (!PyUnicode_Check(item)) { - return Status::Invalid("Non-unicode value encountered"); - } - tmp.reset(PyUnicode_AsUTF8String(item)); - RETURN_IF_PYERROR(); - bytes_obj = tmp.obj(); - // No error checking - length = PyBytes_GET_SIZE(bytes_obj); - bytes = PyBytes_AS_STRING(bytes_obj); - RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast(length))); + if (item.obj() == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + return Status::OK(); + } else if (!PyUnicode_Check(item.obj())) { + return Status::Invalid("Non-unicode value encountered"); } + tmp.reset(PyUnicode_AsUTF8String(item.obj())); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast(length))); return Status::OK(); } }; -class ListConverter : public TypedConverter { +class ListConverter : public TypedConverterVisitor { public: Status Init(const std::shared_ptr& builder) override; - Status AppendData(PyObject* seq, int64_t size) override { - for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(seq, i)); - if (item.obj() == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - } else { - typed_builder_->Append(); - PyObject* item_obj = item.obj(); - int64_t list_size = - static_cast(PySequence_Size(item_obj)); - RETURN_NOT_OK(value_converter_->AppendData(item_obj, list_size)); - } + inline Status appendItem(OwnedRef &item) override final { + if (item.obj() == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + } else { + typed_builder_->Append(); + PyObject* item_obj = item.obj(); + int64_t list_size = + static_cast(PySequence_Size(item_obj)); + RETURN_NOT_OK(value_converter_->AppendData(item_obj, list_size)); } return Status::OK(); } @@ -521,37 +515,27 @@ class ListConverter : public TypedConverter { break; \ } -class DecimalConverter : public TypedConverter { +class DecimalConverter : public TypedConverterVisitor { public: - Status AppendData(PyObject* seq, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - + inline Status appendItem(OwnedRef &item) override final { /// Can the compiler figure out that the case statement below isn't necessary /// once we're running? const int bit_width = std::dynamic_pointer_cast(typed_builder_->type()) ->bit_width(); - OwnedRef ref; - PyObject* item = nullptr; - for (int64_t i = 0; i < size; ++i) { - ref.reset(PySequence_GetItem(seq, i)); - item = ref.obj(); - - /// TODO(phillipc): Check for nan? - if (item != Py_None) { - switch (bit_width) { - DECIMAL_CONVERT_CASE(32, item, typed_builder_) - DECIMAL_CONVERT_CASE(64, item, typed_builder_) - DECIMAL_CONVERT_CASE(128, item, typed_builder_) - default: - break; - } - RETURN_IF_PYERROR(); - } else { - RETURN_NOT_OK(typed_builder_->AppendNull()); + /// TODO(phillipc): Check for nan? + if (item.obj() != Py_None) { + switch (bit_width) { + DECIMAL_CONVERT_CASE(32, item.obj(), typed_builder_) + DECIMAL_CONVERT_CASE(64, item.obj(), typed_builder_) + DECIMAL_CONVERT_CASE(128, item.obj(), typed_builder_) + default: + return Status::OK(); } + RETURN_IF_PYERROR(); + } else { + RETURN_NOT_OK(typed_builder_->AppendNull()); } return Status::OK(); From b6c72f5c07b5adce4bf82b29a2cc6e9f2fbe78db Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 10:06:34 -0700 Subject: [PATCH 07/29] Make TypedConverterVisitor work on PySequence or Python Iterators --- cpp/src/arrow/python/builtin_convert.cc | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 9057f5dcf03..04f4e674ac2 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -303,13 +303,25 @@ class TypedConverter : public SeqConverter { template class TypedConverterVisitor : public TypedConverter { public: - Status AppendData(PyObject* seq, int64_t size) override { + Status AppendData(PyObject* obj, int64_t size) override { /// Ensure we've allocated enough space RETURN_NOT_OK(this->typed_builder_->Reserve(size)); // Iterate over the items adding each one - for (int64_t i = 0; i < size; ++i) { - OwnedRef item(PySequence_GetItem(seq, i)); - RETURN_NOT_OK(appendItem(item)); + if (PySequence_Check(obj)) { + for (int64_t i = 0; i < size; ++i) { + OwnedRef ref(PySequence_GetItem(obj, i)); + RETURN_NOT_OK(appendItem(ref)); + } + } else if (PyObject_HasAttrString(obj, "__iter__")) { + PyObject* iter = PyObject_GetIter(obj); + PyObject* item; + while ((item = PyIter_Next(iter))) { + OwnedRef ref(item); + RETURN_NOT_OK(appendItem(ref)); + } + Py_DECREF(iter); + } else { + return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } From ca0d5303dd586b7fa8353f4196018e4e9aa95dba Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 10:25:19 -0700 Subject: [PATCH 08/29] In theory this works ok now for iterables as well --- cpp/src/arrow/python/builtin_convert.cc | 60 ++++++++++++++++--------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 04f4e674ac2..bebc44056e2 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -151,30 +151,28 @@ class SeqVisitor { memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); } + // co-recursive with VisitElem Status Visit(PyObject* obj, int level = 0) { - Py_ssize_t size = PySequence_Size(obj); if (level > max_nesting_level_) { max_nesting_level_ = level; } - for (int64_t i = 0; i < size; ++i) { - // TODO(wesm): Error checking? - // TODO(wesm): Specialize for PyList_GET_ITEM? - OwnedRef item_ref(PySequence_GetItem(obj, i)); - PyObject* item = item_ref.obj(); - - if (PyList_Check(item)) { - RETURN_NOT_OK(Visit(item, level + 1)); - } else if (PyDict_Check(item)) { - return Status::NotImplemented("No type inference for dicts"); - } else { - // We permit nulls at any level of nesting - if (item == Py_None) { - // TODO - } else { - ++nesting_histogram_[level]; - scalars_.Visit(item); - } + // Loop through either a sequence or an iterator. + // VisitElemt takes ownership of the object we are visiting. + if (PySequence_Check(obj)) { + Py_ssize_t size = PySequence_Size(obj); + for (int64_t i = 0; i < size; ++i) { + // TODO(wesm): Specialize for PyList_GET_ITEM? + RETURN_NOT_OK(VisitElem(PySequence_GetItem(obj, i), level)); + } + } else if (PyObject_HasAttrString(obj, "__iter__")) { + PyObject* iter = PyObject_GetIter(obj); + PyObject* item; + while ((item = PyIter_Next(iter))) { + RETURN_NOT_OK(VisitElem(item, level)); } + Py_DECREF(iter); + } else { + return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } @@ -228,6 +226,28 @@ class SeqVisitor { // Track observed int max_nesting_level_; int nesting_histogram_[MAX_NESTING_LEVELS]; + + // Visits a specific element (inner part of the loop) + Status VisitElem(PyObject* item, int level) { + // TODO(wesm): Error checking? + OwnedRef item_ref(item); + + if (PyList_Check(item)) { + RETURN_NOT_OK(Visit(item, level + 1)); + } else if (PyDict_Check(item)) { + return Status::NotImplemented("No type inference for dicts"); + } else { + // We permit nulls at any level of nesting + if (item == Py_None) { + // TODO + } else { + ++nesting_histogram_[level]; + scalars_.Visit(item); + } + } + return Status::OK(); + } + }; Status InferArrowSize(PyObject* obj, int64_t* size) { @@ -263,7 +283,7 @@ Status InferArrowTypeAndSize( PyDateTime_IMPORT; SeqVisitor seq_visitor; - RETURN_NOT_OK(seq_visitor.Visit(obj)); + RETURN_NOT_OK(seq_visitor.Visit(obj, *size)); RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); From 82ec3c3da23b79096d99ed95c56369ec4fd69c86 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Apr 2017 16:58:54 -0700 Subject: [PATCH 09/29] revert changes to _array.pyx --- python/pyarrow/_array.pyx | 2 +- python/pyarrow/tests/test_convert_builtin.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index be6c2e1d16f..1c571ba153d 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -914,7 +914,7 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None): Parameters ---------- - sequence : sequence or iterable-like object of Python objects + sequence : sequence-like object of Python objects type : pyarrow.DataType, optional If not passed, will be inferred from the data memory_pool : pyarrow.MemoryPool, optional diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index d25055d8280..33bd6db2c5e 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -22,6 +22,21 @@ import datetime import decimal +class TestConvertIterable(unittest.TestCase): + + def test_iterable_types(self): + arr1 = pa.array(range(3)) + arr2 = pa.array((1, 2, 3)) + + assert arr1.equals(arr2) + + def test_empty_iterable(self): + arr = pa.array(range(0)) + assert len(arr) == 0 + assert arr.null_count == 0 + assert arr.type == pa.null() + assert arr.to_pylist() == [] + class TestConvertSequence(unittest.TestCase): From 63c0b7fa591a9e7c9b9c3c2b5c72b1016c4545d6 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 01:43:23 -0700 Subject: [PATCH 10/29] Tests pass (TODO cleanup debugging) --- cpp/src/arrow/python/builtin_convert.cc | 32 ++++++++++++++++---- python/pyarrow/tests/test_convert_builtin.py | 25 +++++++++++++-- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index bebc44056e2..4e83ffbc51b 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -91,6 +91,7 @@ class ScalarVisitor { } else if (PyFloat_Check(obj)) { ++float_count_; } else if (IsPyInteger(obj)) { + printf("PyInteger during visit...\n"); ++int_count_; } else if (PyDate_CheckExact(obj)) { ++date_count_; @@ -101,12 +102,14 @@ class ScalarVisitor { } else if (PyUnicode_Check(obj)) { ++unicode_count_; } else { + printf("Error?...\n"); // TODO(wesm): accumulate error information somewhere } } std::shared_ptr GetType() { // TODO(wesm): handling mixed-type cases + printf("getting type from scalar our int counts is %d\n", int_count_); if (float_count_) { return float64(); } else if (int_count_) { @@ -159,8 +162,10 @@ class SeqVisitor { // Loop through either a sequence or an iterator. // VisitElemt takes ownership of the object we are visiting. if (PySequence_Check(obj)) { + printf("Visitng sequence style...\n"); Py_ssize_t size = PySequence_Size(obj); for (int64_t i = 0; i < size; ++i) { + printf("Visiting elem %d\n", i); // TODO(wesm): Specialize for PyList_GET_ITEM? RETURN_NOT_OK(VisitElem(PySequence_GetItem(obj, i), level)); } @@ -172,7 +177,7 @@ class SeqVisitor { } Py_DECREF(iter); } else { - return Status::TypeError("Object is not a sequence or iterable"); + return Status::TypeError("Object is not a sequence or iterable 1"); } return Status::OK(); } @@ -233,6 +238,7 @@ class SeqVisitor { OwnedRef item_ref(item); if (PyList_Check(item)) { + printf("recursing level pylist level...\n"); RETURN_NOT_OK(Visit(item, level + 1)); } else if (PyDict_Check(item)) { return Status::NotImplemented("No type inference for dicts"); @@ -263,12 +269,12 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { } Py_DECREF(iter); } else { - return Status::TypeError("Object is not a sequence or iterable"); + return Status::TypeError("Object is not a sequence or iterable 2"); } if (PyErr_Occurred()) { // Not a sequence PyErr_Clear(); - return Status::TypeError("Object is not a sequence or iterable"); + return Status::TypeError("Object is not a sequence or iterable 3"); } return Status::OK(); } @@ -276,6 +282,10 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { // Non-exhaustive type inference Status InferArrowTypeAndSize( PyObject* obj, int64_t* size, std::shared_ptr* out_type) { + printf("Infering type and size "); + PyObject_Print(obj, stdout, 0); + printf("\n"); + RETURN_NOT_OK(InferArrowSize(obj, size)); // For 0-length sequences, refuse to guess @@ -283,7 +293,7 @@ Status InferArrowTypeAndSize( PyDateTime_IMPORT; SeqVisitor seq_visitor; - RETURN_NOT_OK(seq_visitor.Visit(obj, *size)); + RETURN_NOT_OK(seq_visitor.Visit(obj)); RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); @@ -324,6 +334,9 @@ template class TypedConverterVisitor : public TypedConverter { public: Status AppendData(PyObject* obj, int64_t size) override { + printf("Appending data..."); + PyObject_Print(obj, stdout, 0); + printf("\n"); /// Ensure we've allocated enough space RETURN_NOT_OK(this->typed_builder_->Reserve(size)); // Iterate over the items adding each one @@ -332,7 +345,7 @@ class TypedConverterVisitor : public TypedConverter { OwnedRef ref(PySequence_GetItem(obj, i)); RETURN_NOT_OK(appendItem(ref)); } - } else if (PyObject_HasAttrString(obj, "__iter__")) { + } else if (PyObject_HasAttrString(obj, "__iter__")) { PyObject* iter = PyObject_GetIter(obj); PyObject* item; while ((item = PyIter_Next(iter))) { @@ -341,7 +354,10 @@ class TypedConverterVisitor : public TypedConverter { } Py_DECREF(iter); } else { - return Status::TypeError("Object is not a sequence or iterable"); + printf("non sequence or iterable? "); + PyObject_Print(obj, stdout, 0); + printf("\n"); + return Status::TypeError("Object is not a sequence or iterable 5a"); } return Status::OK(); } @@ -523,6 +539,9 @@ class ListConverter : public TypedConverterVisitor { Status Init(const std::shared_ptr& builder) override; inline Status appendItem(OwnedRef &item) override final { + printf("List converter...\n"); + PyObject_Print(item.obj(), stdout, 0); + printf("boop\n"); if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { @@ -624,6 +643,7 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, const std::shared_ptr& builder, int64_t size) { PyDateTime_IMPORT; + printf("fuck type id is %d int is %d and list is %d" , type->id(), Type::INT64, Type::LIST); std::shared_ptr converter = GetConverter(type); if (converter == nullptr) { std::stringstream ss; diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 33bd6db2c5e..7ede0400c4c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -22,16 +22,23 @@ import datetime import decimal +class StrangeIterable: + def __init__(self, lst): + self.lst = lst + + def __iter__(self): + return self.lst.__iter__() + class TestConvertIterable(unittest.TestCase): def test_iterable_types(self): - arr1 = pa.array(range(3)) - arr2 = pa.array((1, 2, 3)) + arr1 = pa.array(StrangeIterable([0, 1, 2, 3])) + arr2 = pa.array((0, 1, 2, 3)) assert arr1.equals(arr2) def test_empty_iterable(self): - arr = pa.array(range(0)) + arr = pa.array(StrangeIterable([])) assert len(arr) == 0 assert arr.null_count == 0 assert arr.type == pa.null() @@ -223,3 +230,15 @@ def test_decimal_large_integer(self): type = pa.decimal(precision=23, scale=5) arr = pa.array(data, type=type) assert arr.to_pylist() == data + + def test_range_types(self): + arr1 = pa.array(range(3)) + arr2 = pa.array((0, 1, 2)) + assert arr1.equals(arr2) + + def test_empty_range(self): + arr = pa.array(range(0)) + assert len(arr) == 0 + assert arr.null_count == 0 + assert arr.type == pa.null() + assert arr.to_pylist() == [] From 80cc971e6c4a88d7951f4f494c81cdcca262d7ae Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 01:44:55 -0700 Subject: [PATCH 11/29] Cleanup debugging --- cpp/src/arrow/python/builtin_convert.cc | 27 ++++--------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 4e83ffbc51b..5a537ea6bf8 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -91,7 +91,6 @@ class ScalarVisitor { } else if (PyFloat_Check(obj)) { ++float_count_; } else if (IsPyInteger(obj)) { - printf("PyInteger during visit...\n"); ++int_count_; } else if (PyDate_CheckExact(obj)) { ++date_count_; @@ -102,14 +101,12 @@ class ScalarVisitor { } else if (PyUnicode_Check(obj)) { ++unicode_count_; } else { - printf("Error?...\n"); // TODO(wesm): accumulate error information somewhere } } std::shared_ptr GetType() { // TODO(wesm): handling mixed-type cases - printf("getting type from scalar our int counts is %d\n", int_count_); if (float_count_) { return float64(); } else if (int_count_) { @@ -162,10 +159,8 @@ class SeqVisitor { // Loop through either a sequence or an iterator. // VisitElemt takes ownership of the object we are visiting. if (PySequence_Check(obj)) { - printf("Visitng sequence style...\n"); Py_ssize_t size = PySequence_Size(obj); for (int64_t i = 0; i < size; ++i) { - printf("Visiting elem %d\n", i); // TODO(wesm): Specialize for PyList_GET_ITEM? RETURN_NOT_OK(VisitElem(PySequence_GetItem(obj, i), level)); } @@ -177,7 +172,7 @@ class SeqVisitor { } Py_DECREF(iter); } else { - return Status::TypeError("Object is not a sequence or iterable 1"); + return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } @@ -238,7 +233,6 @@ class SeqVisitor { OwnedRef item_ref(item); if (PyList_Check(item)) { - printf("recursing level pylist level...\n"); RETURN_NOT_OK(Visit(item, level + 1)); } else if (PyDict_Check(item)) { return Status::NotImplemented("No type inference for dicts"); @@ -269,12 +263,12 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { } Py_DECREF(iter); } else { - return Status::TypeError("Object is not a sequence or iterable 2"); + return Status::TypeError("Object is not a sequence or iterable"); } if (PyErr_Occurred()) { // Not a sequence PyErr_Clear(); - return Status::TypeError("Object is not a sequence or iterable 3"); + return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } @@ -282,9 +276,6 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { // Non-exhaustive type inference Status InferArrowTypeAndSize( PyObject* obj, int64_t* size, std::shared_ptr* out_type) { - printf("Infering type and size "); - PyObject_Print(obj, stdout, 0); - printf("\n"); RETURN_NOT_OK(InferArrowSize(obj, size)); @@ -334,9 +325,6 @@ template class TypedConverterVisitor : public TypedConverter { public: Status AppendData(PyObject* obj, int64_t size) override { - printf("Appending data..."); - PyObject_Print(obj, stdout, 0); - printf("\n"); /// Ensure we've allocated enough space RETURN_NOT_OK(this->typed_builder_->Reserve(size)); // Iterate over the items adding each one @@ -354,10 +342,7 @@ class TypedConverterVisitor : public TypedConverter { } Py_DECREF(iter); } else { - printf("non sequence or iterable? "); - PyObject_Print(obj, stdout, 0); - printf("\n"); - return Status::TypeError("Object is not a sequence or iterable 5a"); + return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } @@ -539,9 +524,6 @@ class ListConverter : public TypedConverterVisitor { Status Init(const std::shared_ptr& builder) override; inline Status appendItem(OwnedRef &item) override final { - printf("List converter...\n"); - PyObject_Print(item.obj(), stdout, 0); - printf("boop\n"); if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { @@ -643,7 +625,6 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, const std::shared_ptr& builder, int64_t size) { PyDateTime_IMPORT; - printf("fuck type id is %d int is %d and list is %d" , type->id(), Type::INT64, Type::LIST); std::shared_ptr converter = GetConverter(type); if (converter == nullptr) { std::stringstream ss; From 3a55e82494c6e1764ac4e4f499b9e029450f7845 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 01:48:56 -0700 Subject: [PATCH 12/29] Update array function description --- python/pyarrow/_array.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index 1c571ba153d..d0f72a75553 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -914,7 +914,7 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None): Parameters ---------- - sequence : sequence-like object of Python objects + sequence : sequence-like or iterable object of Python objects type : pyarrow.DataType, optional If not passed, will be inferred from the data memory_pool : pyarrow.MemoryPool, optional From be58bc0f54e2836707c812f0c9b4007c7316dd06 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 01:49:52 -0700 Subject: [PATCH 13/29] Restore ArrowBlock (unreleated change) --- .../java/org/apache/arrow/vector/stream/MessageSerializer.java | 1 + 1 file changed, 1 insertion(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java index 227423dfeb2..ec7e0f2ffb1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/stream/MessageSerializer.java @@ -33,6 +33,7 @@ import org.apache.arrow.flatbuf.MetadataVersion; import org.apache.arrow.flatbuf.RecordBatch; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.file.ArrowBlock; import org.apache.arrow.vector.file.ReadChannel; import org.apache.arrow.vector.file.WriteChannel; import org.apache.arrow.vector.schema.ArrowBuffer; From d392daa837e21f60ec409a51248605f417a24a6c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 02:01:55 -0700 Subject: [PATCH 14/29] Add limmited pure iterator support and a note --- python/pyarrow/_array.pyx | 23 ++++++++++++++------ python/pyarrow/includes/pyarrow.pxd | 4 ++++ python/pyarrow/tests/test_convert_builtin.py | 6 +++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index d0f72a75553..601abe0fe85 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -908,18 +908,20 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type, -def array(object sequence, DataType type=None, MemoryPool memory_pool=None): +def array(object sequence, DataType type=None, MemoryPool memory_pool=None, size=None): """ Create pyarrow.Array instance from a Python sequence Parameters ---------- - sequence : sequence-like or iterable object of Python objects + sequence : sequence-like or iterable object of Python objects. + If both type and size are specified may be a single use iterable. type : pyarrow.DataType, optional If not passed, will be inferred from the data memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool + size : Size of the elements. Returns ------- @@ -933,11 +935,18 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None): if type is None: check_status(pyarrow.ConvertPySequence(sequence, pool, &sp_array)) else: - check_status( - pyarrow.ConvertPySequence( - sequence, pool, &sp_array, type.sp_type - ) - ) + if size is None: + check_status( + pyarrow.ConvertPySequence( + sequence, pool, &sp_array, type.sp_type + ) + ) + else: + check_status( + pyarrow.ConvertPySequence( + sequence, pool, &sp_array, type.sp_type, size + ) + ) return box_array(sp_array) diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index c40df3db8a9..abba73ec4a3 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -33,6 +33,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus ConvertPySequence(object obj, CMemoryPool* pool, shared_ptr[CArray]* out, const shared_ptr[CDataType]& type) + CStatus ConvertPySequence(object obj, CMemoryPool* pool, + shared_ptr[CArray]* out, + const shared_ptr[CDataType]& type, + int64_t size) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 7ede0400c4c..5d350e3c54c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -45,6 +45,12 @@ def test_empty_iterable(self): assert arr.to_pylist() == [] +class TestLimitedConvertIterator(unittest.TestCase): + def test_iterator_types(self): + arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3) + arr2 = pa.array((0, 1, 2)) + assert arr1.equals(arr2) + class TestConvertSequence(unittest.TestCase): def test_sequence_types(self): From c429f9a5e49af76863ad27e5aec4a4c480407128 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 09:46:22 -0700 Subject: [PATCH 15/29] Style fixes --- cpp/src/arrow/python/builtin_convert.cc | 26 ++++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 5a537ea6bf8..ce1ea61dc34 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -153,7 +153,6 @@ class SeqVisitor { // co-recursive with VisitElem Status Visit(PyObject* obj, int level = 0) { - if (level > max_nesting_level_) { max_nesting_level_ = level; } // Loop through either a sequence or an iterator. @@ -247,7 +246,6 @@ class SeqVisitor { } return Status::OK(); } - }; Status InferArrowSize(PyObject* obj, int64_t* size) { @@ -349,10 +347,10 @@ class TypedConverterVisitor : public TypedConverter { virtual Status appendItem(OwnedRef &item) = 0; }; - + class BoolConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -368,7 +366,7 @@ class BoolConverter : public TypedConverterVisitor { class Int64Converter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { int64_t val; if (item.obj() == Py_None) { typed_builder_->AppendNull(); @@ -383,7 +381,7 @@ class Int64Converter : public TypedConverterVisitor { class DateConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -396,7 +394,7 @@ class DateConverter : public TypedConverterVisitor { class TimestampConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -425,7 +423,7 @@ class TimestampConverter : public TypedConverterVisitor { class DoubleConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { double val; if (item.obj() == Py_None) { typed_builder_->AppendNull(); @@ -440,7 +438,7 @@ class DoubleConverter : public TypedConverterVisitor { class BytesConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { PyObject* bytes_obj; const char* bytes; Py_ssize_t length; @@ -468,7 +466,7 @@ class BytesConverter : public TypedConverterVisitor { class FixedWidthBytesConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = std::dynamic_pointer_cast( @@ -495,7 +493,7 @@ class FixedWidthBytesConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { PyObject* bytes_obj; OwnedRef tmp; const char* bytes; @@ -510,7 +508,7 @@ class UTF8Converter : public TypedConverterVisitor { tmp.reset(PyUnicode_AsUTF8String(item.obj())); RETURN_IF_PYERROR(); bytes_obj = tmp.obj(); - + // No error checking length = PyBytes_GET_SIZE(bytes_obj); bytes = PyBytes_AS_STRING(bytes_obj); @@ -523,7 +521,7 @@ class ListConverter : public TypedConverterVisitor { public: Status Init(const std::shared_ptr& builder) override; - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { @@ -550,7 +548,7 @@ class ListConverter : public TypedConverterVisitor { class DecimalConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) override final { + inline Status appendItem(OwnedRef &item) final { /// Can the compiler figure out that the case statement below isn't necessary /// once we're running? const int bit_width = From 1d970bdb2e00f45c196723977ede02cff8840a24 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 23:09:38 -0700 Subject: [PATCH 16/29] Switch the SeqVisitor to use OwnedRef --- cpp/src/arrow/python/builtin_convert.cc | 27 +++++++++++-------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index ce1ea61dc34..223357180c9 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -156,20 +156,20 @@ class SeqVisitor { if (level > max_nesting_level_) { max_nesting_level_ = level; } // Loop through either a sequence or an iterator. - // VisitElemt takes ownership of the object we are visiting. if (PySequence_Check(obj)) { Py_ssize_t size = PySequence_Size(obj); for (int64_t i = 0; i < size; ++i) { // TODO(wesm): Specialize for PyList_GET_ITEM? - RETURN_NOT_OK(VisitElem(PySequence_GetItem(obj, i), level)); + OwnedRef ref = OwnedRef(PySequence_GetItem(obj, i)); + RETURN_NOT_OK(VisitElem(ref, level)); } } else if (PyObject_HasAttrString(obj, "__iter__")) { - PyObject* iter = PyObject_GetIter(obj); + OwnedRef iter = OwnedRef(PyObject_GetIter(obj)); PyObject* item; - while ((item = PyIter_Next(iter))) { - RETURN_NOT_OK(VisitElem(item, level)); + while ((item = PyIter_Next(iter.obj()))) { + OwnedRef ref = OwnedRef(item); + RETURN_NOT_OK(VisitElem(ref, level)); } - Py_DECREF(iter); } else { return Status::TypeError("Object is not a sequence or iterable"); } @@ -227,21 +227,18 @@ class SeqVisitor { int nesting_histogram_[MAX_NESTING_LEVELS]; // Visits a specific element (inner part of the loop) - Status VisitElem(PyObject* item, int level) { - // TODO(wesm): Error checking? - OwnedRef item_ref(item); - - if (PyList_Check(item)) { - RETURN_NOT_OK(Visit(item, level + 1)); - } else if (PyDict_Check(item)) { + Status VisitElem(OwnedRef &item_ref, int level) { + if (PyList_Check(item_ref.obj())) { + RETURN_NOT_OK(Visit(item_ref.obj(), level + 1)); + } else if (PyDict_Check(item_ref.obj())) { return Status::NotImplemented("No type inference for dicts"); } else { // We permit nulls at any level of nesting - if (item == Py_None) { + if (item_ref.obj() == Py_None) { // TODO } else { ++nesting_histogram_[level]; - scalars_.Visit(item); + scalars_.Visit(item_ref.obj()); } } return Status::OK(); From 52b03e3e8aab4b3fdead6e1dd0cd1998c634f9c8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 23:12:43 -0700 Subject: [PATCH 17/29] Use a const ownedref --- cpp/src/arrow/python/builtin_convert.cc | 26 +++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 223357180c9..626542cf621 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -227,7 +227,7 @@ class SeqVisitor { int nesting_histogram_[MAX_NESTING_LEVELS]; // Visits a specific element (inner part of the loop) - Status VisitElem(OwnedRef &item_ref, int level) { + Status VisitElem(const OwnedRef &item_ref, int level) { if (PyList_Check(item_ref.obj())) { RETURN_NOT_OK(Visit(item_ref.obj(), level + 1)); } else if (PyDict_Check(item_ref.obj())) { @@ -299,6 +299,8 @@ class SeqConverter { virtual Status AppendData(PyObject* seq, int64_t size) = 0; + virtual ~SeqConverter() {} + protected: std::shared_ptr builder_; }; @@ -342,12 +344,12 @@ class TypedConverterVisitor : public TypedConverter { return Status::OK(); } - virtual Status appendItem(OwnedRef &item) = 0; + virtual Status AppendItem(const OwnedRef& item) = 0; }; class BoolConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -363,7 +365,7 @@ class BoolConverter : public TypedConverterVisitor { class Int64Converter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { int64_t val; if (item.obj() == Py_None) { typed_builder_->AppendNull(); @@ -378,7 +380,7 @@ class Int64Converter : public TypedConverterVisitor { class DateConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -391,7 +393,7 @@ class DateConverter : public TypedConverterVisitor { class TimestampConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -420,7 +422,7 @@ class TimestampConverter : public TypedConverterVisitor { class DoubleConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { double val; if (item.obj() == Py_None) { typed_builder_->AppendNull(); @@ -435,7 +437,7 @@ class DoubleConverter : public TypedConverterVisitor { class BytesConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { PyObject* bytes_obj; const char* bytes; Py_ssize_t length; @@ -463,7 +465,7 @@ class BytesConverter : public TypedConverterVisitor { class FixedWidthBytesConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = std::dynamic_pointer_cast( @@ -490,7 +492,7 @@ class FixedWidthBytesConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { PyObject* bytes_obj; OwnedRef tmp; const char* bytes; @@ -518,7 +520,7 @@ class ListConverter : public TypedConverterVisitor { public: Status Init(const std::shared_ptr& builder) override; - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { @@ -545,7 +547,7 @@ class ListConverter : public TypedConverterVisitor { class DecimalConverter : public TypedConverterVisitor { public: - inline Status appendItem(OwnedRef &item) final { + inline Status AppendItem(const OwnedRef& item) final { /// Can the compiler figure out that the case statement below isn't necessary /// once we're running? const int bit_width = From 389976cb301e483bc6927717bc45baa0c528e301 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 23:31:15 -0700 Subject: [PATCH 18/29] Use CRTP in the iterator --- cpp/src/arrow/python/builtin_convert.cc | 62 +++++++++++++++---------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 626542cf621..21aceb981f2 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -318,7 +318,7 @@ class TypedConverter : public SeqConverter { BuilderType* typed_builder_; }; -template +template class TypedConverterVisitor : public TypedConverter { public: Status AppendData(PyObject* obj, int64_t size) override { @@ -328,28 +328,31 @@ class TypedConverterVisitor : public TypedConverter { if (PySequence_Check(obj)) { for (int64_t i = 0; i < size; ++i) { OwnedRef ref(PySequence_GetItem(obj, i)); - RETURN_NOT_OK(appendItem(ref)); + RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); } } else if (PyObject_HasAttrString(obj, "__iter__")) { PyObject* iter = PyObject_GetIter(obj); + OwnedRef iter_ref(iter); PyObject* item; while ((item = PyIter_Next(iter))) { OwnedRef ref(item); - RETURN_NOT_OK(appendItem(ref)); + RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); } - Py_DECREF(iter); } else { return Status::TypeError("Object is not a sequence or iterable"); } return Status::OK(); } - virtual Status AppendItem(const OwnedRef& item) = 0; + Status AppendItem(const OwnedRef& item) { + return Status::NotImplemented; + }; }; -class BoolConverter : public TypedConverterVisitor { +class BoolConverter : public TypedConverterVisitor< + BooleanBuilder, BoolConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -363,9 +366,10 @@ class BoolConverter : public TypedConverterVisitor { } }; -class Int64Converter : public TypedConverterVisitor { +class Int64Converter : public TypedConverterVisitor< + Int64Builder, Int64Converter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { int64_t val; if (item.obj() == Py_None) { typed_builder_->AppendNull(); @@ -378,9 +382,10 @@ class Int64Converter : public TypedConverterVisitor { } }; -class DateConverter : public TypedConverterVisitor { +class DateConverter : public TypedConverterVisitor< + Date64Builder, DateConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -391,9 +396,10 @@ class DateConverter : public TypedConverterVisitor { } }; -class TimestampConverter : public TypedConverterVisitor { +class TimestampConverter : public TypedConverterVisitor< + TimestampBuilder, TimestampConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { typed_builder_->AppendNull(); } else { @@ -420,9 +426,10 @@ class TimestampConverter : public TypedConverterVisitor { } }; -class DoubleConverter : public TypedConverterVisitor { +class DoubleConverter : public TypedConverterVisitor< + DoubleBuilder, DoubleConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { double val; if (item.obj() == Py_None) { typed_builder_->AppendNull(); @@ -435,9 +442,10 @@ class DoubleConverter : public TypedConverterVisitor { } }; -class BytesConverter : public TypedConverterVisitor { +class BytesConverter : public TypedConverterVisitor< + BinaryBuilder, BytesConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; const char* bytes; Py_ssize_t length; @@ -463,9 +471,10 @@ class BytesConverter : public TypedConverterVisitor { } }; -class FixedWidthBytesConverter : public TypedConverterVisitor { +class FixedWidthBytesConverter : public TypedConverterVisitor< + FixedSizeBinaryBuilder, FixedWidthBytesConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = std::dynamic_pointer_cast( @@ -490,9 +499,10 @@ class FixedWidthBytesConverter : public TypedConverterVisitor { +class UTF8Converter : public TypedConverterVisitor< + StringBuilder, UTF8Converter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; const char* bytes; @@ -516,11 +526,12 @@ class UTF8Converter : public TypedConverterVisitor { } }; -class ListConverter : public TypedConverterVisitor { +class ListConverter : public TypedConverterVisitor< + ListBuilder, ListConverter> { public: Status Init(const std::shared_ptr& builder) override; - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { RETURN_NOT_OK(typed_builder_->AppendNull()); } else { @@ -545,9 +556,10 @@ class ListConverter : public TypedConverterVisitor { break; \ } -class DecimalConverter : public TypedConverterVisitor { +class DecimalConverter : public TypedConverterVisitor< + arrow::DecimalBuilder, DecimalConverter> { public: - inline Status AppendItem(const OwnedRef& item) final { + inline Status AppendItem(const OwnedRef& item) { /// Can the compiler figure out that the case statement below isn't necessary /// once we're running? const int bit_width = From 8c42fdc23585e06c07d17ab6a3c305e4613ae54c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 26 Apr 2017 23:52:57 -0700 Subject: [PATCH 19/29] Feedback from wes (fix some previously unchecked appends, fix long line ) --- cpp/src/arrow/python/builtin_convert.cc | 50 ++++++++++--------------- python/pyarrow/_array.pyx | 3 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 21aceb981f2..2ffff70bd03 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -250,13 +250,13 @@ Status InferArrowSize(PyObject* obj, int64_t* size) { *size = static_cast(PySequence_Size(obj)); } else if (PyObject_HasAttrString(obj, "__iter__")) { PyObject* iter = PyObject_GetIter(obj); + OwnedRef iter_ref(iter); *size = 0; PyObject* item; while ((item = PyIter_Next(iter))) { + OwnedRef item_ref(item); *size += 1; - Py_DECREF(item); } - Py_DECREF(iter); } else { return Status::TypeError("Object is not a sequence or iterable"); } @@ -354,15 +354,14 @@ class BoolConverter : public TypedConverterVisitor< public: inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { - typed_builder_->AppendNull(); + return typed_builder_->AppendNull(); } else { if (item.obj() == Py_True) { - typed_builder_->Append(true); + return typed_builder_->Append(true); } else { - typed_builder_->Append(false); + return typed_builder_->Append(false); } } - return Status::OK(); } }; @@ -372,13 +371,12 @@ class Int64Converter : public TypedConverterVisitor< inline Status AppendItem(const OwnedRef& item) { int64_t val; if (item.obj() == Py_None) { - typed_builder_->AppendNull(); + return typed_builder_->AppendNull(); } else { val = static_cast(PyLong_AsLongLong(item.obj())); RETURN_IF_PYERROR(); - typed_builder_->Append(val); + return typed_builder_->Append(val); } - return Status::OK(); } }; @@ -387,12 +385,11 @@ class DateConverter : public TypedConverterVisitor< public: inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { - typed_builder_->AppendNull(); + return typed_builder_->AppendNull(); } else { PyDateTime_Date* pydate = reinterpret_cast(item.obj()); - typed_builder_->Append(PyDate_to_ms(pydate)); + return typed_builder_->Append(PyDate_to_ms(pydate)); } - return Status::OK(); } }; @@ -401,7 +398,7 @@ class TimestampConverter : public TypedConverterVisitor< public: inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { - typed_builder_->AppendNull(); + return typed_builder_->AppendNull(); } else { PyDateTime_DateTime* pydatetime = reinterpret_cast(item.obj()); @@ -420,9 +417,8 @@ class TimestampConverter : public TypedConverterVisitor< // Microseconds since the epoch int64_t val = static_cast( lrint(difftime(mktime(&datetime), mktime(&epoch))) * 1000000 + us); - typed_builder_->Append(val); + return typed_builder_->Append(val); } - return Status::OK(); } }; @@ -432,13 +428,12 @@ class DoubleConverter : public TypedConverterVisitor< inline Status AppendItem(const OwnedRef& item) { double val; if (item.obj() == Py_None) { - typed_builder_->AppendNull(); + return typed_builder_->AppendNull(); } else { val = PyFloat_AsDouble(item.obj()); RETURN_IF_PYERROR(); - typed_builder_->Append(val); + return typed_builder_->Append(val); } - return Status::OK(); } }; @@ -466,8 +461,7 @@ class BytesConverter : public TypedConverterVisitor< // No error checking length = PyBytes_GET_SIZE(bytes_obj); bytes = PyBytes_AS_STRING(bytes_obj); - RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast(length))); - return Status::OK(); + return typed_builder_->Append(bytes, static_cast(length)); } }; @@ -493,9 +487,8 @@ class FixedWidthBytesConverter : public TypedConverterVisitor< } // No error checking RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length)); - RETURN_NOT_OK(typed_builder_->Append( - reinterpret_cast(PyBytes_AS_STRING(bytes_obj)))); - return Status::OK(); + return typed_builder_->Append( + reinterpret_cast(PyBytes_AS_STRING(bytes_obj))); } }; @@ -509,8 +502,7 @@ class UTF8Converter : public TypedConverterVisitor< Py_ssize_t length; if (item.obj() == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); - return Status::OK(); + return typed_builder_->AppendNull() ; } else if (!PyUnicode_Check(item.obj())) { return Status::Invalid("Non-unicode value encountered"); } @@ -521,8 +513,7 @@ class UTF8Converter : public TypedConverterVisitor< // No error checking length = PyBytes_GET_SIZE(bytes_obj); bytes = PyBytes_AS_STRING(bytes_obj); - RETURN_NOT_OK(typed_builder_->Append(bytes, static_cast(length))); - return Status::OK(); + return typed_builder_->Append(bytes, static_cast(length)); } }; @@ -533,15 +524,14 @@ class ListConverter : public TypedConverterVisitor< inline Status AppendItem(const OwnedRef& item) { if (item.obj() == Py_None) { - RETURN_NOT_OK(typed_builder_->AppendNull()); + return typed_builder_->AppendNull(); } else { typed_builder_->Append(); PyObject* item_obj = item.obj(); int64_t list_size = static_cast(PySequence_Size(item_obj)); - RETURN_NOT_OK(value_converter_->AppendData(item_obj, list_size)); + return value_converter_->AppendData(item_obj, list_size); } - return Status::OK(); } protected: diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index 601abe0fe85..e18e484949f 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -908,7 +908,8 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type, -def array(object sequence, DataType type=None, MemoryPool memory_pool=None, size=None): +def array(object sequence, DataType type=None, MemoryPool memory_pool=None, + size=None): """ Create pyarrow.Array instance from a Python sequence From 9eb3f106321addb258ed8e42342ce652b9bd0766 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 28 Apr 2017 15:59:17 -0700 Subject: [PATCH 20/29] Return the append inside of the decimal convert case/switch business --- cpp/src/arrow/python/builtin_convert.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index c37cb34d275..ea315cc59e9 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -527,7 +527,7 @@ class ListConverter : public TypedConverterVisitor< case bit_width: { \ arrow::decimal::Decimal##bit_width out; \ RETURN_NOT_OK(PythonDecimalToArrowDecimal((item), &out)); \ - RETURN_NOT_OK((builder)->Append(out)); \ + return ((builder)->Append(out)); \ break; \ } @@ -552,10 +552,8 @@ class DecimalConverter : public TypedConverterVisitor< } RETURN_IF_PYERROR(); } else { - RETURN_NOT_OK(typed_builder_->AppendNull()); + return typed_builder_->AppendNull(); } - - return Status::OK(); } }; From fa0abcc20d738cc90327020e0167e71d01b702ab Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 19 Jun 2017 18:18:24 -0700 Subject: [PATCH 21/29] Add ConvertPySequence to other side --- python/pyarrow/includes/libarrow.pxd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9df31c80ccf..628385c6457 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -644,6 +644,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus ConvertPySequence(object obj, CMemoryPool* pool, shared_ptr[CArray]* out, const shared_ptr[CDataType]& type) + CStatus ConvertPySequence(object obj, CMemoryPool* pool, + shared_ptr[CArray]* out, + const shared_ptr[CDataType]& type, + int64_t size) CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) From 42f06996fc16a7e0b6a9c19baa13991e239ec21b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 19 Jun 2017 18:37:56 -0700 Subject: [PATCH 22/29] Style fix --- cpp/src/arrow/python/builtin_convert.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index ea315cc59e9..62358fcba05 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -346,7 +346,7 @@ class TypedConverterVisitor : public TypedConverter { Status AppendItem(const OwnedRef& item) { return Status::NotImplemented; - }; + } }; class BoolConverter : public TypedConverterVisitor< @@ -392,7 +392,7 @@ class DateConverter : public TypedConverterVisitor< } } }; - + class TimestampConverter : public TypedConverterVisitor< Date64Builder, TimestampConverter> { public: @@ -487,7 +487,7 @@ class UTF8Converter : public TypedConverterVisitor< Py_ssize_t length; if (item.obj() == Py_None) { - return typed_builder_->AppendNull() ; + return typed_builder_->AppendNull(); } else if (!PyUnicode_Check(item.obj())) { return Status::Invalid("Non-unicode value encountered"); } From ad935e9d4c5f2161b524575afcfbbaf295d1fad4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 20 Jun 2017 03:13:15 -0700 Subject: [PATCH 23/29] Have size override the size of the iterator if the iterator is larger. --- cpp/src/arrow/python/builtin_convert.cc | 5 ++++- python/pyarrow/array.pxi | 3 ++- python/pyarrow/tests/test_convert_builtin.py | 5 +++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 62358fcba05..c6ca182bd43 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -334,9 +334,12 @@ class TypedConverterVisitor : public TypedConverter { PyObject* iter = PyObject_GetIter(obj); OwnedRef iter_ref(iter); PyObject* item; - while ((item = PyIter_Next(iter))) { + int64_t i = 0; + // To allow people with long generators to only convert a subset, stop consuming at size. + while ((item = PyIter_Next(iter)) && i < size) { OwnedRef ref(item); RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); + ++i; } } else { return Status::TypeError("Object is not a sequence or iterable"); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 53afbf89ad7..82c7137b8c3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1037,7 +1037,8 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool - size : Size of the elements. + size : int64, optional + Size of the elements. If the imput is larger than size bail at this length. Returns ------- diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 5d350e3c54c..803a3e5fd45 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -51,6 +51,11 @@ def test_iterator_types(self): arr2 = pa.array((0, 1, 2)) assert arr1.equals(arr2) + def test_iterator_size_overflow(self): + arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2) + arr2 = pa.array((0, 1)) + assert arr1.equals(arr2) + class TestConvertSequence(unittest.TestCase): def test_sequence_types(self): From 1fd9588a44e5a045f9a81787f97a4b19224752dc Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 20 Jun 2017 03:28:32 -0700 Subject: [PATCH 24/29] Do dynamic resize on the array buffer if size ended up being larger (e.g. support underflow from iterator constructors). --- cpp/src/arrow/python/builtin_convert.cc | 3 +++ python/pyarrow/tests/test_convert_builtin.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index c6ca182bd43..4c28de69eb1 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -341,6 +341,9 @@ class TypedConverterVisitor : public TypedConverter { RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); ++i; } + if (size != i) { + RETURN_NOT_OK(this->typed_builder_->Resize(i)); + } } else { return Status::TypeError("Object is not a sequence or iterable"); } diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 803a3e5fd45..bf14c4f2328 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -56,6 +56,11 @@ def test_iterator_size_overflow(self): arr2 = pa.array((0, 1)) assert arr1.equals(arr2) + def test_iterator_size_underflow(self): + arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10) + arr2 = pa.array((0, 1, 2)) + assert arr1.equals(arr2) + class TestConvertSequence(unittest.TestCase): def test_sequence_types(self): From dddf57db899bf12386185f757f11d323723e3d60 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 20 Jun 2017 03:31:45 -0700 Subject: [PATCH 25/29] Make a note about the resize/realloc in underflow with size --- python/pyarrow/array.pxi | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 82c7137b8c3..08ab9cfd1bd 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1038,7 +1038,11 @@ def array(object sequence, DataType type=None, MemoryPool memory_pool=None, If not passed, will allocate memory from the currently-set default memory pool size : int64, optional - Size of the elements. If the imput is larger than size bail at this length. + Size of the elements. If the imput is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). Returns ------- From ee2afaa448a73d06838db947038d6eb14d307609 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 20 Jun 2017 08:57:48 -0700 Subject: [PATCH 26/29] Comment the built in converter type inferance code a bit. --- cpp/src/arrow/python/builtin_convert.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 4c28de69eb1..064ee6b1b6d 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -145,6 +145,7 @@ class ScalarVisitor { static constexpr int MAX_NESTING_LEVELS = 32; +// SeqVisitor is used to infer the type. class SeqVisitor { public: SeqVisitor() : max_nesting_level_(0) { @@ -177,13 +178,17 @@ class SeqVisitor { } std::shared_ptr GetType() { + // If all the non-list inputs were null (or there were no inputs) if (scalars_.total_count() == 0) { if (max_nesting_level_ == 0) { + // If its just a single empty list or list of nulls, return null. return null(); } else { + // Error, if we have nesting but no concrete base type. return nullptr; } } else { + // Lists of Lists of [X] std::shared_ptr result = scalars_.GetType(); for (int i = 0; i < max_nesting_level_; ++i) { result = std::make_shared(result); @@ -196,6 +201,7 @@ class SeqVisitor { if (scalars_.total_count() > 0) { if (num_nesting_levels() > 1) { return Status::Invalid("Mixed nesting levels not supported"); + // If the nesting goes deeper than the deepest scalar } else if (max_observed_level() < max_nesting_level_) { return Status::Invalid("Mixed nesting levels not supported"); } @@ -203,6 +209,7 @@ class SeqVisitor { return Status::OK(); } + // Returns the deepest level which has scalar elements. int max_observed_level() const { int result = 0; for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { @@ -211,6 +218,7 @@ class SeqVisitor { return result; } + // Returns the number of nesting levels which have scalar elements. int num_nesting_levels() const { int result = 0; for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { @@ -223,10 +231,13 @@ class SeqVisitor { ScalarVisitor scalars_; // Track observed + // Deapest nesting level (irregardless of scalars) int max_nesting_level_; + // Number of scalar elements at each nesting level. + // (TOOD: We really only need to know if a scalar is present, not the count). int nesting_histogram_[MAX_NESTING_LEVELS]; - // Visits a specific element (inner part of the loop) + // Visits a specific element (inner part of the loop). Status VisitElem(const OwnedRef &item_ref, int level) { if (PyList_Check(item_ref.obj())) { RETURN_NOT_OK(Visit(item_ref.obj(), level + 1)); From 2ed00d9197012563125c696821efd698577adbb1 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 27 Jun 2017 16:26:30 -0700 Subject: [PATCH 27/29] Fix long line --- cpp/src/arrow/python/builtin_convert.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 064ee6b1b6d..fd84542d206 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -346,7 +346,8 @@ class TypedConverterVisitor : public TypedConverter { OwnedRef iter_ref(iter); PyObject* item; int64_t i = 0; - // To allow people with long generators to only convert a subset, stop consuming at size. + // To allow people with long generators to only convert a subset, stop + // consuming at size. while ((item = PyIter_Next(iter)) && i < size) { OwnedRef ref(item); RETURN_NOT_OK(static_cast(this)->AppendItem(ref)); From 0b72e95655b3fa653cdcf8a445efc9cc522c35f2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 27 Jun 2017 16:31:52 -0700 Subject: [PATCH 28/29] Remove unecessary file after merge --- python/pyarrow/includes/pyarrow.pxd | 79 ----------------------------- 1 file changed, 79 deletions(-) delete mode 100644 python/pyarrow/includes/pyarrow.pxd diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd deleted file mode 100644 index 7b38bfcd17a..00000000000 --- a/python/pyarrow/includes/pyarrow.pxd +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# distutils: language = c++ - -from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CDataType, - CTable, CTensor, CStatus, Type, - CMemoryPool, TimeUnit, - RandomAccessFile, OutputStream, - CBufferReader) - - -cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: - shared_ptr[CDataType] GetPrimitiveType(Type type) - shared_ptr[CDataType] GetTimestampType(TimeUnit unit) - CStatus ConvertPySequence(object obj, CMemoryPool* pool, - shared_ptr[CArray]* out) - CStatus ConvertPySequence(object obj, CMemoryPool* pool, - shared_ptr[CArray]* out, - const shared_ptr[CDataType]& type) - CStatus ConvertPySequence(object obj, CMemoryPool* pool, - shared_ptr[CArray]* out, - const shared_ptr[CDataType]& type, - int64_t size) - - CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) - - CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) - - CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo, - const shared_ptr[CDataType]& type, - shared_ptr[CArray]* out) - - CStatus NdarrayToTensor(CMemoryPool* pool, object ao, - shared_ptr[CTensor]* out); - - CStatus TensorToNdarray(const CTensor& tensor, object base, - PyObject** out) - - CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, - object py_ref, PyObject** out) - - CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, - object py_ref, PyObject** out) - - CStatus ConvertTableToPandas(const shared_ptr[CTable]& table, - int nthreads, PyObject** out) - - void set_default_memory_pool(CMemoryPool* pool) - CMemoryPool* get_memory_pool() - - cdef cppclass PyBuffer(CBuffer): - PyBuffer(object o) - - cdef cppclass PyReadableFile(RandomAccessFile): - PyReadableFile(object fo) - - cdef cppclass PyOutputStream(OutputStream): - PyOutputStream(object fo) - - cdef cppclass PyBytesReader(CBufferReader): - PyBytesReader(object fo) From 750e7f4ccaff84932c652b12dab17ebb30bc6dab Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 27 Jun 2017 20:54:54 -0700 Subject: [PATCH 29/29] Switch AppendItem to pure virtual for TypedConverterVisitor --- cpp/src/arrow/python/builtin_convert.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index fd84542d206..d3bfb37fdfb 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -362,9 +362,7 @@ class TypedConverterVisitor : public TypedConverter { return Status::OK(); } - Status AppendItem(const OwnedRef& item) { - return Status::NotImplemented; - } + virtual Status AppendItem(const OwnedRef& item) = 0; }; class BoolConverter : public TypedConverterVisitor<