From 1d11513fce0812afde5f21c6dcd90499f8eb4090 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 21 Jun 2017 23:07:34 +0200 Subject: [PATCH 1/2] ARROW-1137: Python: Ensure Pandas roundtrip of all-None column Change-Id: Ib815d3fa42f0a0ae6c0d9850e9b0b435bad1c331 --- cpp/src/arrow/python/pandas_convert.cc | 65 ++++++++++++++++++--- python/pyarrow/tests/test_convert_pandas.py | 5 ++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index ac61cbc13c6..d83be488b44 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -1305,6 +1305,24 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) return Status::OK(); } +inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + for (int c = 0; c < data.num_chunks(); c++) { + std::shared_ptr arr = data.chunk(c); + + const uint8_t* data_ptr; + int32_t length; + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + // All values are null + Py_INCREF(Py_None); + *out_values = Py_None; + ++out_values; + } + } + return Status::OK(); +} + inline Status ConvertFixedSizeBinary(const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -1457,6 +1475,8 @@ class ObjectBlock : public PandasBlock { RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); } else if (type == Type::DECIMAL) { RETURN_NOT_OK(ConvertDecimals(data, out_buffer)); + } else if (type == Type::NA) { + RETURN_NOT_OK(ConvertNulls(data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast(col->type()); switch (list_type->value_type()->id()) { @@ -1506,7 +1526,12 @@ class IntBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); - if (type != ARROW_TYPE) { return Status::NotImplemented(col->type()->ToString()); } + if (type != ARROW_TYPE) { + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas int" << sizeof(C_TYPE) << " block."; + return Status::NotImplemented(ss.str()); + } ConvertIntegerNoNullsSameType(data, out_buffer); placement_data_[rel_placement] = abs_placement; @@ -1532,7 +1557,12 @@ class Float32Block : public PandasBlock { int64_t rel_placement) override { Type::type type = col->type()->id(); - if (type != Type::FLOAT) { return Status::NotImplemented(col->type()->ToString()); } + if (type != Type::FLOAT) { + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas float32 block."; + return Status::NotImplemented(ss.str()); + } float* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; @@ -1584,7 +1614,10 @@ class Float64Block : public PandasBlock { ConvertNumericNullable(data, NAN, out_buffer); break; default: - return Status::NotImplemented(col->type()->ToString()); + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas float64 block."; + return Status::NotImplemented(ss.str()); } #undef INTEGER_CASE @@ -1603,7 +1636,12 @@ class BoolBlock : public PandasBlock { int64_t rel_placement) override { Type::type type = col->type()->id(); - if (type != Type::BOOL) { return Status::NotImplemented(col->type()->ToString()); } + if (type != Type::BOOL) { + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas boolean block."; + return Status::NotImplemented(ss.str()); + } uint8_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; @@ -1660,7 +1698,10 @@ class DatetimeBlock : public PandasBlock { return Status::NotImplemented("Unsupported time unit"); } } else { - return Status::NotImplemented(col->type()->ToString()); + std::stringstream ss; + ss << "Cannot write Arrow data of type " << col->type()->ToString(); + ss << " to a Pandas datetime block."; + return Status::NotImplemented(ss.str()); } placement_data_[rel_placement] = abs_placement; @@ -1917,8 +1958,14 @@ class DataFrameBlockCreator { case Type::DECIMAL: output_type = PandasBlock::DECIMAL; break; + case Type::NA: + output_type = PandasBlock::OBJECT; + break; default: - return Status::NotImplemented(col->type()->ToString()); + std::stringstream ss; + ss << "No known equivalent Pandas block for Arrow data of type "; + ss << col->type()->ToString() << " is known."; + return Status::NotImplemented(ss.str()); } int block_placement = 0; @@ -2301,7 +2348,11 @@ class ArrowDeserializer { return Status::OK(); } - Status Visit(const NullType& type) { return Status::NotImplemented("null type"); } + Status Visit(const NullType& type) { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast(PyArray_DATA(arr_)); + return ConvertNulls(data_, out_values); + } Status Visit(const StructType& type) { return Status::NotImplemented("struct type"); } diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ca304558c5f..d17ef3c0ad1 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -98,6 +98,11 @@ def _check_array_roundtrip(self, values, expected=None, mask=None, tm.assert_series_equal(pd.Series(result), expected, check_names=False) + def test_all_none_objects(self): + df = pd.DataFrame({'a': [None, None, None]}) + self._check_pandas_roundtrip(df) + + def test_float_no_nulls(self): data = {} fields = [] From 59c0df89733f4e8b37b403df8dd2fd45911e3e15 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 22 Jun 2017 13:44:29 +0200 Subject: [PATCH 2/2] Remove unused variables Change-Id: Iaed4d4be2b11335253be2416b57b04dd261e4931 --- cpp/src/arrow/python/pandas_convert.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index d83be488b44..6b0e3429081 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -1310,8 +1310,6 @@ inline Status ConvertNulls(const ChunkedArray& data, PyObject** out_values) { for (int c = 0; c < data.num_chunks(); c++) { std::shared_ptr arr = data.chunk(c); - const uint8_t* data_ptr; - int32_t length; const bool has_nulls = data.null_count() > 0; for (int64_t i = 0; i < arr->length(); ++i) { // All values are null