From 4e4c7529747713f0d76536ea882f6f82c07b6f58 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 13 May 2017 22:00:07 -0400 Subject: [PATCH 1/3] Add conversions for numpy object arrays with integers and floats Change-Id: I69c2960b510fd5e1eeef66ac614b40019b545825 --- cpp/src/arrow/python/pandas_convert.cc | 88 ++++++++++++++++++++- python/pyarrow/tests/test_convert_pandas.py | 18 +++++ 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index b6fb05e1e5e..a4d06f89e4c 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -80,6 +80,19 @@ static inline bool PyObject_is_string(const PyObject* obj) { #endif } +static inline bool PyObject_is_float(const PyObject* obj) { + return PyFloat_Check(obj); +} + +static inline bool PyObject_is_integer(const PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + return (!PyBool_Check(obj)) && (PyLong_Check(obj) || PyArray_IsIntegerScalar(obj)); +#else + return (!PyBool_Check(obj)) && + (PyInt_Check(obj) || PyLong_Check(obj) || PyArray_IsIntegerScalar(obj)); +#endif +} + template static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef npy_traits traits; @@ -394,9 +407,11 @@ class PandasConverter { template Status ConvertDates(); + Status ConvertBooleans(); Status ConvertObjectStrings(); + Status ConvertObjectFloats(); Status ConvertObjectFixedWidthBytes(const std::shared_ptr& type); - Status ConvertBooleans(); + Status ConvertObjectIntegers(); Status ConvertLists(const std::shared_ptr& type); Status ConvertObjects(); Status ConvertDecimals(); @@ -610,6 +625,71 @@ Status PandasConverter::ConvertObjectStrings() { return Status::OK(); } +Status PandasConverter::ConvertObjectFloats() { + PyAcquireGIL lock; + + DoubleBuilder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + Ndarray1DIndexer objects(arr_); + Ndarray1DIndexer mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + PyObject* obj; + for (int64_t i = 0; i < objects.size(); ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else if (PyFloat_Check(obj)) { + RETURN_NOT_OK(builder.Append(PyFloat_AS_DOUBLE(obj))); + } else { + return InvalidConversion(obj, "float"); + } + } + + return builder.Finish(&out_); +} + +Status PandasConverter::ConvertObjectIntegers() { + PyAcquireGIL lock; + + Int64Builder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + + Ndarray1DIndexer objects(arr_); + Ndarray1DIndexer mask_values; + + bool have_mask = false; + if (mask_ != nullptr) { + mask_values.Init(mask_); + have_mask = true; + } + + PyObject* obj; + for (int64_t i = 0; i < objects.size(); ++i) { + obj = objects[i]; + if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { + RETURN_NOT_OK(builder.AppendNull()); + } else if (PyObject_is_integer(obj)) { + const int64_t val = static_cast(PyLong_AsLong(obj)); + if (PyErr_Occurred()) { + PyErr_Clear(); + return InvalidConversion(obj, "integer"); + } + RETURN_NOT_OK(builder.Append(val)); + } else { + return InvalidConversion(obj, "integer"); + } + } + + return builder.Finish(&out_); +} + Status PandasConverter::ConvertObjectFixedWidthBytes( const std::shared_ptr& type) { PyAcquireGIL lock; @@ -804,8 +884,12 @@ Status PandasConverter::ConvertObjects() { continue; } else if (PyObject_is_string(objects[i])) { return ConvertObjectStrings(); + } else if (PyObject_is_float(objects[i])) { + return ConvertObjectFloats(); } else if (PyBool_Check(objects[i])) { return ConvertBooleans(); + } else if (PyObject_is_integer(objects[i])) { + return ConvertObjectIntegers(); } else if (PyDate_CheckExact(objects[i])) { // We could choose Date32 or Date64 return ConvertDates(); @@ -813,7 +897,7 @@ Status PandasConverter::ConvertObjects() { return ConvertDecimals(); } else { return InvalidConversion( - const_cast(objects[i]), "string, bool, or date"); + const_cast(objects[i]), "string, bool, float, int, date, decimal"); } } } diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 9b9b7519fd9..be35905fc75 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -140,6 +140,24 @@ def test_float_nulls(self): result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) + def test_float_object_nulls(self): + arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) + df = pd.DataFrame({'floats': arr}) + expected = pd.DataFrame({'floats': pd.to_numeric(arr)}) + field = pa.field('floats', pa.float64()) + schema = pa.schema([field]) + self._check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) + + def test_int_object_nulls(self): + arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) + df = pd.DataFrame({'ints': arr}) + expected = pd.DataFrame({'ints': pd.to_numeric(arr)}) + field = pa.field('ints', pa.int64()) + schema = pa.schema([field]) + self._check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) + def test_integer_no_nulls(self): data = OrderedDict() fields = [] From 45f1ecb9af36f77625b101993dfeee944802e28d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 14 May 2017 09:00:34 -0400 Subject: [PATCH 2/3] Fixes for manylinux1 Change-Id: Id1f5aa57b6fd090dc295a956e2a91b74e333fd96 --- cpp/src/arrow/python/pandas_convert.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index a4d06f89e4c..d164c6a1ff4 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -85,12 +85,7 @@ static inline bool PyObject_is_float(const PyObject* obj) { } static inline bool PyObject_is_integer(const PyObject* obj) { -#if PY_MAJOR_VERSION >= 3 - return (!PyBool_Check(obj)) && (PyLong_Check(obj) || PyArray_IsIntegerScalar(obj)); -#else - return (!PyBool_Check(obj)) && - (PyInt_Check(obj) || PyLong_Check(obj) || PyArray_IsIntegerScalar(obj)); -#endif + return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); } template From 9e0b2eae4d9a8a8c796025bc6a54f0a65b4e787b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 14 May 2017 15:46:12 -0400 Subject: [PATCH 3/3] Code review comments Change-Id: Icaf053db0b8141af18fe19a8e11e9541cc591af9 --- cpp/src/arrow/python/pandas_convert.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index d164c6a1ff4..96dd09aa817 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -641,7 +641,9 @@ Status PandasConverter::ConvertObjectFloats() { if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) { RETURN_NOT_OK(builder.AppendNull()); } else if (PyFloat_Check(obj)) { - RETURN_NOT_OK(builder.Append(PyFloat_AS_DOUBLE(obj))); + double val = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(builder.Append(val)); } else { return InvalidConversion(obj, "float"); } @@ -672,10 +674,7 @@ Status PandasConverter::ConvertObjectIntegers() { RETURN_NOT_OK(builder.AppendNull()); } else if (PyObject_is_integer(obj)) { const int64_t val = static_cast(PyLong_AsLong(obj)); - if (PyErr_Occurred()) { - PyErr_Clear(); - return InvalidConversion(obj, "integer"); - } + RETURN_IF_PYERROR(); RETURN_NOT_OK(builder.Append(val)); } else { return InvalidConversion(obj, "integer");