diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index b6fb05e1e5e..96dd09aa817 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -80,6 +80,17 @@ static inline bool PyObject_is_string(const PyObject* obj) {
 #endif
 }
 
+// True if obj is a Python float; PyFloat_Check also accepts float subtypes.
+static inline bool PyObject_is_float(const PyObject* obj) {
+  return PyFloat_Check(obj);
+}
+
+// True if NumPy's PyArray_IsIntegerScalar accepts obj and obj is not a
+// Python bool (booleans are converted by ConvertBooleans instead).
+static inline bool PyObject_is_integer(const PyObject* obj) {
+  return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj);
+}
+
 template <int TYPE>
 static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef npy_traits<TYPE> traits;
@@ -394,9 +405,11 @@ class PandasConverter {
   template <typename ArrowType>
   Status ConvertDates();
 
+  Status ConvertBooleans();
   Status ConvertObjectStrings();
+  Status ConvertObjectFloats();
   Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
-  Status ConvertBooleans();
+  Status ConvertObjectIntegers();
   Status ConvertLists(const std::shared_ptr<DataType>& type);
   Status ConvertObjects();
   Status ConvertDecimals();
@@ -610,6 +623,76 @@ Status PandasConverter::ConvertObjectStrings() {
   return Status::OK();
 }
 
+// Build a float64 Arrow array from an object array of Python floats.
+// Masked and null entries become nulls; any other type is an error.
+Status PandasConverter::ConvertObjectFloats() {
+  PyAcquireGIL lock;
+
+  DoubleBuilder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask_ != nullptr) {
+    mask_values.Init(mask_);
+    have_mask = true;
+  }
+
+  PyObject* obj;
+  for (int64_t i = 0; i < objects.size(); ++i) {
+    obj = objects[i];
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyObject_is_float(obj)) {
+      double val = PyFloat_AsDouble(obj);
+      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(builder.Append(val));
+    } else {
+      return InvalidConversion(obj, "float");
+    }
+  }
+
+  return builder.Finish(&out_);
+}
+
+// Build an int64 Arrow array from an object array of integer scalars.
+// Masked and null entries become nulls; any other type is an error.
+Status PandasConverter::ConvertObjectIntegers() {
+  PyAcquireGIL lock;
+
+  Int64Builder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask_ != nullptr) {
+    mask_values.Init(mask_);
+    have_mask = true;
+  }
+
+  PyObject* obj;
+  for (int64_t i = 0; i < objects.size(); ++i) {
+    obj = objects[i];
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyObject_is_integer(obj)) {
+      // C long is only 32 bits on LLP64 platforms (64-bit Windows), so read
+      // through long long to cover the full int64 range.
+      const int64_t val = static_cast<int64_t>(PyLong_AsLongLong(obj));
+      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(builder.Append(val));
+    } else {
+      return InvalidConversion(obj, "integer");
+    }
+  }
+
+  return builder.Finish(&out_);
+}
+
 Status PandasConverter::ConvertObjectFixedWidthBytes(
     const std::shared_ptr<DataType>& type) {
   PyAcquireGIL lock;
@@ -804,8 +887,12 @@ Status PandasConverter::ConvertObjects() {
         continue;
       } else if (PyObject_is_string(objects[i])) {
         return ConvertObjectStrings();
+      } else if (PyObject_is_float(objects[i])) {
+        return ConvertObjectFloats();
       } else if (PyBool_Check(objects[i])) {
         return ConvertBooleans();
+      } else if (PyObject_is_integer(objects[i])) {
+        return ConvertObjectIntegers();
       } else if (PyDate_CheckExact(objects[i])) {
         // We could choose Date32 or Date64
         return ConvertDates<Date32Type>();
@@ -813,7 +900,7 @@ Status PandasConverter::ConvertObjects() {
         return ConvertDecimals();
       } else {
         return InvalidConversion(
-            const_cast<PyObject*>(objects[i]), "string, bool, or date");
+            const_cast<PyObject*>(objects[i]), "string, bool, float, int, date, decimal");
       }
     }
   }
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 9b9b7519fd9..be35905fc75 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -140,6 +140,26 @@ def test_float_nulls(self):
         result = table.to_pandas()
         tm.assert_frame_equal(result, ex_frame)
 
+    def test_float_object_nulls(self):
+        arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
+        df = pd.DataFrame({'floats': arr})
+        expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
+        field = pa.field('floats', pa.float64())
+        schema = pa.schema([field])
+        self._check_pandas_roundtrip(df, expected=expected,
+                                     expected_schema=schema)
+
+    def test_int_object_nulls(self):
+        # 2**40 does not fit in a 32-bit C long, so it exercises platforms
+        # where long is narrower than int64.
+        arr = np.array([None, 1, np.int64(3), 2**40] * 5, dtype=object)
+        df = pd.DataFrame({'ints': arr})
+        expected = pd.DataFrame({'ints': pd.to_numeric(arr)})
+        field = pa.field('ints', pa.int64())
+        schema = pa.schema([field])
+        self._check_pandas_roundtrip(df, expected=expected,
+                                     expected_schema=schema)
+
     def test_integer_no_nulls(self):
         data = OrderedDict()
         fields = []