Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 80 additions & 2 deletions cpp/src/arrow/python/pandas_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ static inline bool PyObject_is_string(const PyObject* obj) {
#endif
}

// Returns true when `obj` passes CPython's PyFloat_Check, i.e. it is a
// Python float or an instance of a subtype of float.
static inline bool PyObject_is_float(const PyObject* obj) {
  const bool is_float = PyFloat_Check(obj);
  return is_float;
}

// Returns true when `obj` is an integer scalar according to
// PyArray_IsIntegerScalar, excluding Python bools: objects that pass
// PyBool_Check are rejected up front so that they are never classified as
// integers by this predicate.
static inline bool PyObject_is_integer(const PyObject* obj) {
  if (PyBool_Check(obj)) {
    return false;
  }
  return PyArray_IsIntegerScalar(obj);
}

template <int TYPE>
static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
typedef npy_traits<TYPE> traits;
Expand Down Expand Up @@ -394,9 +402,11 @@ class PandasConverter {
template <typename ArrowType>
Status ConvertDates();

Status ConvertBooleans();
Status ConvertObjectStrings();
Status ConvertObjectFloats();
Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
Status ConvertBooleans();
Status ConvertObjectIntegers();
Status ConvertLists(const std::shared_ptr<DataType>& type);
Status ConvertObjects();
Status ConvertDecimals();
Expand Down Expand Up @@ -610,6 +620,70 @@ Status PandasConverter::ConvertObjectStrings() {
return Status::OK();
}

// Builds a double array from the object-dtype ndarray `arr_`.
//
// Each element is handled as follows:
//   * masked out via `mask_`, or null per PandasObjectIsNull -> null entry
//   * passes PyFloat_Check                                    -> its double value
//   * anything else                                           -> InvalidConversion
//
// A PyAcquireGIL guard is held for the whole function because every element
// is inspected through the Python C API. The finished array is stored in
// `out_`.
Status PandasConverter::ConvertObjectFloats() {
  PyAcquireGIL lock;

  DoubleBuilder builder(pool_);
  RETURN_NOT_OK(builder.Resize(length_));

  Ndarray1DIndexer<PyObject*> objects(arr_);
  Ndarray1DIndexer<uint8_t> mask_values;

  // The mask is optional; only initialise its indexer when one was supplied.
  const bool use_mask = (mask_ != nullptr);
  if (use_mask) {
    mask_values.Init(mask_);
  }

  for (int64_t idx = 0; idx < objects.size(); ++idx) {
    PyObject* const item = objects[idx];

    if ((use_mask && mask_values[idx]) || PandasObjectIsNull(item)) {
      RETURN_NOT_OK(builder.AppendNull());
      continue;
    }

    if (!PyFloat_Check(item)) {
      return InvalidConversion(item, "float");
    }

    const double value = PyFloat_AsDouble(item);
    RETURN_IF_PYERROR();
    RETURN_NOT_OK(builder.Append(value));
  }

  return builder.Finish(&out_);
}

// Builds an int64 array from the object-dtype ndarray `arr_`.
//
// Each element is handled as follows:
//   * masked out via `mask_`, or null per PandasObjectIsNull -> null entry
//   * accepted by PyObject_is_integer (non-bool integer)     -> its int64 value
//   * anything else                                          -> InvalidConversion
//
// A PyAcquireGIL guard is held for the whole function because every element
// is inspected through the Python C API. The finished array is stored in
// `out_`.
Status PandasConverter::ConvertObjectIntegers() {
  PyAcquireGIL lock;

  Int64Builder builder(pool_);
  RETURN_NOT_OK(builder.Resize(length_));

  Ndarray1DIndexer<PyObject*> objects(arr_);
  Ndarray1DIndexer<uint8_t> mask_values;

  // The mask is optional; only initialise its indexer when one was supplied.
  bool have_mask = false;
  if (mask_ != nullptr) {
    mask_values.Init(mask_);
    have_mask = true;
  }

  PyObject* obj;
  for (int64_t i = 0; i < objects.size(); ++i) {
    obj = objects[i];
    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
      RETURN_NOT_OK(builder.AppendNull());
    } else if (PyObject_is_integer(obj)) {
      // Use PyLong_AsLongLong rather than PyLong_AsLong: C `long` is only
      // 32 bits wide on LLP64 platforms (e.g. 64-bit Windows), so the
      // narrower call would raise OverflowError for values that fit in
      // int64 but not in 32 bits. `long long` is at least 64 bits
      // everywhere. Values outside the `long long` range still set a Python
      // error, which is checked immediately below.
      const int64_t val = static_cast<int64_t>(PyLong_AsLongLong(obj));
      RETURN_IF_PYERROR();
      RETURN_NOT_OK(builder.Append(val));
    } else {
      return InvalidConversion(obj, "integer");
    }
  }

  return builder.Finish(&out_);
}

Status PandasConverter::ConvertObjectFixedWidthBytes(
const std::shared_ptr<DataType>& type) {
PyAcquireGIL lock;
Expand Down Expand Up @@ -804,16 +878,20 @@ Status PandasConverter::ConvertObjects() {
continue;
} else if (PyObject_is_string(objects[i])) {
return ConvertObjectStrings();
} else if (PyObject_is_float(objects[i])) {
return ConvertObjectFloats();
} else if (PyBool_Check(objects[i])) {
return ConvertBooleans();
} else if (PyObject_is_integer(objects[i])) {
return ConvertObjectIntegers();
} else if (PyDate_CheckExact(objects[i])) {
// We could choose Date32 or Date64
return ConvertDates<Date32Type>();
} else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) {
return ConvertDecimals();
} else {
return InvalidConversion(
const_cast<PyObject*>(objects[i]), "string, bool, or date");
const_cast<PyObject*>(objects[i]), "string, bool, float, int, date, decimal");
}
}
}
Expand Down
18 changes: 18 additions & 0 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,24 @@ def test_float_nulls(self):
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)

def test_float_object_nulls(self):
    # Object-dtype column mixing None, a Python float and a NumPy float64.
    values = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
    frame = pd.DataFrame({'floats': values})

    # The round trip is compared against the pd.to_numeric view of the same
    # values, and the Arrow schema must report the column as float64.
    expected_frame = pd.DataFrame({'floats': pd.to_numeric(values)})
    expected_schema = pa.schema([pa.field('floats', pa.float64())])

    self._check_pandas_roundtrip(frame, expected=expected_frame,
                                 expected_schema=expected_schema)

def test_int_object_nulls(self):
    # Object-dtype column mixing None, a Python int and a NumPy int64.
    values = np.array([None, 1, np.int64(3)] * 5, dtype=object)
    frame = pd.DataFrame({'ints': values})

    # The round trip is compared against the pd.to_numeric view of the same
    # values, and the Arrow schema must report the column as int64.
    expected_frame = pd.DataFrame({'ints': pd.to_numeric(values)})
    expected_schema = pa.schema([pa.field('ints', pa.int64())])

    self._check_pandas_roundtrip(frame, expected=expected_frame,
                                 expected_schema=expected_schema)

def test_integer_no_nulls(self):
data = OrderedDict()
fields = []
Expand Down