From 851fcc87e1d27c5ccc7b07a35de8791adbe2d158 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 11 Jul 2020 15:03:32 -0500 Subject: [PATCH] Recognize more pandas null sentinels in sequence type inference when converting to Arrow --- cpp/src/arrow/python/inference.cc | 8 +++++++- python/pyarrow/tests/test_pandas.py | 10 +++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index db528f05c6e..c2fc06e554c 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -320,7 +320,7 @@ class TypeInferrer { Status Visit(PyObject* obj, bool* keep_going) { ++total_count_; - if (obj == Py_None || (pandas_null_sentinels_ && internal::PyFloat_IsNaN(obj))) { + if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) { ++none_count_; } else if (PyBool_Check(obj)) { ++bool_count_; @@ -626,6 +626,12 @@ class TypeInferrer { // Non-exhaustive type inference Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels, std::shared_ptr<DataType>* out_type) { + if (pandas_null_sentinels) { + // ARROW-842: If pandas is not installed then null checks will be less + // comprehensive, but that is okay. 
+ internal::InitPandasStaticData(); + } + TypeInferrer inferrer(pandas_null_sentinels); RETURN_NOT_OK(inferrer.VisitSequence(obj, mask)); RETURN_NOT_OK(inferrer.GetType(out_type)); diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index fe48d059b45..979b2a6f72c 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1205,10 +1205,10 @@ def test_date_objects_typed(self): tm.assert_frame_equal(table_pandas_objects, expected_pandas_objects) - def test_object_null_values(self): + def test_pandas_null_values(self): # ARROW-842 - NA = getattr(pd, 'NA', None) - values = np.array([datetime(2000, 1, 1), pd.NaT, NA], dtype=object) + pd_NA = getattr(pd, 'NA', None) + values = np.array([datetime(2000, 1, 1), pd.NaT, pd_NA], dtype=object) values_with_none = np.array([datetime(2000, 1, 1), None, None], dtype=object) result = pa.array(values, from_pandas=True) @@ -1216,6 +1216,10 @@ def test_object_null_values(self): assert result.equals(expected) assert result.null_count == 2 + # ARROW-9407 + assert pa.array([pd.NaT], from_pandas=True).type == pa.null() + assert pa.array([pd_NA], from_pandas=True).type == pa.null() + def test_dates_from_integers(self): t1 = pa.date32() t2 = pa.date64()