From f3d472626bf1c9f8652d328caeb5c1c9df91ddd8 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 3 Oct 2018 16:32:52 -0700 Subject: [PATCH] added test with fix that passes, but fails other tests convert numpy bool to arrow bools before cast, add from bool tests call PrepareInputData in date conversion using GenerateBits for better performance --- cpp/src/arrow/compute/kernels/cast-test.cc | 19 ++++++ cpp/src/arrow/python/numpy_to_arrow.cc | 66 +++++++++------------ cpp/src/arrow/python/type_traits.h | 1 + python/pyarrow/tests/test_convert_pandas.py | 39 +++++++++--- 4 files changed, 81 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 781e0af87a8..c3a0df5d8a7 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) { AssertBufferSame(*arr, *result, 1); } +TEST_F(TestCast, FromBoolean) { + CastOptions options; + + vector is_valid(20, true); + is_valid[3] = false; + + vector v1(is_valid.size(), true); + vector e1(is_valid.size(), 1); + for (size_t i = 0; i < v1.size(); ++i) { + if (i % 3 == 1) { + v1[i] = false; + e1[i] = 0; + } + } + + CheckCase(boolean(), v1, is_valid, int32(), e1, + options); +} + TEST_F(TestCast, ToBoolean) { CastOptions options; for (auto type : kNumericTypes) { diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index aa28b6e8708..aada6bf598c 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -63,6 +63,7 @@ namespace arrow { using internal::checked_cast; using internal::CopyBitmap; +using internal::GenerateBitsUnrolled; namespace py { @@ -246,6 +247,11 @@ class NumPyConverter { return Status::OK(); } + // Called before ConvertData to ensure Numpy input buffer is in expected + // Arrow layout + template + Status PrepareInputData(std::shared_ptr* data); + // ---------------------------------------------------------------------- // Traditional visitor conversion for non-object arrays @@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po } // namespace template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { if (is_strided()) { RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); + } else if (dtype_->type_num == NPY_BOOL) { + int64_t nbytes = BitUtil::BytesForBits(length_); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); + + Ndarray1DIndexer values(arr_); + int64_t i = 0; + const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; + GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); + + *data = buffer; } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } + return Status::OK(); +} + +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + RETURN_NOT_OK(PrepareInputData(data)); + std::shared_ptr input_type; RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); @@ -426,38 +450,12 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { return Status::OK(); } -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - int64_t nbytes = BitUtil::BytesForBits(length_); - std::shared_ptr buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); - - Ndarray1DIndexer values(arr_); - - uint8_t* bitmap = buffer->mutable_data(); - - memset(bitmap, 0, nbytes); - for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { - BitUtil::SetBit(bitmap, i); - } - } - - *data = buffer; - return Status::OK(); -} - template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted @@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - constexpr int64_t kMillisecondsInDay = 86400000; - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index d90517a60a2..bc71ec4e90b 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -149,6 +149,7 @@ template <> struct arrow_traits { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; + typedef typename npy_traits::value_type T; }; #define INT_DECL(TYPE) \ diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 3e89f5eb4ff..cd7f4999ace 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None, else: assert arr.null_count == (mask | values_nulls).sum() - if mask is None: - tm.assert_series_equal(pd.Series(result), pd.Series(values), - check_names=False) - else: - expected = pd.Series(np.ma.masked_array(values, mask=mask)) - tm.assert_series_equal(pd.Series(result), expected, - check_names=False) + if expected is None: + if mask is None: + expected = pd.Series(values) + else: + expected = pd.Series(np.ma.masked_array(values, mask=mask)) + + tm.assert_series_equal(pd.Series(result), expected, check_names=False) def _check_array_from_pandas_roundtrip(np_array, type=None): @@ -559,6 +559,11 @@ def test_float_nulls_to_ints(self): assert table[0].to_pylist() == [1, 2, None] tm.assert_frame_equal(df, table.to_pandas()) + def test_float_nulls_to_boolean(self): + s = pd.Series([0.0, 1.0, 2.0, None, -3.0]) + expected = pd.Series([False, True, True, None, True]) + _check_array_roundtrip(s, expected=expected, type=pa.bool_()) + def test_integer_no_nulls(self): data = OrderedDict() fields = [] @@ -672,6 +677,26 @@ def test_boolean_nulls(self): tm.assert_frame_equal(result, ex_frame) + def test_boolean_to_int(self): + # test from dtype=bool + s = pd.Series([True, True, False, True, True] * 2) + expected = pd.Series([1, 1, 0, 1, 1] * 2) + _check_array_roundtrip(s, expected=expected, type=pa.int64()) + + def test_boolean_objects_to_int(self): + # test from dtype=object + s = pd.Series([True, True, False, True, True] * 2, dtype=object) + expected = pd.Series([1, 1, 0, 1, 1] * 2) + expected_msg = 'Expected integer, got bool' + with pytest.raises(pa.ArrowTypeError, match=expected_msg): + _check_array_roundtrip(s, expected=expected, type=pa.int64()) + + def test_boolean_nulls_to_float(self): + # test from dtype=object + s = pd.Series([True, True, False, None, True] * 2) + expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2) + _check_array_roundtrip(s, expected=expected, type=pa.float64()) + def test_float_object_nulls(self): arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) df = pd.DataFrame({'floats': arr})