diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 781e0af87a8..c3a0df5d8a7 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) { AssertBufferSame(*arr, *result, 1); } +TEST_F(TestCast, FromBoolean) { + CastOptions options; + + vector is_valid(20, true); + is_valid[3] = false; + + vector v1(is_valid.size(), true); + vector e1(is_valid.size(), 1); + for (size_t i = 0; i < v1.size(); ++i) { + if (i % 3 == 1) { + v1[i] = false; + e1[i] = 0; + } + } + + CheckCase(boolean(), v1, is_valid, int32(), e1, + options); +} + TEST_F(TestCast, ToBoolean) { CastOptions options; for (auto type : kNumericTypes) { diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index aa28b6e8708..aada6bf598c 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -63,6 +63,7 @@ namespace arrow { using internal::checked_cast; using internal::CopyBitmap; +using internal::GenerateBitsUnrolled; namespace py { @@ -246,6 +247,11 @@ class NumPyConverter { return Status::OK(); } + // Called before ConvertData to ensure Numpy input buffer is in expected + // Arrow layout + template + Status PrepareInputData(std::shared_ptr* data); + // ---------------------------------------------------------------------- // Traditional visitor conversion for non-object arrays @@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po } // namespace template -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { +inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { if (is_strided()) { RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); + } else if (dtype_->type_num == NPY_BOOL) { + int64_t nbytes = BitUtil::BytesForBits(length_); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); + + Ndarray1DIndexer values(arr_); + int64_t i = 0; + const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; + GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); + + *data = buffer; } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } + return Status::OK(); +} + +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + RETURN_NOT_OK(PrepareInputData(data)); + std::shared_ptr input_type; RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); @@ -426,38 +450,12 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { return Status::OK(); } -template <> -inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - int64_t nbytes = BitUtil::BytesForBits(length_); - std::shared_ptr buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer)); - - Ndarray1DIndexer values(arr_); - - uint8_t* bitmap = buffer->mutable_data(); - - memset(bitmap, 0, nbytes); - for (int i = 0; i < length_; ++i) { - if (values[i] > 0) { - BitUtil::SetBit(bitmap, i); - } - } - - *data = buffer; - return Status::OK(); -} - template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted @@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { - if (is_strided()) { - RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data)); - } else { - // Can zero-copy - *data = std::make_shared(reinterpret_cast(arr_)); - } - constexpr int64_t kMillisecondsInDay = 86400000; - std::shared_ptr input_type; + RETURN_NOT_OK(PrepareInputData(data)); + auto date_dtype = reinterpret_cast(dtype_->c_metadata); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h index d90517a60a2..bc71ec4e90b 100644 --- a/cpp/src/arrow/python/type_traits.h +++ b/cpp/src/arrow/python/type_traits.h @@ -149,6 +149,7 @@ template <> struct arrow_traits { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; + typedef typename npy_traits::value_type T; }; #define INT_DECL(TYPE) \ diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 3e89f5eb4ff..cd7f4999ace 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None, else: assert arr.null_count == (mask | values_nulls).sum() - if mask is None: - tm.assert_series_equal(pd.Series(result), pd.Series(values), - check_names=False) - else: - expected = pd.Series(np.ma.masked_array(values, mask=mask)) - tm.assert_series_equal(pd.Series(result), expected, - check_names=False) + if expected is None: + if mask is None: + expected = pd.Series(values) + else: + expected = pd.Series(np.ma.masked_array(values, mask=mask)) + + tm.assert_series_equal(pd.Series(result), expected, check_names=False) def _check_array_from_pandas_roundtrip(np_array, type=None): @@ -559,6 +559,11 @@ def test_float_nulls_to_ints(self): assert table[0].to_pylist() == [1, 2, None] tm.assert_frame_equal(df, table.to_pandas()) + def test_float_nulls_to_boolean(self): + s = pd.Series([0.0, 1.0, 2.0, None, -3.0]) + expected = pd.Series([False, True, True, None, True]) + _check_array_roundtrip(s, expected=expected, type=pa.bool_()) + def test_integer_no_nulls(self): data = OrderedDict() fields = [] @@ -672,6 +677,26 @@ def test_boolean_nulls(self): tm.assert_frame_equal(result, ex_frame) + def test_boolean_to_int(self): + # test from dtype=bool + s = pd.Series([True, True, False, True, True] * 2) + expected = pd.Series([1, 1, 0, 1, 1] * 2) + _check_array_roundtrip(s, expected=expected, type=pa.int64()) + + def test_boolean_objects_to_int(self): + # test from dtype=object + s = pd.Series([True, True, False, True, True] * 2, dtype=object) + expected = pd.Series([1, 1, 0, 1, 1] * 2) + expected_msg = 'Expected integer, got bool' + with pytest.raises(pa.ArrowTypeError, match=expected_msg): + _check_array_roundtrip(s, expected=expected, type=pa.int64()) + + def test_boolean_nulls_to_float(self): + # test from dtype=object + s = pd.Series([True, True, False, None, True] * 2) + expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2) + _check_array_roundtrip(s, expected=expected, type=pa.float64()) + def test_float_object_nulls(self): arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) df = pd.DataFrame({'floats': arr})