Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions cpp/src/arrow/compute/kernels/cast-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) {
AssertBufferSame(*arr, *result, 1);
}

// Casting a boolean array to int32 must map true -> 1 and false -> 0 while
// leaving the validity of every slot untouched.
TEST_F(TestCast, FromBoolean) {
  CastOptions options;

  // Twenty slots, all valid except index 3.
  const size_t length = 20;
  vector<bool> is_valid(length, true);
  is_valid[3] = false;

  // The input is false (and the expected output 0) wherever i % 3 == 1;
  // every other slot is true / 1.
  vector<bool> v1;
  vector<int32_t> e1;
  for (size_t i = 0; i < length; ++i) {
    const bool value = (i % 3 != 1);
    v1.push_back(value);
    e1.push_back(value ? 1 : 0);
  }

  CheckCase<BooleanType, bool, Int32Type, int32_t>(boolean(), v1, is_valid, int32(), e1,
                                                   options);
}

TEST_F(TestCast, ToBoolean) {
CastOptions options;
for (auto type : kNumericTypes) {
Expand Down
66 changes: 29 additions & 37 deletions cpp/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ namespace arrow {

using internal::checked_cast;
using internal::CopyBitmap;
using internal::GenerateBitsUnrolled;

namespace py {

Expand Down Expand Up @@ -246,6 +247,11 @@ class NumPyConverter {
return Status::OK();
}

// Called before ConvertData to ensure Numpy input buffer is in expected
// Arrow layout
template <typename ArrowType>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment or docstring here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Status PrepareInputData(std::shared_ptr<Buffer>* data);

// ----------------------------------------------------------------------
// Traditional visitor conversion for non-object arrays

Expand Down Expand Up @@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po
} // namespace

template <typename ArrowType>
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
if (is_strided()) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not too sure about the details of this - is it possible for a boolean array to be strided?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All Numpy arrays can be strided, yes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, thanks. This is handled by the Ndarray1DIndexer right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes (see operator[]).

RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
} else if (dtype_->type_num == NPY_BOOL) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit odd. Seems like we should be using a templated approach here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I thought about that but the input type isn't parameterized until the casting operation is called and this conversion needs to happen before then. Also, I think bool is the only case where we need a conversion like this since we are going from 1 byte to a bitmask, so it seems best to do a simple check. What do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO, we can start with a simple check and refine later if needed.

int64_t nbytes = BitUtil::BytesForBits(length_);
std::shared_ptr<Buffer> buffer;
RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));

Ndarray1DIndexer<uint8_t> values(arr_);
int64_t i = 0;
const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used the unrolled version, just wondering if there is really any reason to use the other?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not unless the generate function is heavy (it is inlined several times), which isn't the case here.


*data = buffer;
} else {
// Can zero-copy
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
}

return Status::OK();
}

template <typename ArrowType>
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
RETURN_NOT_OK(PrepareInputData<ArrowType>(data));

std::shared_ptr<DataType> input_type;
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));

Expand All @@ -426,38 +450,12 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
return Status::OK();
}

template <>
inline Status NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing this to allow the casting kernel to do the conversion makes my test pass, but fails lots of others, so this is not the right fix.

int64_t nbytes = BitUtil::BytesForBits(length_);
std::shared_ptr<Buffer> buffer;
RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));

Ndarray1DIndexer<uint8_t> values(arr_);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem seems to be reading values as uint8_t

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like a dtype check did not happen; why was this code path being hit silently? I can take a closer look too, but I'm curious if you know.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The output array is visited and there is no check for the dtype except in the ConvertData specializations for date types. In the case of a boolean output type, it assumed the numpy data was uint8_t. In the other cases, the dtype is sent to CastBuffer, where the input and output types are parameterized, but it expects the buffer to be in Arrow layout, so for bools it needs to be converted to a bitmask beforehand.


uint8_t* bitmap = buffer->mutable_data();

memset(bitmap, 0, nbytes);
for (int i = 0; i < length_; ++i) {
if (values[i] > 0) {
BitUtil::SetBit(bitmap, i);
}
}

*data = buffer;
return Status::OK();
}

template <>
inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
if (is_strided()) {
RETURN_NOT_OK(CopyStridedArray<Date32Type>(arr_, length_, pool_, data));
} else {
// Can zero-copy
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
}

std::shared_ptr<DataType> input_type;

RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't really make sense to convert from bool to date, but it's better to reuse the same code to prevent any weird errors, just in case.


auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
if (dtype_->type_num == NPY_DATETIME) {
// If we have inbound datetime64[D] data, this needs to be downcasted
Expand Down Expand Up @@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d

template <>
inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
if (is_strided()) {
RETURN_NOT_OK(CopyStridedArray<Date64Type>(arr_, length_, pool_, data));
} else {
// Can zero-copy
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
}

constexpr int64_t kMillisecondsInDay = 86400000;

std::shared_ptr<DataType> input_type;

RETURN_NOT_OK(PrepareInputData<Date64Type>(data));

auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
if (dtype_->type_num == NPY_DATETIME) {
// If we have inbound datetime64[D] data, this needs to be downcasted
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/python/type_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ template <>
// Traits specialization for the Arrow BOOL type id.
struct arrow_traits<Type::BOOL> {
// NumPy type number paired with Arrow BOOL.
static constexpr int npy_type = NPY_BOOL;
// Booleans are declared as not supporting nulls.
static constexpr bool supports_nulls = false;
// Value type that npy_traits associates with NPY_BOOL.
typedef typename npy_traits<NPY_BOOL>::value_type T;
};

#define INT_DECL(TYPE) \
Expand Down
39 changes: 32 additions & 7 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None,
else:
assert arr.null_count == (mask | values_nulls).sum()

if mask is None:
tm.assert_series_equal(pd.Series(result), pd.Series(values),
check_names=False)
else:
expected = pd.Series(np.ma.masked_array(values, mask=mask))
tm.assert_series_equal(pd.Series(result), expected,
check_names=False)
if expected is None:
if mask is None:
expected = pd.Series(values)
else:
expected = pd.Series(np.ma.masked_array(values, mask=mask))

tm.assert_series_equal(pd.Series(result), expected, check_names=False)


def _check_array_from_pandas_roundtrip(np_array, type=None):
Expand Down Expand Up @@ -559,6 +559,11 @@ def test_float_nulls_to_ints(self):
assert table[0].to_pylist() == [1, 2, None]
tm.assert_frame_equal(df, table.to_pandas())

def test_float_nulls_to_boolean(self):
    # 0.0 is expected to become False, every other number (including the
    # negative one) True, and the missing entry is expected to stay missing.
    floats = pd.Series([0.0, 1.0, 2.0, None, -3.0])
    as_bools = pd.Series([False, True, True, None, True])
    _check_array_roundtrip(floats, expected=as_bools, type=pa.bool_())

def test_integer_no_nulls(self):
data = OrderedDict()
fields = []
Expand Down Expand Up @@ -672,6 +677,26 @@ def test_boolean_nulls(self):

tm.assert_frame_equal(result, ex_frame)

def test_boolean_to_int(self):
    # Source Series has dtype=bool; requesting int64 should give 1 / 0.
    bools = pd.Series([True, True, False, True, True] * 2)
    as_ints = pd.Series([1, 1, 0, 1, 1] * 2)
    _check_array_roundtrip(bools, expected=as_ints, type=pa.int64())

def test_boolean_objects_to_int(self):
# test from dtype=object
s = pd.Series([True, True, False, True, True] * 2, dtype=object)
expected = pd.Series([1, 1, 0, 1, 1] * 2)
expected_msg = 'Expected integer, got bool'
with pytest.raises(pa.ArrowTypeError, match=expected_msg):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this shouldn't raise an error if the type was specified; I can look at that in another PR.

_check_array_roundtrip(s, expected=expected, type=pa.int64())

def test_boolean_nulls_to_float(self):
    # Test from dtype=object: booleans with a missing entry, requested as
    # float64. True / False should become 1.0 / 0.0 and None stays missing.
    bools_with_nulls = pd.Series([True, True, False, None, True] * 2)
    as_floats = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2)
    _check_array_roundtrip(bools_with_nulls, expected=as_floats,
                           type=pa.float64())

def test_float_object_nulls(self):
arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
df = pd.DataFrame({'floats': arr})
Expand Down