85 changes: 83 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -20,8 +20,11 @@
#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common_internal.h"

#include "arrow/type.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/ree_util.h"

namespace arrow {

@@ -82,6 +85,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
}
}

static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
Review comment (Member): Would it be worth reusing the choose kernel for this case? (A sketch of this idea follows this function.)

int64_t out_offset) {
const auto* sparse_union_type =
::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);

const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = sparse_union_type->child_ids()[types[i]];
if (span.child_data[child_id].IsNull(i + span.offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}
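As a rough illustration of the reviewer's suggestion above — not part of this PR — the same result could be obtained from public compute functions by building per-child null masks and letting the choose kernel pick between them, row by row. This sketch assumes pc.choose accepts the union's int8 type codes as indices (they may need a cast to a wider integer type) and that the union uses the default type codes 0..N-1; otherwise the codes would first have to be mapped through child_ids:

```python
import pyarrow as pa
import pyarrow.compute as pc

# Sparse union with two children, mirroring the data in the new C++ test below.
type_ids = pa.array([1, 0, 0, 1, 1], type=pa.int8())
field_i64 = pa.array([None, 127, None, None, None], type=pa.int64())
field_str = pa.array(["abcd", None, None, None, ""], type=pa.string())
arr = pa.UnionArray.from_sparse(type_ids, [field_i64, field_str])

# Per-child null masks...
child_null_masks = [pc.is_null(field_i64), pc.is_null(field_str)]
# ...then "choose" selects, per row, the mask entry of the child named by the
# type code, which is the union's logical null mask.
logical_nulls = pc.choose(type_ids, *child_null_masks)

print(logical_nulls.to_pylist())  # [False, False, True, True, False]
```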

static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto* dense_union_type =
::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
DCHECK_LE(span.child_data.size(), 128);
Review comment (Member): Someday if we have more dense union utilities the same trick as for sparse unions could be used here.


const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT
for (int64_t i = 0; i < span.length; i++) {
const int8_t child_id = dense_union_type->child_ids()[types[i]];
const int32_t offset = offsets[i];
if (span.child_data[child_id].IsNull(offset)) {
bit_util::SetBit(out_bitmap, i + out_offset);
}
}
}

template <typename RunEndCType>
void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
Review comment (Member): Here it seems we could trivially create an ree($index_type, bool), then return that directly or decode it using existing utilities. (A sketch of this idea follows the run-end dispatch function below.)

int64_t out_offset) {
const auto& values = arrow::ree_util::ValuesArray(span);
DCHECK(!is_nested(values.type->id()));
const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;

if (!values_bitmap) {
return;
}

arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
auto end = ree_span.end();
for (auto it = ree_span.begin(); it != end; ++it) {
if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
true);
}
}
}

void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
int64_t out_offset) {
const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
if (type_id == Type::INT16) {
SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
} else if (type_id == Type::INT32) {
SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
} else {
DCHECK_EQ(type_id, Type::INT64);
SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
}
}
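The reviewer's ree($index_type, bool) idea above can be illustrated with public pyarrow APIs — this sketch is not part of the PR and reuses the data of the Python test added further down: keep the run ends unchanged, run is_null over the physical values, and either return the resulting run-end-encoded boolean array directly or expand it with the existing decoder.

```python
import pyarrow as pa
import pyarrow.compute as pc

# Nine logical values in five runs, as in the Python test added below.
decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
ree = pc.run_end_encode(decoded, run_end_type=pa.int32())

# Build ree(int32, bool): the per-run null flags over the unchanged run ends.
ree_nulls = pa.RunEndEncodedArray.from_arrays(ree.run_ends, pc.is_null(ree.values))

# Either return ree_nulls directly, or decode it to a plain boolean array.
assert pc.run_end_decode(ree_nulls).equals(pc.is_null(decoded))
```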

Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const ArraySpan& arr = batch[0].array;
ArraySpan* out_span = out->array_span_mutable();
@@ -91,17 +160,29 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return Status::OK();
}

const auto& options = NanOptionsState::Get(ctx);
uint8_t* out_bitmap = out_span->buffers[1].data;
if (arr.GetNullCount() > 0) {
// Input has nulls => output is the inverted null (validity) bitmap.
InvertBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap,
out_span->offset);
} else {
// Input has no nulls => output is entirely false.
// Input has no nulls => output is entirely false...
bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
// ...except for some types (e.g. union types and REE) which do not
// represent nulls in the validity bitmap.
const auto t = arr.type->id();
if (t == Type::SPARSE_UNION) {
SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::DENSE_UNION) {
SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
} else if (t == Type::RUN_END_ENCODED) {
SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
} else {
DCHECK(arrow::internal::HasValidityBitmap(t));
}
}

const auto& options = NanOptionsState::Get(ctx);
if (is_floating(arr.type->id()) && options.nan_is_null) {
switch (arr.type->id()) {
case Type::FLOAT:
28 changes: 28 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_validity_test.cc
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
"[true, false, false, true]", &nan_is_null_options);
}

TEST_F(TestBooleanValidityKernels, IsNullUnion) {
auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
ASSERT_OK_AND_ASSIGN(auto arr1,
SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
CheckScalarUnary("is_null", arr1, expected);

auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
ASSERT_OK_AND_ASSIGN(auto arr2,
DenseUnionArray::Make(*type_ids, *value_offsets,
{dense_field_i64, dense_field_str}));
CheckScalarUnary("is_null", arr2, expected);
}

TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
ASSERT_OK(ree_array->ValidateFull());
auto expected =
ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
CheckScalarUnary("is_null", ree_array, expected);
}

TEST(TestValidityKernels, IsFinite) {
for (const auto& ty : IntTypes()) {
CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
42 changes: 42 additions & 0 deletions python/pyarrow/tests/test_compute.py
@@ -1656,6 +1656,48 @@ def test_is_null():
assert result.equals(expected)


def test_is_null_union():
arr = pa.UnionArray.from_sparse(
pa.array([0, 1, 0, 0, 1], type=pa.int8()),
[
pa.array([0.0, 1.1, None, 3.3, 4.4]),
pa.array([True, None, False, True, False]),
]
)
assert arr.to_pylist() == [0.0, None, None, 3.3, False]
result = arr.is_null()
expected = pa.array([False, True, True, False, False])
assert result.equals(expected)
result = arr.slice(1, 2).is_null()
assert result.equals(expected.slice(1, 2))

arr = pa.UnionArray.from_dense(
pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
[
pa.array([0.0, 1.1, None, 3.3]),
pa.array([True, None, False])
]
)
assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
result = arr.is_null()
expected = pa.array([False, False, False, True, False, True, False])
assert result.equals(expected)
result = arr.slice(1, 3).is_null()
assert result.equals(expected.slice(1, 3))


@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
def test_is_null_run_end_encoded(typ):
decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
arr = pc.run_end_encode(decoded, run_end_type=typ)
result = arr.is_null()
expected = pa.array([False, False, False, True, False, False, True, True, False])
assert result.equals(expected)
result = arr.slice(2, 5).is_null()
assert result.equals(expected.slice(2, 5))


def test_is_nan():
arr = pa.array([1, 2, 3, None, np.nan])
result = arr.is_nan()