diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc
index 6b1cec0f5cc..8e25be62440 100644
--- a/cpp/src/arrow/compute/kernels/scalar_validity.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -20,8 +20,11 @@
 
 #include "arrow/compute/api_scalar.h"
 #include "arrow/compute/kernels/common_internal.h"
+#include "arrow/type.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/ree_util.h"
 
 namespace arrow {
 
@@ -82,6 +85,80 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
   }
 }
 
+// Sets output bits for the logically-null slots of a sparse union. Unions do not
+// represent nulls in a validity bitmap of their own, so each slot's nullness is
+// read from the child selected by that slot's type code.
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                          int64_t out_offset) {
+  const auto* sparse_union_type =
+      ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+    if (span.child_data[child_id].IsNull(i + span.offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+// Dense-union counterpart: the slot's entry in the offsets buffer (buffer 2)
+// gives the position to test inside the selected child.
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                         int64_t out_offset) {
+  const auto* dense_union_type =
+      ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);      // NOLINT
+  const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
+    if (span.child_data[child_id].IsNull(offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+// Sets output bits for every run whose value is null. Returns without touching
+// the output when the values child has no validity bitmap to consult.
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto& values = arrow::ree_util::ValuesArray(span);
+  DCHECK(!is_nested(values.type->id()));
+  const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;
+
+  if (!values_bitmap) {
+    return;
+  }
+
+  arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
+  auto end = ree_span.end();
+  for (auto it = ree_span.begin(); it != end; ++it) {
+    if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
+      bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
+                          true);
+    }
+  }
+}
+
+// Dispatches to the typed implementation matching the run-ends array's type.
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
+  if (type_id == Type::INT16) {
+    SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
+  } else if (type_id == Type::INT32) {
+    SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
+  } else {
+    DCHECK_EQ(type_id, Type::INT64);
+    SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
+  }
+}
+
 Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& arr = batch[0].array;
   ArraySpan* out_span = out->array_span_mutable();
@@ -91,17 +168,29 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
     return Status::OK();
   }
 
-  const auto& options = NanOptionsState::Get(ctx);
   uint8_t* out_bitmap = out_span->buffers[1].data;
   if (arr.GetNullCount() > 0) {
     // Input has nulls => output is the inverted null (validity) bitmap.
     InvertBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap,
                  out_span->offset);
   } else {
-    // Input has no nulls => output is entirely false.
+    // Input has no nulls => output is entirely false...
     bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
+    // ...except for some types (e.g. union types and REE) which do not
+    // represent nulls in the validity bitmap.
+    const auto t = arr.type->id();
+    if (t == Type::SPARSE_UNION) {
+      SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else if (t == Type::DENSE_UNION) {
+      SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else if (t == Type::RUN_END_ENCODED) {
+      SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else {
+      DCHECK(arrow::internal::HasValidityBitmap(t));
+    }
   }
 
+  const auto& options = NanOptionsState::Get(ctx);
   if (is_floating(arr.type->id()) && options.nan_is_null) {
     switch (arr.type->id()) {
       case Type::FLOAT:
diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc
index 94d951c8382..48353daf92e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
                    "[true, false, false, true]", &nan_is_null_options);
 }
 
+TEST_F(TestBooleanValidityKernels, IsNullUnion) {
+  auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
+  auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
+  auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
+  ASSERT_OK_AND_ASSIGN(auto arr1,
+                       SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
+  auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
+  CheckScalarUnary("is_null", arr1, expected);
+
+  auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
+  auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
+  auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
+  ASSERT_OK_AND_ASSIGN(auto arr2,
+                       DenseUnionArray::Make(*type_ids, *value_offsets,
+                                             {dense_field_i64, dense_field_str}));
+  CheckScalarUnary("is_null", arr2, expected);
+}
+
+TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
+  auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
+  auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
+  ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
+  ASSERT_OK(ree_array->ValidateFull());
+  auto expected =
+      ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
+  CheckScalarUnary("is_null", ree_array, expected);
+}
+
 TEST(TestValidityKernels, IsFinite) {
   for (const auto& ty : IntTypes()) {
     CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 152cae1be1f..b20b87f19ab 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1656,6 +1656,48 @@ def test_is_null():
     assert result.equals(expected)
 
 
+def test_is_null_union():
+    arr = pa.UnionArray.from_sparse(
+        pa.array([0, 1, 0, 0, 1], type=pa.int8()),
+        [
+            pa.array([0.0, 1.1, None, 3.3, 4.4]),
+            pa.array([True, None, False, True, False]),
+        ]
+    )
+    assert arr.to_pylist() == [0.0, None, None, 3.3, False]
+    result = arr.is_null()
+    expected = pa.array([False, True, True, False, False])
+    assert result.equals(expected)
+    result = arr.slice(1, 2).is_null()
+    assert result.equals(expected.slice(1, 2))
+
+    arr = pa.UnionArray.from_dense(
+        pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
+        pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
+        [
+            pa.array([0.0, 1.1, None, 3.3]),
+            pa.array([True, None, False])
+        ]
+    )
+    assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
+    result = arr.is_null()
+    expected = pa.array([False, False, False, True, False, True, False])
+    assert result.equals(expected)
+    result = arr.slice(1, 3).is_null()
+    assert result.equals(expected.slice(1, 3))
+
+
+@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
+def test_is_null_run_end_encoded(typ):
+    decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
+    arr = pc.run_end_encode(decoded, run_end_type=typ)
+    result = arr.is_null()
+    expected = pa.array([False, False, False, True, False, False, True, True, False])
+    assert result.equals(expected)
+    result = arr.slice(2, 5).is_null()
+    assert result.equals(expected.slice(2, 5))
+
+
 def test_is_nan():
     arr = pa.array([1, 2, 3, None, np.nan])
     result = arr.is_nan()