Add is_null support for union and run-end-encoded (REE) inputs.

Summary of the hunks below:

cpp/src/arrow/array/data.cc
  * FillRunEndsArrayForScalar<RunEndType>() stores the single run end 1 in the
    span's scratch_space and exposes it as buffers[1] of a span with length 1
    and null_count 0.
  * ArraySpan::FillFromScalar() sets null_count to 1 for NA and to 0 for union
    and RUN_END_ENCODED scalars; only the remaining types derive null_count
    from value.is_valid and get a one-bit validity buffer in buffers[0].
    For RUN_END_ENCODED it fills two children: the run ends (int16, int32 or
    int64) and the scalar's value.

cpp/src/arrow/compute/kernels/scalar_validity.cc
  * In the "input has no nulls" branch IsNullExec() still clears the whole
    output first, then sets the output bits for logically null slots:
      - sparse union: the selected child is null at slot i + span.offset;
      - dense union: the selected child is null at the slot given by the
        offsets buffer (buffer 2);
      - REE: every run whose values validity bit is unset is marked as a
        whole; nothing is done when the values have no validity bitmap.

cpp/src/arrow/compute/kernels/scalar_validity_test.cc,
python/pyarrow/tests/test_compute.py
  * Tests for sparse/dense union and REE inputs, including sliced arrays on the
    Python side.

NOTE(review): this patch was rebuilt from a flattened copy in which line breaks
and every angle-bracket template argument list were missing. The template
arguments and the indentation of unchanged context lines were inferred from
usage and hunk line counts; confirm against the upstream files before applying.

diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 8764e9c354c..77b8209a180 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -236,6 +236,18 @@ void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_siz
   span->buffers[buffer_index].size = 2 * sizeof(offset_type);
 }
 
+template <typename RunEndType>
+void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) {
+  using RunEndCType = typename RunEndType::c_type;
+  auto buffer = reinterpret_cast<RunEndCType*>(span->scratch_space);
+  buffer[0] = static_cast<RunEndCType>(1);
+  span->type = run_end_type;
+  span->length = 1;
+  span->null_count = 0;
+  span->buffers[1].data = reinterpret_cast<uint8_t*>(buffer);
+  span->buffers[1].size = sizeof(RunEndCType);
+}
+
 int GetNumBuffers(const DataType& type) {
   switch (type.id()) {
     case Type::NA:
@@ -304,9 +316,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
 
   Type::type type_id = value.type->id();
 
-  // Populate null count and validity bitmap (only for non-union/null types)
-  this->null_count = value.is_valid ? 0 : 1;
-  if (!is_union(type_id) && type_id != Type::NA) {
+  // Populate null count and validity bitmap
+  if (type_id == Type::NA) {
+    this->null_count = 1;
+  } else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
+    this->null_count = 0;
+  } else {
+    this->null_count = value.is_valid ? 0 : 1;
     this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
     this->buffers[0].size = 1;
   }
@@ -422,6 +438,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
         this->child_data[i].FillFromScalar(*scalar.value[i]);
       }
     }
+  } else if (type_id == Type::RUN_END_ENCODED) {
+    const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
+    this->child_data.resize(2);
+    auto& run_end_type = scalar.run_end_type();
+    switch (run_end_type->id()) {
+      case Type::INT16:
+        FillRunEndsArrayForScalar<Int16Type>(&this->child_data[0], run_end_type.get());
+        break;
+      case Type::INT32:
+        FillRunEndsArrayForScalar<Int32Type>(&this->child_data[0], run_end_type.get());
+        break;
+      default:
+        DCHECK_EQ(run_end_type->id(), Type::INT64);
+        FillRunEndsArrayForScalar<Int64Type>(&this->child_data[0], run_end_type.get());
+    }
+    this->child_data[1].FillFromScalar(*scalar.value);
   } else if (type_id == Type::EXTENSION) {
     // Pass through storage
     const auto& scalar = checked_cast<const ExtensionScalar&>(value);
diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc
index 6b1cec0f5cc..bedeff1a127 100644
--- a/cpp/src/arrow/compute/kernels/scalar_validity.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -22,6 +22,8 @@
 
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/ree_util.h"
 
 namespace arrow {
 
@@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
   }
 }
 
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                          int64_t out_offset) {
+  const auto* sparse_union_type =
+      ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+    if (span.child_data[child_id].IsNull(i + span.offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                         int64_t out_offset) {
+  const auto* dense_union_type =
+      ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);      // NOLINT
+  const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
+    if (span.child_data[child_id].IsNull(offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto& values = arrow::ree_util::ValuesArray(span);
+  DCHECK(!is_nested(values.type->id()));
+  const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;
+
+  if (!values_bitmap) {
+    return;
+  }
+
+  arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
+  auto end = ree_span.end();
+  for (auto it = ree_span.begin(); it != end; ++it) {
+    if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
+      bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
+                          true);
+    }
+  }
+}
+
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto type_id = arrow::ree_util::RunEndsArray(span).type->id();
+  if (type_id == Type::INT16) {
+    SetREELogicalNullBits<int16_t>(span, out_bitmap, out_offset);
+  } else if (type_id == Type::INT32) {
+    SetREELogicalNullBits<int32_t>(span, out_bitmap, out_offset);
+  } else {
+    DCHECK_EQ(type_id, Type::INT64);
+    SetREELogicalNullBits<int64_t>(span, out_bitmap, out_offset);
+  }
+}
+
 Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& arr = batch[0].array;
   ArraySpan* out_span = out->array_span_mutable();
@@ -100,6 +168,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   } else {
     // Input has no nulls => output is entirely false.
     bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false);
+    // Except for union/ree which never has physical nulls, but can have logical
+    // nulls from the child arrays -> set those bits to true
+    const auto t = arr.type->id();
+    if (t == Type::SPARSE_UNION) {
+      SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else if (t == Type::DENSE_UNION) {
+      SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset);
+    } else if (t == Type::RUN_END_ENCODED) {
+      SetREELogicalNullBits(arr, out_bitmap, out_span->offset);
+    }
   }
 
   if (is_floating(arr.type->id()) && options.nan_is_null) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc
index 94d951c8382..48353daf92e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
                    "[true, false, false, true]", &nan_is_null_options);
 }
 
+TEST_F(TestBooleanValidityKernels, IsNullUnion) {
+  auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
+  auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
+  auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
+  ASSERT_OK_AND_ASSIGN(auto arr1,
+                       SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
+  auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
+  CheckScalarUnary("is_null", arr1, expected);
+
+  auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
+  auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
+  auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
+  ASSERT_OK_AND_ASSIGN(auto arr2,
+                       DenseUnionArray::Make(*type_ids, *value_offsets,
+                                             {dense_field_i64, dense_field_str}));
+  CheckScalarUnary("is_null", arr2, expected);
+}
+
+TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
+  auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
+  auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
+  ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
+  ASSERT_OK(ree_array->ValidateFull());
+  auto expected =
+      ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
+  CheckScalarUnary("is_null", ree_array, expected);
+}
+
 TEST(TestValidityKernels, IsFinite) {
   for (const auto& ty : IntTypes()) {
     CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")},
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 875d0e613b6..a625506f305 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1637,6 +1637,48 @@ def test_is_null():
     assert result.equals(expected)
 
 
+def test_is_null_union():
+    arr = pa.UnionArray.from_sparse(
+        pa.array([0, 1, 0, 0, 1], type=pa.int8()),
+        [
+            pa.array([0.0, 1.1, None, 3.3, 4.4]),
+            pa.array([True, None, False, True, False]),
+        ]
+    )
+    assert arr.to_pylist() == [0.0, None, None, 3.3, False]
+    result = arr.is_null()
+    expected = pa.array([False, True, True, False, False])
+    assert result.equals(expected)
+    result = arr.slice(1, 2).is_null()
+    assert result.equals(expected.slice(1, 2))
+
+    arr = pa.UnionArray.from_dense(
+        pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()),
+        pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()),
+        [
+            pa.array([0.0, 1.1, None, 3.3]),
+            pa.array([True, None, False])
+        ]
+    )
+    assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False]
+    result = arr.is_null()
+    expected = pa.array([False, False, False, True, False, True, False])
+    assert result.equals(expected)
+    result = arr.slice(1, 3).is_null()
+    assert result.equals(expected.slice(1, 3))
+
+
+@pytest.mark.parametrize("typ", ["int16", "int32", "int64"])
+def test_is_null_run_end_encoded(typ):
+    decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1])
+    arr = pc.run_end_encode(decoded, run_end_type=typ)
+    result = arr.is_null()
+    expected = pa.array([False, False, False, True, False, False, True, True, False])
+    assert result.equals(expected)
+    result = arr.slice(2, 5).is_null()
+    assert result.equals(expected.slice(2, 5))
+
+
 def test_is_nan():
     arr = pa.array([1, 2, 3, None, np.nan])
     result = arr.is_nan()