From 010f2bb3f0b2bcabd921b1baa1f57d6077c22905 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 11 Apr 2023 12:37:23 +0200 Subject: [PATCH 01/14] GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls --- .../arrow/compute/kernels/scalar_validity.cc | 13 +++++++ cpp/src/arrow/util/ree_util.cc | 33 ++++++++++++++++ cpp/src/arrow/util/ree_util.h | 2 + cpp/src/arrow/util/union_util.cc | 29 ++++++++++++++ cpp/src/arrow/util/union_util.h | 4 ++ python/pyarrow/tests/test_compute.py | 38 +++++++++++++++++++ 6 files changed, 119 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 6b1cec0f5cc..65595824656 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -22,6 +22,8 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/union_util.h" +#include "arrow/util/ree_util.h" namespace arrow { @@ -102,6 +104,17 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false); } + const auto t = arr.type->id(); + if (t == Type::SPARSE_UNION) { + union_util::SetLogicalSparseUnionNullBits(arr, out_bitmap, out_span->offset); + } + else if (t == Type::DENSE_UNION) { + union_util::SetLogicalDenseUnionNullBits(arr, out_bitmap, out_span->offset); + } + else if (t == Type::RUN_END_ENCODED) { + ree_util::SetLogicalNullBits(arr, out_bitmap, out_span->offset); + } + if (is_floating(arr.type->id()) && options.nan_is_null) { switch (arr.type->id()) { case Type::FLOAT: diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index fcd6c204e06..4e6e8ec2263 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -47,6 +47,25 @@ int64_t LogicalNullCount(const ArraySpan& span) { return null_count; } +template +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { + const auto& values = ValuesArray(span); + const auto& values_bitmap = values.buffers[0].data; + + RunEndEncodedArraySpan ree_span(span); + auto end = ree_span.end(); + for (auto it = ree_span.begin(); it != end; ++it) { + const bool is_null = + values_bitmap && + !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); + if (is_null) { + for (int64_t i = 0; i < it.run_length(); i++) { + bit_util::SetBit(out_bitmap, it.logical_position() + i + out_offset); + } + } + } +} + } // namespace int64_t LogicalNullCount(const ArraySpan& span) { @@ -61,6 +80,20 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { + const auto type_id = RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { + SetLogicalNullBits(span, out_bitmap, out_offset); + } + else if (type_id == Type::INT32) { + SetLogicalNullBits(span, out_bitmap, out_offset); + } + else { + DCHECK_EQ(type_id, Type::INT64); + SetLogicalNullBits(span, out_bitmap, out_offset); + } +} + int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) { const auto type_id = RunEndsArray(span).type->id(); if (type_id == Type::INT16) { diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index a3d3d16c0da..523bb914b4b 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -56,6 +56,8 @@ Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type, /// \brief Compute the logical null count of an REE array int64_t LogicalNullCount(const ArraySpan& span); +void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset); + namespace internal { /// \brief Uses binary-search to find the physical offset given a logical offset diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc index ca3ecf28644..771490d4a1b 100644 --- a/cpp/src/arrow/util/union_util.cc +++ b/cpp/src/arrow/util/union_util.cc @@ -55,4 +55,33 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) { return null_count; } +void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { + const auto* sparse_union_type = + internal::checked_cast(span.type); + DCHECK_LE(span.child_data.size(), 128); + + const int8_t* types = span.GetValues(1); // NOLINT + for (int64_t i = 0; i < span.length; i++) { + const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]]; + if (span.child_data[child_id].IsNull(i)) { + bit_util::SetBit(out_bitmap, i + out_offset); + } + } +} + +void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { + const auto* dense_union_type = internal::checked_cast(span.type); + DCHECK_LE(span.child_data.size(), 128); + + const int8_t* types = span.GetValues(1); // NOLINT + const int32_t* offsets = span.GetValues(2); // NOLINT + for (int64_t i = 0; i < span.length; i++) { + const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]]; + const int32_t offset = offsets[span.offset + i]; + if (span.child_data[child_id].IsNull(offset)) { + bit_util::SetBit(out_bitmap, i + out_offset); + } + } +} + } // namespace arrow::union_util diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h index 0f30d5a3278..948ea3fdb2f 100644 --- a/cpp/src/arrow/util/union_util.h +++ b/cpp/src/arrow/util/union_util.h @@ -27,5 +27,9 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span); /// \brief Compute the number of of logical nulls in a dense union array int64_t LogicalDenseUnionNullCount(const ArraySpan& span); +void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset); + +void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset); + } // namespace union_util } // namespace arrow diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 152cae1be1f..bf5730b2657 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1656,6 +1656,44 @@ def test_is_null(): assert result.equals(expected) +def test_is_null_union(): + arr = pa.UnionArray.from_sparse( + pa.array([0, 1, 0, 0, 1], type=pa.int8()), + [ + pa.array([0.0, 1.1, None, 3.3, 4.4]), + pa.array([True, None, False, True, False]), + ] + ) + assert arr.to_pylist() == [0.0, None, None, 3.3, False] + result = arr.is_null() + expected = pa.array([False, True, True, False, False]) + assert result.equals(expected) + + arr = pa.UnionArray.from_dense( + pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()), + pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()), + [ + pa.array([0.0, 1.1, None, 3.3]), + pa.array([True, None, False]) + ] + ) + assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False] + result = arr.is_null() + expected = pa.array([False, False, False, True, False, True, False]) + assert result.equals(expected) + + +@pytest.mark.parametrize("typ", ["int16", "int32", "int64"]) +def test_is_null_run_end_encoded(typ): + decoded = pa.array([1, 1, 1, None, 2, 2, None, None, 1]) + arr = pc.run_end_encode(decoded, run_end_type=typ) + result = arr.is_null() + expected = pa.array([False, False, False, True, False, False, True, True, False]) + assert result.equals(expected) + result = arr.slice(2, 5).is_null() + assert result.equals(expected.slice(2, 5)) + + def test_is_nan(): arr = pa.array([1, 2, 3, None, np.nan]) result = arr.is_nan() From 3a13fba2bcfe47c873a06edc2898ab563734dcf9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 11 Apr 2023 12:50:22 +0200 Subject: [PATCH 02/14] lint --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 8 +++----- cpp/src/arrow/util/ree_util.cc | 6 ++---- cpp/src/arrow/util/union_util.cc | 6 ++++-- cpp/src/arrow/util/union_util.h | 6 ++++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 65595824656..c406c620106 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -22,8 +22,8 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/union_util.h" #include "arrow/util/ree_util.h" +#include "arrow/util/union_util.h" namespace arrow { @@ -107,11 +107,9 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const auto t = arr.type->id(); if (t == Type::SPARSE_UNION) { union_util::SetLogicalSparseUnionNullBits(arr, out_bitmap, out_span->offset); - } - else if (t == Type::DENSE_UNION) { + } else if (t == Type::DENSE_UNION) { union_util::SetLogicalDenseUnionNullBits(arr, out_bitmap, out_span->offset); - } - else if (t == Type::RUN_END_ENCODED) { + } else if (t == Type::RUN_END_ENCODED) { ree_util::SetLogicalNullBits(arr, out_bitmap, out_span->offset); } diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 4e6e8ec2263..5f9dcdb8aeb 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -84,11 +84,9 @@ void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_ const auto type_id = RunEndsArray(span).type->id(); if (type_id == Type::INT16) { SetLogicalNullBits(span, out_bitmap, out_offset); - } - else if (type_id == Type::INT32) { + } else if (type_id == Type::INT32) { SetLogicalNullBits(span, out_bitmap, out_offset); - } - else { + } else { DCHECK_EQ(type_id, Type::INT64); SetLogicalNullBits(span, out_bitmap, out_offset); } diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc index 771490d4a1b..8dbf7a91d44 100644 --- a/cpp/src/arrow/util/union_util.cc +++ b/cpp/src/arrow/util/union_util.cc @@ -55,7 +55,8 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) { return null_count; } -void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { +void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset) { const auto* sparse_union_type = internal::checked_cast(span.type); DCHECK_LE(span.child_data.size(), 128); @@ -69,7 +70,8 @@ void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, i } } -void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { +void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset) { const auto* dense_union_type = internal::checked_cast(span.type); DCHECK_LE(span.child_data.size(), 128); diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h index 948ea3fdb2f..0afd71a2d55 100644 --- a/cpp/src/arrow/util/union_util.h +++ b/cpp/src/arrow/util/union_util.h @@ -27,9 +27,11 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span); /// \brief Compute the number of of logical nulls in a dense union array int64_t LogicalDenseUnionNullCount(const ArraySpan& span); -void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset); +void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset); -void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset); +void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset); } // namespace union_util } // namespace arrow From 1d480e453ea3aba1a638a2a5eea903c8e50c0a43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 12 Apr 2023 10:59:34 +0200 Subject: [PATCH 03/14] move functions to scalar_validity.cc --- .../arrow/compute/kernels/scalar_validity.cc | 73 ++++++++++++++++++- cpp/src/arrow/util/ree_util.cc | 31 -------- cpp/src/arrow/util/ree_util.h | 2 - cpp/src/arrow/util/union_util.cc | 31 -------- cpp/src/arrow/util/union_util.h | 6 -- 5 files changed, 69 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index c406c620106..a25729a5e91 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -22,8 +22,8 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/ree_util.h" -#include "arrow/util/union_util.h" namespace arrow { @@ -84,6 +84,71 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of } } +static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset) { + const auto* sparse_union_type = + ::arrow::internal::checked_cast(span.type); + DCHECK_LE(span.child_data.size(), 128); + + const int8_t* types = span.GetValues(1); // NOLINT + for (int64_t i = 0; i < span.length; i++) { + const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]]; + if (span.child_data[child_id].IsNull(i)) { + bit_util::SetBit(out_bitmap, i + out_offset); + } + } +} + +static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset) { + const auto* dense_union_type = + ::arrow::internal::checked_cast(span.type); + DCHECK_LE(span.child_data.size(), 128); + + const int8_t* types = span.GetValues(1); // NOLINT + const int32_t* offsets = span.GetValues(2); // NOLINT + for (int64_t i = 0; i < span.length; i++) { + const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]]; + const int32_t offset = offsets[span.offset + i]; + if (span.child_data[child_id].IsNull(offset)) { + bit_util::SetBit(out_bitmap, i + out_offset); + } + } +} + +template +void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset) { + const auto& values = arrow::ree_util::ValuesArray(span); + const auto& values_bitmap = values.buffers[0].data; + + arrow::ree_util::RunEndEncodedArraySpan ree_span(span); + auto end = ree_span.end(); + for (auto it = ree_span.begin(); it != end; ++it) { + const bool is_null = + values_bitmap && + !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); + if (is_null) { + for (int64_t i = 0; i < it.run_length(); i++) { + bit_util::SetBit(out_bitmap, it.logical_position() + i + out_offset); + } + } + } +} + +void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, + int64_t out_offset) { + const auto type_id = arrow::ree_util::RunEndsArray(span).type->id(); + if (type_id == Type::INT16) { + SetREELogicalNullBits(span, out_bitmap, out_offset); + } else if (type_id == Type::INT32) { + SetREELogicalNullBits(span, out_bitmap, out_offset); + } else { + DCHECK_EQ(type_id, Type::INT64); + SetREELogicalNullBits(span, out_bitmap, out_offset); + } +} + Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& arr = batch[0].array; ArraySpan* out_span = out->array_span_mutable(); @@ -106,11 +171,11 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const auto t = arr.type->id(); if (t == Type::SPARSE_UNION) { - union_util::SetLogicalSparseUnionNullBits(arr, out_bitmap, out_span->offset); + SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); } else if (t == Type::DENSE_UNION) { - union_util::SetLogicalDenseUnionNullBits(arr, out_bitmap, out_span->offset); + SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); } else if (t == Type::RUN_END_ENCODED) { - ree_util::SetLogicalNullBits(arr, out_bitmap, out_span->offset); + SetREELogicalNullBits(arr, out_bitmap, out_span->offset); } if (is_floating(arr.type->id()) && options.nan_is_null) { diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index 5f9dcdb8aeb..fcd6c204e06 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -47,25 +47,6 @@ int64_t LogicalNullCount(const ArraySpan& span) { return null_count; } -template -void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { - const auto& values = ValuesArray(span); - const auto& values_bitmap = values.buffers[0].data; - - RunEndEncodedArraySpan ree_span(span); - auto end = ree_span.end(); - for (auto it = ree_span.begin(); it != end; ++it) { - const bool is_null = - values_bitmap && - !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); - if (is_null) { - for (int64_t i = 0; i < it.run_length(); i++) { - bit_util::SetBit(out_bitmap, it.logical_position() + i + out_offset); - } - } - } -} - } // namespace int64_t LogicalNullCount(const ArraySpan& span) { @@ -80,18 +61,6 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } -void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { - const auto type_id = RunEndsArray(span).type->id(); - if (type_id == Type::INT16) { - SetLogicalNullBits(span, out_bitmap, out_offset); - } else if (type_id == Type::INT32) { - SetLogicalNullBits(span, out_bitmap, out_offset); - } else { - DCHECK_EQ(type_id, Type::INT64); - SetLogicalNullBits(span, out_bitmap, out_offset); - } -} - int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) { const auto type_id = RunEndsArray(span).type->id(); if (type_id == Type::INT16) { diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index 523bb914b4b..a3d3d16c0da 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -56,8 +56,6 @@ Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type, /// \brief Compute the logical null count of an REE array int64_t LogicalNullCount(const ArraySpan& span); -void SetLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset); - namespace internal { /// \brief Uses binary-search to find the physical offset given a logical offset diff --git a/cpp/src/arrow/util/union_util.cc b/cpp/src/arrow/util/union_util.cc index 8dbf7a91d44..ca3ecf28644 100644 --- a/cpp/src/arrow/util/union_util.cc +++ b/cpp/src/arrow/util/union_util.cc @@ -55,35 +55,4 @@ int64_t LogicalDenseUnionNullCount(const ArraySpan& span) { return null_count; } -void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, - int64_t out_offset) { - const auto* sparse_union_type = - internal::checked_cast(span.type); - DCHECK_LE(span.child_data.size(), 128); - - const int8_t* types = span.GetValues(1); // NOLINT - for (int64_t i = 0; i < span.length; i++) { - const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]]; - if (span.child_data[child_id].IsNull(i)) { - bit_util::SetBit(out_bitmap, i + out_offset); - } - } -} - -void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, - int64_t out_offset) { - const auto* dense_union_type = internal::checked_cast(span.type); - DCHECK_LE(span.child_data.size(), 128); - - const int8_t* types = span.GetValues(1); // NOLINT - const int32_t* offsets = span.GetValues(2); // NOLINT - for (int64_t i = 0; i < span.length; i++) { - const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]]; - const int32_t offset = offsets[span.offset + i]; - if (span.child_data[child_id].IsNull(offset)) { - bit_util::SetBit(out_bitmap, i + out_offset); - } - } -} - } // namespace arrow::union_util diff --git a/cpp/src/arrow/util/union_util.h b/cpp/src/arrow/util/union_util.h index 0afd71a2d55..0f30d5a3278 100644 --- a/cpp/src/arrow/util/union_util.h +++ b/cpp/src/arrow/util/union_util.h @@ -27,11 +27,5 @@ int64_t LogicalSparseUnionNullCount(const ArraySpan& span); /// \brief Compute the number of of logical nulls in a dense union array int64_t LogicalDenseUnionNullCount(const ArraySpan& span); -void SetLogicalSparseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, - int64_t out_offset); - -void SetLogicalDenseUnionNullBits(const ArraySpan& span, uint8_t* out_bitmap, - int64_t out_offset); - } // namespace union_util } // namespace arrow From 0c9b19d462984ab6908038e5610c438d3b6412a4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 12 Apr 2023 13:48:05 +0200 Subject: [PATCH 04/14] address feedback (early return, use SetBitsTo) --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index a25729a5e91..002912b7bba 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -120,18 +120,18 @@ template void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { const auto& values = arrow::ree_util::ValuesArray(span); - const auto& values_bitmap = values.buffers[0].data; + const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR; + + if (!values_bitmap) { + return; + } arrow::ree_util::RunEndEncodedArraySpan ree_span(span); auto end = ree_span.end(); for (auto it = ree_span.begin(); it != end; ++it) { - const bool is_null = - values_bitmap && - !bit_util::GetBit(values_bitmap, values.offset + it.index_into_array()); - if (is_null) { - for (int64_t i = 0; i < it.run_length(); i++) { - bit_util::SetBit(out_bitmap, it.logical_position() + i + out_offset); - } + if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) { + bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(), + true); } } } From dd6d0e0b4bfdba769a44402ea9589f10dfc9b402 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Apr 2023 15:29:46 +0200 Subject: [PATCH 05/14] add basic c++ tests + fix impl for sliced arrays --- .../arrow/compute/kernels/scalar_validity.cc | 8 +++--- .../compute/kernels/scalar_validity_test.cc | 28 +++++++++++++++++++ python/pyarrow/tests/test_compute.py | 4 +++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 002912b7bba..82b351ebf34 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -92,8 +92,8 @@ static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bi const int8_t* types = span.GetValues(1); // NOLINT for (int64_t i = 0; i < span.length; i++) { - const int8_t child_id = sparse_union_type->child_ids()[types[span.offset + i]]; - if (span.child_data[child_id].IsNull(i)) { + const int8_t child_id = sparse_union_type->child_ids()[types[i]]; + if (span.child_data[child_id].IsNull(i + span.offset)) { bit_util::SetBit(out_bitmap, i + out_offset); } } @@ -108,8 +108,8 @@ static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bit const int8_t* types = span.GetValues(1); // NOLINT const int32_t* offsets = span.GetValues(2); // NOLINT for (int64_t i = 0; i < span.length; i++) { - const int8_t child_id = dense_union_type->child_ids()[types[span.offset + i]]; - const int32_t offset = offsets[span.offset + i]; + const int8_t child_id = dense_union_type->child_ids()[types[i]]; + const int32_t offset = offsets[i]; if (span.child_data[child_id].IsNull(offset)) { bit_util::SetBit(out_bitmap, i + out_offset); } diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 94d951c8382..5bef6787043 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) { "[true, false, false, true]", &nan_is_null_options); } +TEST_F(TestBooleanValidityKernels, IsNullUnion) { + auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]"); + auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])"); + auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])"); + ASSERT_OK_AND_ASSIGN(auto arr1, + SparseUnionArray::Make(*type_ids, {field_i64, field_str})); + auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]"); + CheckScalarUnary("is_null", arr1, expected); + + auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]"); + auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])"); + auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])"); + ASSERT_OK_AND_ASSIGN(auto arr2, + DenseUnionArray::Make(*type_ids, *value_offsets, + {dense_field_i64, dense_field_str})); + CheckScalarUnary("is_null", arr2, expected); +} + +// TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) { +// auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])"); +// auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])"); +// ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values)); +// ASSERT_OK(ree_array->ValidateFull()); +// auto expected = +// ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]"); +// CheckScalarUnary("is_null", ree_array, expected); +// } + TEST(TestValidityKernels, IsFinite) { for (const auto& ty : IntTypes()) { CheckScalar("is_finite", {ArrayFromJSON(ty, "[0, 1, 42, null]")}, diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index bf5730b2657..b20b87f19ab 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1668,6 +1668,8 @@ def test_is_null_union(): result = arr.is_null() expected = pa.array([False, True, True, False, False]) assert result.equals(expected) + result = arr.slice(1, 2).is_null() + assert result.equals(expected.slice(1, 2)) arr = pa.UnionArray.from_dense( pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()), @@ -1681,6 +1683,8 @@ def test_is_null_union(): result = arr.is_null() expected = pa.array([False, False, False, True, False, True, False]) assert result.equals(expected) + result = arr.slice(1, 3).is_null() + assert result.equals(expected.slice(1, 3)) @pytest.mark.parametrize("typ", ["int16", "int32", "int64"]) From 448eed9ede2efc5231cae1bfc6c3597f2c5fc7fc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Apr 2023 16:13:39 +0200 Subject: [PATCH 06/14] support REE in FillFromScalar --- cpp/src/arrow/array/data.cc | 7 +++++++ .../compute/kernels/scalar_validity_test.cc | 18 +++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 79595ab7c7c..58c59c9f8cd 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -431,6 +431,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) { this->child_data[i].FillFromScalar(*scalar.value[i]); } } + } else if (type_id == Type::RUN_END_ENCODED) { + const auto& scalar = checked_cast(value); + this->child_data.resize(2); + auto run_end_type = scalar.run_end_type(); + auto run_end = MakeScalar(run_end_type, 1).ValueOrDie(); + this->child_data[0].FillFromScalar(*run_end); + this->child_data[1].FillFromScalar(*scalar.value); } else if (type_id == Type::EXTENSION) { // Pass through storage const auto& scalar = checked_cast(value); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 5bef6787043..48353daf92e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -110,15 +110,15 @@ TEST_F(TestBooleanValidityKernels, IsNullUnion) { CheckScalarUnary("is_null", arr2, expected); } -// TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) { -// auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])"); -// auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])"); -// ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values)); -// ASSERT_OK(ree_array->ValidateFull()); -// auto expected = -// ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]"); -// CheckScalarUnary("is_null", ree_array, expected); -// } +TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) { + auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])"); + auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])"); + ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values)); + ASSERT_OK(ree_array->ValidateFull()); + auto expected = + ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]"); + CheckScalarUnary("is_null", ree_array, expected); +} TEST(TestValidityKernels, IsFinite) { for (const auto& ty : IntTypes()) { From ca937d81267cdb7e79d8ace97ab9ab7dec25bb7c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Apr 2023 17:04:47 +0200 Subject: [PATCH 07/14] address feedback --- cpp/src/arrow/array/data.cc | 2 +- .../arrow/compute/kernels/scalar_validity.cc | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 58c59c9f8cd..0b75ab23844 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -434,7 +434,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } else if (type_id == Type::RUN_END_ENCODED) { const auto& scalar = checked_cast(value); this->child_data.resize(2); - auto run_end_type = scalar.run_end_type(); + auto& run_end_type = scalar.run_end_type(); auto run_end = MakeScalar(run_end_type, 1).ValueOrDie(); this->child_data[0].FillFromScalar(*run_end); this->child_data[1].FillFromScalar(*scalar.value); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 82b351ebf34..8df08631d08 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -167,15 +167,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { } else { // Input has no nulls => output is entirely false. bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false); - } - - const auto t = arr.type->id(); - if (t == Type::SPARSE_UNION) { - SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); - } else if (t == Type::DENSE_UNION) { - SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); - } else if (t == Type::RUN_END_ENCODED) { - SetREELogicalNullBits(arr, out_bitmap, out_span->offset); + // Except for union/ree which never has physical nulls, but can have logical + // nulls from the child arrays -> set those bits to true + const auto t = arr.type->id(); + if (t == Type::SPARSE_UNION) { + SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); + } else if (t == Type::DENSE_UNION) { + SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); + } else if (t == Type::RUN_END_ENCODED) { + SetREELogicalNullBits(arr, out_bitmap, out_span->offset); + } } if (is_floating(arr.type->id()) && options.nan_is_null) { From fb4352af93cda6e1be98cef6776887fcbdfc0cad Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Apr 2023 12:02:48 +0200 Subject: [PATCH 08/14] Update cpp/src/arrow/compute/kernels/scalar_validity.cc Co-authored-by: Weston Pace --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index 8df08631d08..d518facf55e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -119,6 +119,7 @@ static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bit template void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { + DCHECK(!is_nested(span.type->id())); const auto& values = arrow::ree_util::ValuesArray(span); const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR; From 0d3fb6aeeefc46d649067a60d428e270f8504a1d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 May 2023 10:23:39 +0200 Subject: [PATCH 09/14] Update cpp/src/arrow/compute/kernels/scalar_validity.cc Co-authored-by: Weston Pace --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index d518facf55e..bedeff1a127 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -119,8 +119,8 @@ static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bit template void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap, int64_t out_offset) { - DCHECK(!is_nested(span.type->id())); const auto& values = arrow::ree_util::ValuesArray(span); + DCHECK(!is_nested(values.type->id())); const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR; if (!values_bitmap) { From ca2bcc67a36c037993a316074a1df9cd3d4c36ac Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 May 2023 11:43:46 +0200 Subject: [PATCH 10/14] fix creating run end from scratch space --- cpp/src/arrow/array/data.cc | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 0b75ab23844..6bc96c31dd3 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -236,6 +236,19 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { return {scratch_space, sizeof(offset_type) * 2}; } +template +void SetRunForScalar(ArraySpan* span, std::shared_ptr type, + uint64_t* scratch_space) { + // Create a lenght-1 array with the value 1 for a REE scalar + using run_type = typename ArrowType::c_type; + auto buffer = reinterpret_cast(scratch_space); + buffer[0] = static_cast(1); + auto data_buffer = + std::make_shared(reinterpret_cast(buffer), sizeof(run_type)); + auto data = ArrayData::Make(type, 1, {nullptr, data_buffer}, 0); + span->child_data[0].SetMembers(*data); +} + int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: @@ -435,8 +448,17 @@ void ArraySpan::FillFromScalar(const Scalar& value) { const auto& scalar = checked_cast(value); this->child_data.resize(2); auto& run_end_type = scalar.run_end_type(); - auto run_end = MakeScalar(run_end_type, 1).ValueOrDie(); - this->child_data[0].FillFromScalar(*run_end); + switch (run_end_type->id()) { + case Type::INT16: + SetRunForScalar(this, run_end_type, this->scratch_space); + break; + case Type::INT32: + SetRunForScalar(this, run_end_type, this->scratch_space); + break; + default: + DCHECK_EQ(run_end_type->id(), Type::INT64); + SetRunForScalar(this, run_end_type, this->scratch_space); + } this->child_data[1].FillFromScalar(*scalar.value); } else if (type_id == Type::EXTENSION) { // Pass through storage From 0935b4dbc8bcfbd799e1d5e743b773dd0db3c3f1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 May 2023 15:19:07 +0200 Subject: [PATCH 11/14] address feedback --- cpp/src/arrow/array/data.cc | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 6bc96c31dd3..acdc10d68b2 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -236,17 +236,16 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { return {scratch_space, sizeof(offset_type) * 2}; } -template -void SetRunForScalar(ArraySpan* span, std::shared_ptr type, - uint64_t* scratch_space) { - // Create a lenght-1 array with the value 1 for a REE scalar - using run_type = typename ArrowType::c_type; - auto buffer = reinterpret_cast(scratch_space); - buffer[0] = static_cast(1); +template +void FillRunEndsArrayForScalar(ArraySpan* span, std::shared_ptr run_end_type) { + using RunEndCType = typename RunEndType::c_type; + auto buffer = reinterpret_cast(span->scratch_space); + buffer[0] = static_cast(1); auto data_buffer = - std::make_shared(reinterpret_cast(buffer), sizeof(run_type)); - auto data = ArrayData::Make(type, 1, {nullptr, data_buffer}, 0); - span->child_data[0].SetMembers(*data); + std::make_shared(reinterpret_cast(buffer), sizeof(RunEndCType)); + auto data = + ArrayData::Make(std::move(run_end_type), 1, {nullptr, std::move(data_buffer)}, 0); + span->SetMembers(*data); } int GetNumBuffers(const DataType& type) { @@ -450,14 +449,14 @@ void ArraySpan::FillFromScalar(const Scalar& value) { auto& run_end_type = scalar.run_end_type(); switch (run_end_type->id()) { case Type::INT16: - SetRunForScalar(this, run_end_type, this->scratch_space); + FillRunEndsArrayForScalar(&this->child_data[0], run_end_type); break; case Type::INT32: - SetRunForScalar(this, run_end_type, this->scratch_space); + FillRunEndsArrayForScalar(&this->child_data[0], run_end_type); break; default: DCHECK_EQ(run_end_type->id(), Type::INT64); - SetRunForScalar(this, run_end_type, this->scratch_space); + FillRunEndsArrayForScalar(&this->child_data[0], run_end_type); } this->child_data[1].FillFromScalar(*scalar.value); } else if (type_id == Type::EXTENSION) { From cceea95a6813ccb187609ebe3ae5e2756876dfc2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 May 2023 10:27:20 +0200 Subject: [PATCH 12/14] set members directly --- cpp/src/arrow/array/data.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index acdc10d68b2..bcf76ac2cc1 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -237,15 +237,15 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { } template -void FillRunEndsArrayForScalar(ArraySpan* span, std::shared_ptr run_end_type) { +void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) { using RunEndCType = typename RunEndType::c_type; auto buffer = reinterpret_cast(span->scratch_space); buffer[0] = static_cast(1); - auto data_buffer = - std::make_shared(reinterpret_cast(buffer), sizeof(RunEndCType)); - auto data = - ArrayData::Make(std::move(run_end_type), 1, {nullptr, std::move(data_buffer)}, 0); - span->SetMembers(*data); + span->type = run_end_type; + span->length = 1; + span->null_count = 0; + span->buffers[1].data = reinterpret_cast(buffer); + span->buffers[1].size = sizeof(RunEndCType); } int GetNumBuffers(const DataType& type) { @@ -449,14 +449,14 @@ void ArraySpan::FillFromScalar(const Scalar& value) { auto& run_end_type = scalar.run_end_type(); switch (run_end_type->id()) { case Type::INT16: - FillRunEndsArrayForScalar(&this->child_data[0], run_end_type); + FillRunEndsArrayForScalar(&this->child_data[0], run_end_type.get()); break; case Type::INT32: - FillRunEndsArrayForScalar(&this->child_data[0], run_end_type); + FillRunEndsArrayForScalar(&this->child_data[0], run_end_type.get()); break; default: DCHECK_EQ(run_end_type->id(), Type::INT64); - FillRunEndsArrayForScalar(&this->child_data[0], run_end_type); + FillRunEndsArrayForScalar(&this->child_data[0], run_end_type.get()); } this->child_data[1].FillFromScalar(*scalar.value); } else if (type_id == Type::EXTENSION) { From 1eb11879337c6e491c9766ccf4c4fb6986751550 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 9 Sep 2023 14:21:29 -0300 Subject: [PATCH 13/14] fixup! support REE in FillFromScalar --- cpp/src/arrow/array/data.cc | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index bcf76ac2cc1..79595ab7c7c 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -236,18 +236,6 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { return {scratch_space, sizeof(offset_type) * 2}; } -template -void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) { - using RunEndCType = typename RunEndType::c_type; - auto buffer = reinterpret_cast(span->scratch_space); - buffer[0] = static_cast(1); - span->type = run_end_type; - span->length = 1; - span->null_count = 0; - span->buffers[1].data = reinterpret_cast(buffer); - span->buffers[1].size = sizeof(RunEndCType); -} - int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: @@ -443,22 +431,6 @@ void ArraySpan::FillFromScalar(const Scalar& value) { this->child_data[i].FillFromScalar(*scalar.value[i]); } } - } else if (type_id == Type::RUN_END_ENCODED) { - const auto& scalar = checked_cast(value); - this->child_data.resize(2); - auto& run_end_type = scalar.run_end_type(); - switch (run_end_type->id()) { - case Type::INT16: - FillRunEndsArrayForScalar(&this->child_data[0], run_end_type.get()); - break; - case Type::INT32: - FillRunEndsArrayForScalar(&this->child_data[0], run_end_type.get()); - break; - default: - DCHECK_EQ(run_end_type->id(), Type::INT64); - FillRunEndsArrayForScalar(&this->child_data[0], run_end_type.get()); - } - this->child_data[1].FillFromScalar(*scalar.value); } else if (type_id == Type::EXTENSION) { // Pass through storage const auto& scalar = checked_cast(value); From ddf2bee6d10dd7d5909ad39b46250a4ababe5891 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 9 Sep 2023 14:22:23 -0300 Subject: [PATCH 14/14] Clarify comment and add a DCHECK that guarantees if-else chain is exhaustive --- cpp/src/arrow/compute/kernels/scalar_validity.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index bedeff1a127..8e25be62440 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -20,6 +20,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" @@ -159,17 +160,16 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return Status::OK(); } - const auto& options = NanOptionsState::Get(ctx); uint8_t* out_bitmap = out_span->buffers[1].data; if (arr.GetNullCount() > 0) { // Input has nulls => output is the inverted null (validity) bitmap. InvertBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap, out_span->offset); } else { - // Input has no nulls => output is entirely false. + // Input has no nulls => output is entirely false... bit_util::SetBitsTo(out_bitmap, out_span->offset, out_span->length, false); - // Except for union/ree which never has physical nulls, but can have logical - // nulls from the child arrays -> set those bits to true + // ...except for some types (e.g. unions types and REE) which do not + // represent nulls in the validity bitmap. const auto t = arr.type->id(); if (t == Type::SPARSE_UNION) { SetSparseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); @@ -177,9 +177,12 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { SetDenseUnionLogicalNullBits(arr, out_bitmap, out_span->offset); } else if (t == Type::RUN_END_ENCODED) { SetREELogicalNullBits(arr, out_bitmap, out_span->offset); + } else { + DCHECK(arrow::internal::HasValidityBitmap(t)); } } + const auto& options = NanOptionsState::Get(ctx); if (is_floating(arr.type->id()) && options.nan_is_null) { switch (arr.type->id()) { case Type::FLOAT: