From a371653a02a0f52811fd0f5b61ab56d862093aa2 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 9 Apr 2024 13:37:57 -0300 Subject: [PATCH 1/5] data.h: Add is_validity_defined_by_bitmap() predicate --- cpp/src/arrow/array/data.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index d8a6663cec5..741484aa369 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -46,6 +46,19 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); + +constexpr bool is_validity_defined_by_bitmap(Type::type id) { + switch (id) { + case Type::NA: + case Type::SPARSE_UNION: + case Type::DENSE_UNION: + case Type::RUN_END_ENCODED: + return false; + default: + return true; + } +} + } // namespace internal // When slicing, we do not know the null count of the sliced range without From 0a849e48a3d9308784f894641aeb663e925093b3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 9 Apr 2024 13:39:43 -0300 Subject: [PATCH 2/5] builder_nested.h: Use validity bitmap directly instead of calling IsValid() The constexpr is_validity_defined_by_bitmap() predicate guarantees statically that it's enough to look only at the bitmap for the types these templates are used with. --- cpp/src/arrow/array/builder_nested.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 429aa5c0488..43dfd471e3b 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -181,13 +181,11 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { if constexpr (is_list_view(TYPE::type_id)) { sizes = array.GetValues(2); } - const bool all_valid = !array.MayHaveLogicalNulls(); - const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + static_assert(internal::is_validity_defined_by_bitmap(TYPE::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); int64_t size = 0; if (is_valid) { if constexpr (is_list_view(TYPE::type_id)) { @@ -569,13 +567,11 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { - const int32_t* offsets = array.GetValues(1); - const bool all_valid = !array.MayHaveLogicalNulls(); - const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + const auto* offsets = array.GetValues(1); + static_assert(internal::is_validity_defined_by_bitmap(MapType::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); if (is_valid) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; From 097c37a35663f5708b957e605a520e74498cff66 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 10 Apr 2024 16:48:08 -0300 Subject: [PATCH 3/5] type.h: Introduce may_have_validity_bitmap() and deprecate HasValidityBitmap() --- cpp/src/arrow/type.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3f651741d3e..bd36e051eb4 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2482,7 +2482,7 @@ Result> UnifySchemas( namespace internal { -constexpr bool HasValidityBitmap(Type::type id) { +constexpr bool may_have_validity_bitmap(Type::type id) { switch (id) { case Type::NA: case Type::DENSE_UNION: @@ -2494,6 +2494,9 @@ constexpr bool HasValidityBitmap(Type::type id) { } } +ARROW_DEPRECATED("Deprecated in 17.0.0. Use may_have_validity_bitmap() instead.") +constexpr bool HasValidityBitmap(Type::type id) { return may_have_validity_bitmap(id); } + ARROW_EXPORT std::string ToString(Type::type id); From 4187b31a31d2466c93f4c4a3e1a76473160348e3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 10 Apr 2024 16:49:05 -0300 Subject: [PATCH 4/5] Delete is_validity_defined_by_bitmap in favor of may_have_validity_bitmap --- cpp/src/arrow/array/builder_nested.h | 4 ++-- cpp/src/arrow/array/data.h | 12 ------------ 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 43dfd471e3b..e40f7fcbb7a 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -181,7 +181,7 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { if constexpr (is_list_view(TYPE::type_id)) { sizes = array.GetValues(2); } - static_assert(internal::is_validity_defined_by_bitmap(TYPE::type_id)); + static_assert(internal::may_have_validity_bitmap(TYPE::type_id)); const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { @@ -568,7 +568,7 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const auto* offsets = array.GetValues(1); - static_assert(internal::is_validity_defined_by_bitmap(MapType::type_id)); + static_assert(internal::may_have_validity_bitmap(MapType::type_id)); const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 741484aa369..beec29789ad 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -47,18 +47,6 @@ ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); -constexpr bool is_validity_defined_by_bitmap(Type::type id) { - switch (id) { - case Type::NA: - case Type::SPARSE_UNION: - case Type::DENSE_UNION: - case Type::RUN_END_ENCODED: - return false; - default: - return true; - } -} - } // namespace internal // When slicing, we do not know the null count of the sliced range without From 7b9a2e744e952fa17fe60233ca1d686c5b522b73 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 10 Apr 2024 17:04:34 -0300 Subject: [PATCH 5/5] Fix deprecation warnings/errors --- cpp/src/arrow/array/array_test.cc | 7 ++++--- cpp/src/arrow/array/concatenate.cc | 2 +- cpp/src/arrow/array/data.cc | 6 +++--- cpp/src/arrow/array/util.cc | 2 +- cpp/src/arrow/array/validate.cc | 2 +- cpp/src/arrow/c/bridge.cc | 2 +- cpp/src/arrow/c/bridge_test.cc | 2 +- cpp/src/arrow/compute/exec.cc | 2 +- cpp/src/arrow/integration/json_internal.cc | 2 +- cpp/src/arrow/ipc/metadata_internal.cc | 5 +++-- 10 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 21ac1a09f56..13aef62997b 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -603,11 +603,11 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) ASSERT_EQ(out->length(), 9); auto out_type_id = out->type()->id(); - const bool has_validity = internal::HasValidityBitmap(out_type_id); + const bool can_check_nulls = internal::may_have_validity_bitmap(out_type_id); // For a dictionary builder, the output dictionary won't necessarily be the same const bool can_check_values = !is_dictionary(out_type_id); - if (has_validity) { + if (can_check_nulls) { ASSERT_EQ(out->null_count(), 4); } else { ASSERT_EQ(out->null_count(), 0); @@ -855,7 +855,8 @@ TEST_F(TestArray, TestAppendArraySlice) { span.SetMembers(*nulls->data()); ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(12, builder->length()); - const bool has_validity_bitmap = internal::HasValidityBitmap(scalar->type->id()); + const bool has_validity_bitmap = + internal::may_have_validity_bitmap(scalar->type->id()); if (has_validity_bitmap) { ASSERT_EQ(4, builder->null_count()); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index ff9ed66d114..44d58cc0bde 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -317,7 +317,7 @@ class ConcatenateImpl { } Status Concatenate(std::shared_ptr* out) && { - if (out_->null_count != 0 && internal::HasValidityBitmap(out_->type->id())) { + if (out_->null_count != 0 && internal::may_have_validity_bitmap(out_->type->id())) { RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0])); } RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 80c411dfa6a..21b13a8cfe1 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -53,7 +53,7 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length, if (type_id == Type::NA) { *null_count = length; (*buffers)[0] = nullptr; - } else if (internal::HasValidityBitmap(type_id)) { + } else if (internal::may_have_validity_bitmap(type_id)) { if (*null_count == 0) { // In case there are no nulls, don't keep an allocated null bitmap around (*buffers)[0] = nullptr; @@ -345,7 +345,7 @@ void FillZeroLengthArray(const DataType* type, ArraySpan* span) { span->buffers[i].size = 0; } - if (!HasValidityBitmap(type->id())) { + if (!may_have_validity_bitmap(type->id())) { span->buffers[0] = {}; } @@ -380,7 +380,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { if (type_id == Type::NA) { this->null_count = 1; - } else if (!internal::HasValidityBitmap(type_id)) { + } else if (!internal::may_have_validity_bitmap(type_id)) { this->null_count = 0; } else { // Populate null count and validity bitmap diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 86e2ffcae4d..bdba92c9a11 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -95,7 +95,7 @@ class ArrayDataEndianSwapper { Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); RETURN_NOT_OK(SwapChildren(type.fields())); - if (internal::HasValidityBitmap(type.id())) { + if (internal::may_have_validity_bitmap(type.id())) { // Copy null bitmap out_->buffers[0] = data_->buffers[0]; } diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 8dd3eb3f90c..0d940d3bc86 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -550,7 +550,7 @@ struct ValidateArrayImpl { if (full_validation) { if (data.null_count != kUnknownNullCount) { int64_t actual_null_count; - if (HasValidityBitmap(data.type->id()) && data.buffers[0]) { + if (may_have_validity_bitmap(data.type->id()) && data.buffers[0]) { // Do not call GetNullCount() as it would also set the `null_count` member actual_null_count = data.length - CountSetBits(data.buffers[0]->data(), data.offset, data.length); diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index d004de7a2ea..8a530b3798d 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -576,7 +576,7 @@ struct ArrayExporter { // Store buffer pointers size_t n_buffers = data->buffers.size(); auto buffers_begin = data->buffers.begin(); - if (n_buffers > 0 && !internal::HasValidityBitmap(data->type->id())) { + if (n_buffers > 0 && !internal::may_have_validity_bitmap(data->type->id())) { --n_buffers; ++buffers_begin; } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index dba6e4736b6..d64fe67accd 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -565,7 +565,7 @@ struct ArrayExportChecker { auto expected_n_buffers = static_cast(expected_data.buffers.size()); auto expected_buffers = expected_data.buffers.data(); - if (!internal::HasValidityBitmap(expected_data.type->id())) { + if (!internal::may_have_validity_bitmap(expected_data.type->id())) { --expected_n_buffers; ++expected_buffers; } diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 28dcf493fa2..f2e45783831 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -480,7 +480,7 @@ struct NullGeneralization { if (dtype_id == Type::NA) { return ALL_NULL; } - if (!arrow::internal::HasValidityBitmap(dtype_id)) { + if (!arrow::internal::may_have_validity_bitmap(dtype_id)) { return ALL_VALID; } if (value.is_scalar()) { diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 64eb342d5bd..4b75e84bfcc 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -1849,7 +1849,7 @@ class ArrayReader { Result> Parse() { ARROW_ASSIGN_OR_RAISE(length_, GetMemberInt(obj_, "count")); - if (::arrow::internal::HasValidityBitmap(type_->id())) { + if (::arrow::internal::may_have_validity_bitmap(type_->id())) { // Null and union types don't have a validity bitmap RETURN_NOT_OK(ParseValidityBitmap()); } diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 4154b594d95..e20b352d18d 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -109,8 +109,9 @@ flatbuf::MetadataVersion MetadataVersionToFlatbuffer(MetadataVersion version) { bool HasValidityBitmap(Type::type type_id, MetadataVersion version) { // In V4, null types have no validity bitmap // In V5 and later, null and union types have no validity bitmap - return (version < MetadataVersion::V5) ? (type_id != Type::NA) - : ::arrow::internal::HasValidityBitmap(type_id); + return (version < MetadataVersion::V5) + ? (type_id != Type::NA) + : ::arrow::internal::may_have_validity_bitmap(type_id); } namespace {