From 66e1c1ff276a5fe37bc6fc8027a41e12784e0f30 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 12 Nov 2023 23:06:28 +0800 Subject: [PATCH 01/25] ComputeNullValues --- cpp/src/arrow/array/array_dict.cc | 60 +++++++++++++++++++++++++++++++ cpp/src/arrow/array/array_dict.h | 9 +++++ 2 files changed, 69 insertions(+) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 28fccdbfcff..0b714ddc2b9 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -212,6 +212,56 @@ Result> TransposeDictIndices( return out_data; } +struct CompactDictionaryNullValuesVistor { + const std::shared_ptr& data; + int64_t& out_null_count; + + template + Status CompactDictionaryNullValuesImpl() { + int64_t index_length = data->length; + int64_t dict_length = data->dictionary->length; + const uint8_t* dictionary_null_bit_map = data->dictionary->GetValues(0); + + using CType = typename IndexArrowType::c_type; + const CType* indices_data = data->GetValues(1); + CType dict_len = static_cast(dict_length); + for (int64_t i = 0; i < index_length; i++) { + if (data->IsNull(i)) { + continue; + } + + CType current_index = indices_data[i]; + if (current_index < 0 || current_index >= dict_len) { + return Status::IndexError( + "Index out of bounds while counting dictionary array: ", current_index, + "(dictionary is ", dict_length, " long) at position ", i); + } + if (!bit_util::GetBit(dictionary_null_bit_map, current_index)) { + out_null_count++; + } + } + return Status::OK(); + } + + template + enable_if_integer Visit(const Type&) { + return CompactDictionaryNullValuesImpl(); + } + + Status Visit(const DataType& type) { + return Status::TypeError("Expected an Index Type of Int or UInt"); + } +}; + +Result CompactDictionaryNullValues(const std::shared_ptr& data) { + int64_t out_null_count = 0; + const auto& dict_type = checked_cast(*data->type); + CompactDictionaryNullValuesVistor vistor{data, out_null_count}; + RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); + + return out_null_count; +} + struct CompactTransposeMapVistor { const std::shared_ptr& data; arrow::MemoryPool* pool; @@ -323,6 +373,16 @@ Result> DictionaryArray::Transpose( return MakeArray(std::move(transposed)); } +Result DictionaryArray::CountNullValues() const { + if (this->dictionary()->null_count() == 0 || this->indices()->null_count() == 0) { + return this->indices()->null_count(); + } + + ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, + CompactDictionaryNullValues(data_)); + return dictionary_null_count + this->indices()->null_count(); +} + Result> DictionaryArray::Compact(MemoryPool* pool) const { std::shared_ptr compact_dictionary; ARROW_ASSIGN_OR_RAISE(std::unique_ptr transpose_map, diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index 9aa0a7bcc2d..bbc224c3d86 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -96,6 +96,15 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& type, const std::shared_ptr& dictionary, const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const; + /// \brief Count the number of null values as the dictionary array is decoded. + Result CountNullValues() const; + + /// \brief Compat this DictionaryArray + /// + /// This method returns a compacted dictionary array. All the + /// values in the dictionary are referenced by indices. + /// + /// \param[in] pool a pool to allocate the array data from Result> Compact(MemoryPool* pool = default_memory_pool()) const; /// \brief Determine whether dictionary arrays may be compared without unification From f6e8958382bc6dc314a87a8fd7dc11ffa96a1162 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 00:05:27 +0800 Subject: [PATCH 02/25] add test --- cpp/src/arrow/array/array_dict.cc | 15 +++++----- cpp/src/arrow/array/array_dict_test.cc | 41 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 0b714ddc2b9..ee0d14c24b4 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -212,12 +212,12 @@ Result> TransposeDictIndices( return out_data; } -struct CompactDictionaryNullValuesVistor { +struct CountDictionaryNullValuesVistor { const std::shared_ptr& data; int64_t& out_null_count; template - Status CompactDictionaryNullValuesImpl() { + Status CountDictionaryNullValuesImpl() { int64_t index_length = data->length; int64_t dict_length = data->dictionary->length; const uint8_t* dictionary_null_bit_map = data->dictionary->GetValues(0); @@ -245,7 +245,7 @@ struct CompactDictionaryNullValuesVistor { template enable_if_integer Visit(const Type&) { - return CompactDictionaryNullValuesImpl(); + return CountDictionaryNullValuesImpl(); } Status Visit(const DataType& type) { @@ -253,10 +253,10 @@ struct CompactDictionaryNullValuesVistor { } }; -Result CompactDictionaryNullValues(const std::shared_ptr& data) { +Result CountDictionaryNullValues(const std::shared_ptr& data) { int64_t out_null_count = 0; const auto& dict_type = checked_cast(*data->type); - CompactDictionaryNullValuesVistor vistor{data, out_null_count}; + CountDictionaryNullValuesVistor vistor{data, out_null_count}; RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); return out_null_count; @@ -374,12 +374,11 @@ Result> DictionaryArray::Transpose( } Result DictionaryArray::CountNullValues() const { - if (this->dictionary()->null_count() == 0 || this->indices()->null_count() == 0) { + if (this->dictionary()->null_count() == 0 || this->indices()->length() == 0) { return this->indices()->null_count(); } - ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, - CompactDictionaryNullValues(data_)); + ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, CountDictionaryNullValues(data_)); return dictionary_null_count + this->indices()->null_count(); } diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 2f3ee6e2d49..87821a52c76 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1428,6 +1428,47 @@ TEST(TestDictionary, IndicesArray) { ASSERT_OK(arr->indices()->ValidateFull()); } +void CheckDictionaryComputeNullValues(const std::shared_ptr& dict_type, + const std::string& input_dictionary_json, + const std::string& input_index_json, + const int64_t& expected_null_count) { + auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); + const DictionaryArray& input_ref = checked_cast(*input); + + ASSERT_OK_AND_ASSIGN(int64_t actual, input_ref.CountNullValues()); + ASSERT_EQ(expected_null_count, actual); +} + +TEST(TestDictionary, ComputeNullValues) { + std::shared_ptr type; + std::shared_ptr dict_type; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + type = boolean(); + dict_type = dictionary(index_type, type); + + // no null value + CheckDictionaryComputeNullValues(dict_type, "[]", "[]", 0); + CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[0, 1, 0]", 0); + + // only indices contain null value + CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, 0, 1]", 1); + CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, null]", 2); + + // only dictionary contains null value + CheckDictionaryComputeNullValues(dict_type, "[null, true]", "[]", 0); + CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0]", 2); + + // both indices and dictionary contain null value + CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", + 3); + CheckDictionaryComputeNullValues(dict_type, "[null, true, null, false]", + "[null, 1, 0, 2, 3]", 3); + } +} + void CheckDictionaryCompact(const std::shared_ptr& dict_type, const std::string& input_dictionary_json, const std::string& input_index_json, From 5faa77c26876023f8bd6b2e5f2eaa325f77072de Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 00:18:15 +0800 Subject: [PATCH 03/25] change name --- cpp/src/arrow/array/array_dict_test.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 87821a52c76..b69df67d68f 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1428,7 +1428,7 @@ TEST(TestDictionary, IndicesArray) { ASSERT_OK(arr->indices()->ValidateFull()); } -void CheckDictionaryComputeNullValues(const std::shared_ptr& dict_type, +void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, const std::string& input_dictionary_json, const std::string& input_index_json, const int64_t& expected_null_count) { @@ -1439,7 +1439,7 @@ void CheckDictionaryComputeNullValues(const std::shared_ptr& dict_type ASSERT_EQ(expected_null_count, actual); } -TEST(TestDictionary, ComputeNullValues) { +TEST(TestDictionary, CountNullValues) { std::shared_ptr type; std::shared_ptr dict_type; @@ -1450,21 +1450,21 @@ TEST(TestDictionary, ComputeNullValues) { dict_type = dictionary(index_type, type); // no null value - CheckDictionaryComputeNullValues(dict_type, "[]", "[]", 0); - CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[0, 1, 0]", 0); + CheckDictionaryCountNullValues(dict_type, "[]", "[]", 0); + CheckDictionaryCountNullValues(dict_type, "[true, false]", "[0, 1, 0]", 0); // only indices contain null value - CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, 0, 1]", 1); - CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, null]", 2); + CheckDictionaryCountNullValues(dict_type, "[true, false]", "[null, 0, 1]", 1); + CheckDictionaryCountNullValues(dict_type, "[true, false]", "[null, null]", 2); // only dictionary contains null value - CheckDictionaryComputeNullValues(dict_type, "[null, true]", "[]", 0); - CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0]", 2); + CheckDictionaryCountNullValues(dict_type, "[null, true]", "[]", 0); + CheckDictionaryCountNullValues(dict_type, "[null, true, false]", "[0, 1, 0]", 2); // both indices and dictionary contain null value - CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", + CheckDictionaryCountNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", 3); - CheckDictionaryComputeNullValues(dict_type, "[null, true, null, false]", + CheckDictionaryCountNullValues(dict_type, "[null, true, null, false]", "[null, 1, 0, 2, 3]", 3); } } From 4a9621ff4e7600ca953ec9d0961355df30a36ab5 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 00:19:00 +0800 Subject: [PATCH 04/25] lint --- cpp/src/arrow/array/array_dict_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index b69df67d68f..1ee4dad922d 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1429,9 +1429,9 @@ TEST(TestDictionary, IndicesArray) { } void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, - const std::string& input_dictionary_json, - const std::string& input_index_json, - const int64_t& expected_null_count) { + const std::string& input_dictionary_json, + const std::string& input_index_json, + const int64_t& expected_null_count) { auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); const DictionaryArray& input_ref = checked_cast(*input); @@ -1463,9 +1463,9 @@ TEST(TestDictionary, CountNullValues) { // both indices and dictionary contain null value CheckDictionaryCountNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", - 3); + 3); CheckDictionaryCountNullValues(dict_type, "[null, true, null, false]", - "[null, 1, 0, 2, 3]", 3); + "[null, 1, 0, 2, 3]", 3); } } From 0adeb6b7c2177352b0582be0019e1336c441fc30 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 09:02:35 +0800 Subject: [PATCH 05/25] Update cpp/src/arrow/array/array_dict_test.cc Co-authored-by: Sutou Kouhei --- cpp/src/arrow/array/array_dict_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 1ee4dad922d..78c18aefc78 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1433,7 +1433,7 @@ void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, const std::string& input_index_json, const int64_t& expected_null_count) { auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); - const DictionaryArray& input_ref = checked_cast(*input); + const auto& input_ref = checked_cast(*input); ASSERT_OK_AND_ASSIGN(int64_t actual, input_ref.CountNullValues()); ASSERT_EQ(expected_null_count, actual); From 7015905b3c0341bebe6e8469e6677ae95d9346e3 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 09:02:40 +0800 Subject: [PATCH 06/25] Update cpp/src/arrow/array/array_dict_test.cc Co-authored-by: Sutou Kouhei --- cpp/src/arrow/array/array_dict_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 78c18aefc78..dcd0c5ac789 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1435,7 +1435,7 @@ void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); const auto& input_ref = checked_cast(*input); - ASSERT_OK_AND_ASSIGN(int64_t actual, input_ref.CountNullValues()); + ASSERT_OK_AND_ASSIGN(auto actual, input_ref.CountNullValues()); ASSERT_EQ(expected_null_count, actual); } From 00198a3238f5ef644c210d4b4ba5cd8e5e2e570b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 09:02:46 +0800 Subject: [PATCH 07/25] Update cpp/src/arrow/array/array_dict.cc Co-authored-by: Sutou Kouhei --- cpp/src/arrow/array/array_dict.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index ee0d14c24b4..ce704a95958 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -218,9 +218,9 @@ struct CountDictionaryNullValuesVistor { template Status CountDictionaryNullValuesImpl() { - int64_t index_length = data->length; - int64_t dict_length = data->dictionary->length; - const uint8_t* dictionary_null_bit_map = data->dictionary->GetValues(0); + auto index_length = data->length; + auto dict_length = data->dictionary->length; + const auto* dictionary_null_bit_map = data->dictionary->GetValues(0); using CType = typename IndexArrowType::c_type; const CType* indices_data = data->GetValues(1); From 3050800496fc7277ca004c183092f699e63bfad5 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 09:03:00 +0800 Subject: [PATCH 08/25] Update cpp/src/arrow/array/array_dict.cc Co-authored-by: Sutou Kouhei --- cpp/src/arrow/array/array_dict.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index ce704a95958..8cd24a8d30b 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -249,7 +249,7 @@ struct CountDictionaryNullValuesVistor { } Status Visit(const DataType& type) { - return Status::TypeError("Expected an Index Type of Int or UInt"); + return Status::TypeError("Expected an Index Type of Int or UInt: ", type); } }; From 8b1bd8e88068d2d66143a7c26d4bc7d60b17cec0 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 13 Nov 2023 09:14:55 +0800 Subject: [PATCH 09/25] fix comment --- cpp/src/arrow/array/array_dict.cc | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 8cd24a8d30b..603f4231bd4 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -146,7 +146,7 @@ bool DictionaryArray::CanCompareIndices(const DictionaryArray& other) const { } // ---------------------------------------------------------------------- -// Dictionary transposition +// Dictionary transposition, count null values, compact namespace { @@ -212,12 +212,12 @@ Result> TransposeDictIndices( return out_data; } -struct CountDictionaryNullValuesVistor { +struct CountDictionaryArrayNullValuesVistor { const std::shared_ptr& data; int64_t& out_null_count; template - Status CountDictionaryNullValuesImpl() { + Status CountDictionaryArrayNullValuesImpl() { auto index_length = data->length; auto dict_length = data->dictionary->length; const auto* dictionary_null_bit_map = data->dictionary->GetValues(0); @@ -227,6 +227,7 @@ struct CountDictionaryNullValuesVistor { CType dict_len = static_cast(dict_length); for (int64_t i = 0; i < index_length; i++) { if (data->IsNull(i)) { + out_null_count++; continue; } @@ -245,20 +246,20 @@ struct CountDictionaryNullValuesVistor { template enable_if_integer Visit(const Type&) { - return CountDictionaryNullValuesImpl(); + return CountDictionaryArrayNullValuesImpl(); } Status Visit(const DataType& type) { - return Status::TypeError("Expected an Index Type of Int or UInt: ", type); + return Status::TypeError("Expected an Index Type of Int or UInt, but got a type:", + type); } }; -Result CountDictionaryNullValues(const std::shared_ptr& data) { +Result CountDictionaryArrayNullValues(const std::shared_ptr& data) { int64_t out_null_count = 0; const auto& dict_type = checked_cast(*data->type); - CountDictionaryNullValuesVistor vistor{data, out_null_count}; + CountDictionaryArrayNullValuesVistor vistor{data, out_null_count}; RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); - return out_null_count; } @@ -378,8 +379,8 @@ Result DictionaryArray::CountNullValues() const { return this->indices()->null_count(); } - ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, CountDictionaryNullValues(data_)); - return dictionary_null_count + this->indices()->null_count(); + ARROW_ASSIGN_OR_RAISE(int64_t null_count, CountDictionaryArrayNullValues(data_)); + return null_count; } Result> DictionaryArray::Compact(MemoryPool* pool) const { From 42190449c92359c80118b29251871705addd04cd Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 14 Nov 2023 09:35:52 +0800 Subject: [PATCH 10/25] init dict_util --- cpp/src/arrow/array/data.cc | 4 ++ cpp/src/arrow/util/dict_util.cc | 79 +++++++++++++++++++++++++++++++++ cpp/src/arrow/util/dict_util.h | 28 ++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 cpp/src/arrow/util/dict_util.cc create mode 100644 cpp/src/arrow/util/dict_util.h diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 186682be300..e223a2f1795 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -38,6 +38,7 @@ #include "arrow/util/ree_util.h" #include "arrow/util/slice_util_internal.h" #include "arrow/util/union_util.h" +#include "arrow/util/dict_util.h" namespace arrow { @@ -520,6 +521,9 @@ int64_t ArraySpan::ComputeLogicalNullCount() const { if (t == Type::RUN_END_ENCODED) { return ree_util::LogicalNullCount(*this); } + if(t == Type::DICTIONARY){ + return dict_util::LogicalNullCount(*this); + } return GetNullCount(); } diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc new file mode 100644 index 00000000000..ac7b52970ff --- /dev/null +++ b/cpp/src/arrow/util/dict_util.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/dict_util.h" +#include "array_dict.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace dict_util { + +namespace { + +template +int64_t LogicalNullCount(const ArraySpan& span) { + const auto* dictionary_null_bit_map = span.dictionary().GetValues(0); + + using CType = typename IndexArrowType::c_type; + const CType* indices_data = span.GetValues(1); + auto index_length = span.length; + CType dict_len = static_cast(span.dictionary().length); + int64_t null_count = 0; + for (int64_t i = 0; i < index_length; i++) { + if (span.IsNull(i)) { + null_count++; + continue; + } + + CType current_index = indices_data[i]; + if (!bit_util::GetBit(dictionary_null_bit_map, current_index)) { + null_count++; + } + } + return null_count; +} + +} // namespace + +int64_t LogicalNullCount(const ArraySpan& span) { + if (span.dictionary().GetNullCount() == 0 || span.length == 0) { + return span.GetNullCount(); + } + + const auto& dict_array_type = internal::checked_cast(*span.type); + switch (dict_array_type.index_type()->id()) { + case Type::UINT8: + return LogicalNullCount(span); + case Type::INT8: + return LogicalNullCount(span); + case Type::UINT16: + return LogicalNullCount(span); + case Type::INT16: + return LogicalNullCount(span); + case Type::UINT32: + return LogicalNullCount(span); + case Type::INT32: + return LogicalNullCount(span); + case Type::UINT64: + return LogicalNullCount(span); + default: + return LogicalNullCount(span); + } +} +} // namespace dict_util +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/util/dict_util.h b/cpp/src/arrow/util/dict_util.h new file mode 100644 index 00000000000..a92733ae0f6 --- /dev/null +++ b/cpp/src/arrow/util/dict_util.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/array/data.h" + +namespace arrow { +namespace dict_util { + +int64_t LogicalNullCount(const ArraySpan& span); + +} // namespace dict_util +} // namespace arrow From 4e3c7fecfa057f8327960b2930faae1d67c8bfe2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 14 Nov 2023 09:40:10 +0800 Subject: [PATCH 11/25] indices_null_bit_map --- cpp/src/arrow/util/dict_util.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index ac7b52970ff..068ca1bd871 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -28,6 +28,7 @@ namespace { template int64_t LogicalNullCount(const ArraySpan& span) { const auto* dictionary_null_bit_map = span.dictionary().GetValues(0); + const auto* indices_null_bit_map = span.GetValues(0); using CType = typename IndexArrowType::c_type; const CType* indices_data = span.GetValues(1); @@ -35,7 +36,7 @@ int64_t LogicalNullCount(const ArraySpan& span) { CType dict_len = static_cast(span.dictionary().length); int64_t null_count = 0; for (int64_t i = 0; i < index_length; i++) { - if (span.IsNull(i)) { + if (!bit_util::GetBit(indices_null_bit_map, i)) { null_count++; continue; } From 60bcd37474a54033c0751db228e2d5ace9c6979b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 14 Nov 2023 09:40:43 +0800 Subject: [PATCH 12/25] sequence --- cpp/src/arrow/util/dict_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index 068ca1bd871..14963fed087 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -27,8 +27,8 @@ namespace { template int64_t LogicalNullCount(const ArraySpan& span) { - const auto* dictionary_null_bit_map = span.dictionary().GetValues(0); const auto* indices_null_bit_map = span.GetValues(0); + const auto* dictionary_null_bit_map = span.dictionary().GetValues(0); using CType = typename IndexArrowType::c_type; const CType* indices_data = span.GetValues(1); From 186b0478b0e788be127a67cb6dc1d251d6140a10 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:23:36 +0800 Subject: [PATCH 13/25] add test --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/array_test.cc | 46 +++++++++++++++++++++++++++++++ cpp/src/arrow/array/data.cc | 14 ++++++++-- cpp/src/arrow/array/data.h | 14 ++++++++-- cpp/src/arrow/util/dict_util.cc | 7 ++--- 5 files changed, 72 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a611701153..df6f64f9b5c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -223,6 +223,7 @@ set(ARROW_SRCS util/debug.cc util/decimal.cc util/delimiting.cc + util/dict_util.cc util/formatting.cc util/future.cc util/hashing.cc diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 46908439ef5..3a1c04aa305 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -80,6 +80,21 @@ class TestArray : public ::testing::Test { MemoryPool* pool_; }; +void CheckDictionaryNullCount(const std::shared_ptr& dict_type, + const std::string& input_dictionary_json, + const std::string& input_index_json, + const int64_t& expected_null_count, + const int64_t& expected_logical_null_count, + bool expected_may_have_nulls, + bool expected_may_have_logical_nulls) { + std::shared_ptr arr = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); + + ASSERT_EQ(expected_null_count, arr->null_count()); + ASSERT_EQ(expected_logical_null_count, arr->ComputeLogicalNullCount()); + ASSERT_EQ(expected_may_have_nulls, arr->data()->MayHaveNulls()); + ASSERT_EQ(expected_may_have_logical_nulls, arr->data()->MayHaveLogicalNulls()); +} + TEST_F(TestArray, TestNullCount) { // These are placeholders auto data = std::make_shared(nullptr, 0); @@ -127,6 +142,37 @@ TEST_F(TestArray, TestNullCount) { ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount()); ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls()); ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls()); + + // dictionary type + std::shared_ptr type; + std::shared_ptr dict_type; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + type = boolean(); + dict_type = dictionary(index_type, type); + // no null value + CheckDictionaryNullCount(dict_type, "[]", "[]", 0, 0, false, false); + CheckDictionaryNullCount(dict_type, "[true, false]", "[0, 1, 0]", 0, 0, false, false); + + // only indices contain null value + CheckDictionaryNullCount(dict_type, "[true, false]", "[null, 0, 1]", 1, 1, true, + true); + CheckDictionaryNullCount(dict_type, "[true, false]", "[null, null]", 2, 2, true, + true); + + // only dictionary contains null value + CheckDictionaryNullCount(dict_type, "[null, true]", "[]", 0, 0, false, true); + CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0]", 0, 2, false, + true); + + // both indices and dictionary contain null value + CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0, null]", 1, 3, + true, true); + CheckDictionaryNullCount(dict_type, "[null, true, null, false]", "[null, 1, 0, 2, 3]", + 1, 3, true, true); + } } TEST_F(TestArray, TestSlicePreservesAllNullCount) { diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index e223a2f1795..3959bf6cc18 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -33,12 +33,12 @@ #include "arrow/type_traits.h" #include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/dict_util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ree_util.h" #include "arrow/util/slice_util_internal.h" #include "arrow/util/union_util.h" -#include "arrow/util/dict_util.h" namespace arrow { @@ -94,6 +94,10 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } +bool DictionaryMayHaveLogicalNulls(const ArrayData& data) { + return ArraySpan(data).MayHaveLogicalNulls(); +} + BufferSpan PackVariadicBuffers(util::span> buffers) { return {const_cast(reinterpret_cast(buffers.data())), static_cast(buffers.size() * sizeof(std::shared_ptr))}; @@ -175,7 +179,7 @@ int64_t ArrayData::GetNullCount() const { } int64_t ArrayData::ComputeLogicalNullCount() const { - if (this->buffers[0]) { + if (this->buffers[0] && this->type->id() != Type::DICTIONARY) { return GetNullCount(); } return ArraySpan(*this).ComputeLogicalNullCount(); @@ -521,7 +525,7 @@ int64_t ArraySpan::ComputeLogicalNullCount() const { if (t == Type::RUN_END_ENCODED) { return ree_util::LogicalNullCount(*this); } - if(t == Type::DICTIONARY){ + if (t == Type::DICTIONARY) { return dict_util::LogicalNullCount(*this); } return GetNullCount(); @@ -621,6 +625,10 @@ bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const { return ree_util::ValuesArray(*this).MayHaveLogicalNulls(); } +bool ArraySpan::DictionaryMayHaveLogicalNulls() const { + return this->GetNullCount() != 0 || this->dictionary().GetNullCount() != 0; +} + // ---------------------------------------------------------------------- // Implement internal::GetArrayView diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 40a77640cd1..4c2df838149 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -38,7 +38,7 @@ struct ArrayData; namespace internal { // ---------------------------------------------------------------------- -// Null handling for types without a validity bitmap +// Null handling for types without a validity bitmap and the dictionary type ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i); ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i); @@ -46,6 +46,7 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); +ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); } // namespace internal // When slicing, we do not know the null count of the sliced range without @@ -280,7 +281,7 @@ struct ARROW_EXPORT ArrayData { /// \brief Return true if the validity bitmap may have 0's in it, or if the /// child arrays (in the case of types without a validity bitmap) may have - /// nulls + /// nulls, or if the dictionary of dictionay array may have nulls. /// /// This is not a drop-in replacement for MayHaveNulls, as historically /// MayHaveNulls() has been used to check for the presence of a validity @@ -325,6 +326,9 @@ struct ARROW_EXPORT ArrayData { if (t == Type::RUN_END_ENCODED) { return internal::RunEndEncodedMayHaveLogicalNulls(*this); } + if (t == Type::DICTIONARY) { + return internal::DictionaryMayHaveLogicalNulls(*this); + } return null_count.load() != 0; } @@ -505,7 +509,7 @@ struct ARROW_EXPORT ArraySpan { /// \brief Return true if the validity bitmap may have 0's in it, or if the /// child arrays (in the case of types without a validity bitmap) may have - /// nulls + /// nulls, or if the dictionary of dictionay array may have nulls. /// /// \see ArrayData::MayHaveLogicalNulls bool MayHaveLogicalNulls() const { @@ -519,6 +523,9 @@ struct ARROW_EXPORT ArraySpan { if (t == Type::RUN_END_ENCODED) { return RunEndEncodedMayHaveLogicalNulls(); } + if (t == Type::DICTIONARY) { + return DictionaryMayHaveLogicalNulls(); + } return null_count != 0; } @@ -560,6 +567,7 @@ struct ARROW_EXPORT ArraySpan { bool UnionMayHaveLogicalNulls() const; bool RunEndEncodedMayHaveLogicalNulls() const; + bool DictionaryMayHaveLogicalNulls() const; }; namespace internal { diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index 14963fed087..7d13d94289c 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -16,7 +16,7 @@ // under the License. #include "arrow/util/dict_util.h" -#include "array_dict.h" +#include "arrow/array/array_dict.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" @@ -33,10 +33,9 @@ int64_t LogicalNullCount(const ArraySpan& span) { using CType = typename IndexArrowType::c_type; const CType* indices_data = span.GetValues(1); auto index_length = span.length; - CType dict_len = static_cast(span.dictionary().length); int64_t null_count = 0; for (int64_t i = 0; i < index_length; i++) { - if (!bit_util::GetBit(indices_null_bit_map, i)) { + if (indices_null_bit_map != nullptr && !bit_util::GetBit(indices_null_bit_map, i)) { null_count++; continue; } @@ -56,7 +55,7 @@ int64_t LogicalNullCount(const ArraySpan& span) { return span.GetNullCount(); } - const auto& dict_array_type = internal::checked_cast(*span.type); + const auto& dict_array_type = internal::checked_cast(*span.type); switch (dict_array_type.index_type()->id()) { case Type::UINT8: return LogicalNullCount(span); From 8663dbc90d24ebd242952097fb19c066e97a6212 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:47:22 +0800 Subject: [PATCH 14/25] Revert "fix comment" This reverts commit 8b1bd8e88068d2d66143a7c26d4bc7d60b17cec0. --- cpp/src/arrow/array/array_dict.cc | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 603f4231bd4..8cd24a8d30b 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -146,7 +146,7 @@ bool DictionaryArray::CanCompareIndices(const DictionaryArray& other) const { } // ---------------------------------------------------------------------- -// Dictionary transposition, count null values, compact +// Dictionary transposition namespace { @@ -212,12 +212,12 @@ Result> TransposeDictIndices( return out_data; } -struct CountDictionaryArrayNullValuesVistor { +struct CountDictionaryNullValuesVistor { const std::shared_ptr& data; int64_t& out_null_count; template - Status CountDictionaryArrayNullValuesImpl() { + Status CountDictionaryNullValuesImpl() { auto index_length = data->length; auto dict_length = data->dictionary->length; const auto* dictionary_null_bit_map = data->dictionary->GetValues(0); @@ -227,7 +227,6 @@ struct CountDictionaryArrayNullValuesVistor { CType dict_len = static_cast(dict_length); for (int64_t i = 0; i < index_length; i++) { if (data->IsNull(i)) { - out_null_count++; continue; } @@ -246,20 +245,20 @@ struct CountDictionaryArrayNullValuesVistor { template enable_if_integer Visit(const Type&) { - return CountDictionaryArrayNullValuesImpl(); + return CountDictionaryNullValuesImpl(); } Status Visit(const DataType& type) { - return Status::TypeError("Expected an Index Type of Int or UInt, but got a type:", - type); + return Status::TypeError("Expected an Index Type of Int or UInt: ", type); } }; -Result CountDictionaryArrayNullValues(const std::shared_ptr& data) { +Result CountDictionaryNullValues(const std::shared_ptr& data) { int64_t out_null_count = 0; const auto& dict_type = checked_cast(*data->type); - CountDictionaryArrayNullValuesVistor vistor{data, out_null_count}; + CountDictionaryNullValuesVistor vistor{data, out_null_count}; RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); + return out_null_count; } @@ -379,8 +378,8 @@ Result DictionaryArray::CountNullValues() const { return this->indices()->null_count(); } - ARROW_ASSIGN_OR_RAISE(int64_t null_count, CountDictionaryArrayNullValues(data_)); - return null_count; + ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, CountDictionaryNullValues(data_)); + return dictionary_null_count + this->indices()->null_count(); } Result> DictionaryArray::Compact(MemoryPool* pool) const { From b5553a9aca03e76b9fcc16eef26fa87010f64c25 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:47:35 +0800 Subject: [PATCH 15/25] Revert "Update cpp/src/arrow/array/array_dict.cc" This reverts commit 3050800496fc7277ca004c183092f699e63bfad5. --- cpp/src/arrow/array/array_dict.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 8cd24a8d30b..ce704a95958 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -249,7 +249,7 @@ struct CountDictionaryNullValuesVistor { } Status Visit(const DataType& type) { - return Status::TypeError("Expected an Index Type of Int or UInt: ", type); + return Status::TypeError("Expected an Index Type of Int or UInt"); } }; From 5072e98b565ccd5cb20b27f8f76e576db4c0a001 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:47:41 +0800 Subject: [PATCH 16/25] Revert "Update cpp/src/arrow/array/array_dict.cc" This reverts commit 00198a3238f5ef644c210d4b4ba5cd8e5e2e570b. --- cpp/src/arrow/array/array_dict.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index ce704a95958..ee0d14c24b4 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -218,9 +218,9 @@ struct CountDictionaryNullValuesVistor { template Status CountDictionaryNullValuesImpl() { - auto index_length = data->length; - auto dict_length = data->dictionary->length; - const auto* dictionary_null_bit_map = data->dictionary->GetValues(0); + int64_t index_length = data->length; + int64_t dict_length = data->dictionary->length; + const uint8_t* dictionary_null_bit_map = data->dictionary->GetValues(0); using CType = typename IndexArrowType::c_type; const CType* indices_data = data->GetValues(1); From dd95225c8ce002b3175fbb1ddc5013e4f382a5f1 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:47:48 +0800 Subject: [PATCH 17/25] Revert "Update cpp/src/arrow/array/array_dict_test.cc" This reverts commit 7015905b3c0341bebe6e8469e6677ae95d9346e3. --- cpp/src/arrow/array/array_dict_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index dcd0c5ac789..78c18aefc78 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1435,7 +1435,7 @@ void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); const auto& input_ref = checked_cast(*input); - ASSERT_OK_AND_ASSIGN(auto actual, input_ref.CountNullValues()); + ASSERT_OK_AND_ASSIGN(int64_t actual, input_ref.CountNullValues()); ASSERT_EQ(expected_null_count, actual); } From 5f75ef431c1349e24470d936a4495d94151b55d2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:47:52 +0800 Subject: [PATCH 18/25] Revert "Update cpp/src/arrow/array/array_dict_test.cc" This reverts commit 0adeb6b7c2177352b0582be0019e1336c441fc30. --- cpp/src/arrow/array/array_dict_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 78c18aefc78..1ee4dad922d 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1433,7 +1433,7 @@ void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, const std::string& input_index_json, const int64_t& expected_null_count) { auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); - const auto& input_ref = checked_cast(*input); + const DictionaryArray& input_ref = checked_cast(*input); ASSERT_OK_AND_ASSIGN(int64_t actual, input_ref.CountNullValues()); ASSERT_EQ(expected_null_count, actual); From fcc15056d20d37d3b2f46af0224cfda3a4317ba6 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:47:56 +0800 Subject: [PATCH 19/25] Revert "lint" This reverts commit 4a9621ff4e7600ca953ec9d0961355df30a36ab5. --- cpp/src/arrow/array/array_dict_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 1ee4dad922d..b69df67d68f 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1429,9 +1429,9 @@ TEST(TestDictionary, IndicesArray) { } void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, - const std::string& input_dictionary_json, - const std::string& input_index_json, - const int64_t& expected_null_count) { + const std::string& input_dictionary_json, + const std::string& input_index_json, + const int64_t& expected_null_count) { auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); const DictionaryArray& input_ref = checked_cast(*input); @@ -1463,9 +1463,9 @@ TEST(TestDictionary, CountNullValues) { // both indices and dictionary contain null value CheckDictionaryCountNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", - 3); + 3); CheckDictionaryCountNullValues(dict_type, "[null, true, null, false]", - "[null, 1, 0, 2, 3]", 3); + "[null, 1, 0, 2, 3]", 3); } } From 34d8856fa0392ee4b10fabd7bb9068bb7cd05bd0 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:48:07 +0800 Subject: [PATCH 20/25] Revert "change name" This reverts commit 5faa77c26876023f8bd6b2e5f2eaa325f77072de. --- cpp/src/arrow/array/array_dict_test.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index b69df67d68f..87821a52c76 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1428,7 +1428,7 @@ TEST(TestDictionary, IndicesArray) { ASSERT_OK(arr->indices()->ValidateFull()); } -void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, +void CheckDictionaryComputeNullValues(const std::shared_ptr& dict_type, const std::string& input_dictionary_json, const std::string& input_index_json, const int64_t& expected_null_count) { @@ -1439,7 +1439,7 @@ void CheckDictionaryCountNullValues(const std::shared_ptr& dict_type, ASSERT_EQ(expected_null_count, actual); } -TEST(TestDictionary, CountNullValues) { +TEST(TestDictionary, ComputeNullValues) { std::shared_ptr type; std::shared_ptr dict_type; @@ -1450,21 +1450,21 @@ TEST(TestDictionary, CountNullValues) { dict_type = dictionary(index_type, type); // no null value - CheckDictionaryCountNullValues(dict_type, "[]", "[]", 0); - CheckDictionaryCountNullValues(dict_type, "[true, false]", "[0, 1, 0]", 0); + CheckDictionaryComputeNullValues(dict_type, "[]", "[]", 0); + CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[0, 1, 0]", 0); // only indices contain null value - CheckDictionaryCountNullValues(dict_type, "[true, false]", "[null, 0, 1]", 1); - CheckDictionaryCountNullValues(dict_type, "[true, false]", "[null, null]", 2); + CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, 0, 1]", 1); + CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, null]", 2); // only dictionary contains null value - CheckDictionaryCountNullValues(dict_type, "[null, true]", "[]", 0); - CheckDictionaryCountNullValues(dict_type, "[null, true, false]", "[0, 1, 0]", 2); + CheckDictionaryComputeNullValues(dict_type, "[null, true]", "[]", 0); + CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0]", 2); // both indices and dictionary contain null value - CheckDictionaryCountNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", + CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", 3); - CheckDictionaryCountNullValues(dict_type, "[null, true, null, false]", + CheckDictionaryComputeNullValues(dict_type, "[null, true, null, false]", "[null, 1, 0, 2, 3]", 3); } } From fecda422f02c61ce0b230b4681c2a2d7671ca8a3 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:48:11 +0800 Subject: [PATCH 21/25] Revert "add test" This reverts commit f6e8958382bc6dc314a87a8fd7dc11ffa96a1162. --- cpp/src/arrow/array/array_dict.cc | 15 +++++----- cpp/src/arrow/array/array_dict_test.cc | 41 -------------------------- 2 files changed, 8 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index ee0d14c24b4..0b714ddc2b9 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -212,12 +212,12 @@ Result> TransposeDictIndices( return out_data; } -struct CountDictionaryNullValuesVistor { +struct CompactDictionaryNullValuesVistor { const std::shared_ptr& data; int64_t& out_null_count; template - Status CountDictionaryNullValuesImpl() { + Status CompactDictionaryNullValuesImpl() { int64_t index_length = data->length; int64_t dict_length = data->dictionary->length; const uint8_t* dictionary_null_bit_map = data->dictionary->GetValues(0); @@ -245,7 +245,7 @@ struct CountDictionaryNullValuesVistor { template enable_if_integer Visit(const Type&) { - return CountDictionaryNullValuesImpl(); + return CompactDictionaryNullValuesImpl(); } Status Visit(const DataType& type) { @@ -253,10 +253,10 @@ struct CountDictionaryNullValuesVistor { } }; -Result CountDictionaryNullValues(const std::shared_ptr& data) { +Result CompactDictionaryNullValues(const std::shared_ptr& data) { int64_t out_null_count = 0; const auto& dict_type = checked_cast(*data->type); - CountDictionaryNullValuesVistor vistor{data, out_null_count}; + CompactDictionaryNullValuesVistor vistor{data, out_null_count}; RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); return out_null_count; @@ -374,11 +374,12 @@ Result> DictionaryArray::Transpose( } Result DictionaryArray::CountNullValues() const { - if (this->dictionary()->null_count() == 0 || this->indices()->length() == 0) { + if (this->dictionary()->null_count() == 0 || this->indices()->null_count() == 0) { return this->indices()->null_count(); } - ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, CountDictionaryNullValues(data_)); + ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, + CompactDictionaryNullValues(data_)); return dictionary_null_count + this->indices()->null_count(); } diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 87821a52c76..2f3ee6e2d49 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1428,47 +1428,6 @@ TEST(TestDictionary, IndicesArray) { ASSERT_OK(arr->indices()->ValidateFull()); } -void CheckDictionaryComputeNullValues(const std::shared_ptr& dict_type, - const std::string& input_dictionary_json, - const std::string& input_index_json, - const int64_t& expected_null_count) { - auto input = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); - const DictionaryArray& input_ref = checked_cast(*input); - - ASSERT_OK_AND_ASSIGN(int64_t actual, input_ref.CountNullValues()); - ASSERT_EQ(expected_null_count, actual); -} - -TEST(TestDictionary, ComputeNullValues) { - std::shared_ptr type; - std::shared_ptr dict_type; - - for (const auto& index_type : all_dictionary_index_types()) { - ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - - type = boolean(); - dict_type = dictionary(index_type, type); - - // no null value - CheckDictionaryComputeNullValues(dict_type, "[]", "[]", 0); - CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[0, 1, 0]", 0); - - // only indices contain null value - CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, 0, 1]", 1); - CheckDictionaryComputeNullValues(dict_type, "[true, false]", "[null, null]", 2); - - // only dictionary contains null value - CheckDictionaryComputeNullValues(dict_type, "[null, true]", "[]", 0); - CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0]", 2); - - // both indices and dictionary contain null value - CheckDictionaryComputeNullValues(dict_type, "[null, true, false]", "[0, 1, 0, null]", - 3); - CheckDictionaryComputeNullValues(dict_type, "[null, true, null, false]", - "[null, 1, 0, 2, 3]", 3); - } -} - void CheckDictionaryCompact(const std::shared_ptr& dict_type, const std::string& input_dictionary_json, const std::string& input_index_json, From 1cb3381ed20d5fe9f8eecc092691b59e25656455 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:48:38 +0800 Subject: [PATCH 22/25] Revert "ComputeNullValues" This reverts commit 66e1c1ff276a5fe37bc6fc8027a41e12784e0f30. --- cpp/src/arrow/array/array_dict.cc | 60 ------------------------------- cpp/src/arrow/array/array_dict.h | 9 ----- 2 files changed, 69 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 0b714ddc2b9..28fccdbfcff 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -212,56 +212,6 @@ Result> TransposeDictIndices( return out_data; } -struct CompactDictionaryNullValuesVistor { - const std::shared_ptr& data; - int64_t& out_null_count; - - template - Status CompactDictionaryNullValuesImpl() { - int64_t index_length = data->length; - int64_t dict_length = data->dictionary->length; - const uint8_t* dictionary_null_bit_map = data->dictionary->GetValues(0); - - using CType = typename IndexArrowType::c_type; - const CType* indices_data = data->GetValues(1); - CType dict_len = static_cast(dict_length); - for (int64_t i = 0; i < index_length; i++) { - if (data->IsNull(i)) { - continue; - } - - CType current_index = indices_data[i]; - if (current_index < 0 || current_index >= dict_len) { - return Status::IndexError( - "Index out of bounds while counting dictionary array: ", current_index, - "(dictionary is ", dict_length, " long) at position ", i); - } - if (!bit_util::GetBit(dictionary_null_bit_map, current_index)) { - out_null_count++; - } - } - return Status::OK(); - } - - template - enable_if_integer Visit(const Type&) { - return CompactDictionaryNullValuesImpl(); - } - - Status Visit(const DataType& type) { - return Status::TypeError("Expected an Index Type of Int or UInt"); - } -}; - -Result CompactDictionaryNullValues(const std::shared_ptr& data) { - int64_t out_null_count = 0; - const auto& dict_type = checked_cast(*data->type); - CompactDictionaryNullValuesVistor vistor{data, out_null_count}; - RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), &vistor)); - - return out_null_count; -} - struct CompactTransposeMapVistor { const std::shared_ptr& data; arrow::MemoryPool* pool; @@ -373,16 +323,6 @@ Result> DictionaryArray::Transpose( return MakeArray(std::move(transposed)); } -Result DictionaryArray::CountNullValues() const { - if (this->dictionary()->null_count() == 0 || this->indices()->null_count() == 0) { - return this->indices()->null_count(); - } - - ARROW_ASSIGN_OR_RAISE(int64_t dictionary_null_count, - CompactDictionaryNullValues(data_)); - return dictionary_null_count + this->indices()->null_count(); -} - Result> DictionaryArray::Compact(MemoryPool* pool) const { std::shared_ptr compact_dictionary; ARROW_ASSIGN_OR_RAISE(std::unique_ptr transpose_map, diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index bbc224c3d86..9aa0a7bcc2d 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -96,15 +96,6 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& type, const std::shared_ptr& dictionary, const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const; - /// \brief Count the number of null values as the dictionary array is decoded. - Result CountNullValues() const; - - /// \brief Compat this DictionaryArray - /// - /// This method returns a compacted dictionary array. All the - /// values in the dictionary are referenced by indices. - /// - /// \param[in] pool a pool to allocate the array data from Result> Compact(MemoryPool* pool = default_memory_pool()) const; /// \brief Determine whether dictionary arrays may be compared without unification From 686670a09d01e4022e62242cc1f5de731e03cef3 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 15 Nov 2023 09:58:31 +0800 Subject: [PATCH 23/25] lint --- cpp/src/arrow/array/array_test.cc | 3 ++- cpp/src/arrow/util/dict_util.cc | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 3a1c04aa305..63fa5554558 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -87,7 +87,8 @@ void CheckDictionaryNullCount(const std::shared_ptr& dict_type, const int64_t& expected_logical_null_count, bool expected_may_have_nulls, bool expected_may_have_logical_nulls) { - std::shared_ptr arr = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); + std::shared_ptr arr = + DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); ASSERT_EQ(expected_null_count, arr->null_count()); ASSERT_EQ(expected_logical_null_count, arr->ComputeLogicalNullCount()); diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index 7d13d94289c..76e511f3b03 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -76,4 +76,4 @@ int64_t LogicalNullCount(const ArraySpan& span) { } } } // namespace dict_util -} // namespace arrow \ No newline at end of file +} // namespace arrow From 3c0b315621366c80c8eab03118dca20f0cfe7ffc Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 16 Nov 2023 08:38:45 +0800 Subject: [PATCH 24/25] Update cpp/src/arrow/array/array_test.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/array/array_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 63fa5554558..3a160bd81f4 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -90,10 +90,10 @@ void CheckDictionaryNullCount(const std::shared_ptr& dict_type, std::shared_ptr arr = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); - ASSERT_EQ(expected_null_count, arr->null_count()); - ASSERT_EQ(expected_logical_null_count, arr->ComputeLogicalNullCount()); - ASSERT_EQ(expected_may_have_nulls, arr->data()->MayHaveNulls()); - ASSERT_EQ(expected_may_have_logical_nulls, arr->data()->MayHaveLogicalNulls()); + ASSERT_EQ(arr->null_count(), expected_null_count); + ASSERT_EQ(arr->ComputeLogicalNullCount(), expected_logical_null_count); + ASSERT_EQ(arr->data()->MayHaveNulls(), expected_may_have_nulls); + ASSERT_EQ(arr->data()->MayHaveLogicalNulls(), expected_may_have_logical_nulls); } TEST_F(TestArray, TestNullCount) { From ed46f284dc17e7efa45ffa2d82a72e185b16f5ca Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 16 Nov 2023 09:44:12 +0800 Subject: [PATCH 25/25] slice --- cpp/src/arrow/array/array_test.cc | 12 +++++++++++- cpp/src/arrow/util/dict_util.cc | 14 ++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 63fa5554558..d51b2f0c401 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -144,7 +144,7 @@ TEST_F(TestArray, TestNullCount) { ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls()); ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls()); - // dictionary type + // Dictionary type std::shared_ptr type; std::shared_ptr dict_type; @@ -184,6 +184,16 @@ TEST_F(TestArray, TestSlicePreservesAllNullCount) { Int32Array arr(/*length=*/100, data, null_bitmap, /*null_count*/ 100); EXPECT_EQ(arr.Slice(1, 99)->data()->null_count, arr.Slice(1, 99)->length()); + + // Dictionary type + std::shared_ptr dict_type = dictionary(int64(), boolean()); + std::shared_ptr dict_arr = + DictArrayFromJSON(dict_type, /*indices=*/"[null, 0, 0, 0, 0, 0, 1, 2, 0, 0]", + /*dictionary=*/"[null, true, false]"); + ASSERT_EQ(dict_arr->null_count(), 1); + ASSERT_EQ(dict_arr->ComputeLogicalNullCount(), 8); + ASSERT_EQ(dict_arr->Slice(2, 8)->null_count(), 0); + ASSERT_EQ(dict_arr->Slice(2, 8)->ComputeLogicalNullCount(), 6); } TEST_F(TestArray, TestLength) { diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index 76e511f3b03..feab2324a40 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -27,21 +27,23 @@ namespace { template int64_t LogicalNullCount(const ArraySpan& span) { - const auto* indices_null_bit_map = span.GetValues(0); - const auto* dictionary_null_bit_map = span.dictionary().GetValues(0); + const auto* indices_null_bit_map = span.buffers[0].data; + const auto& dictionary_span = span.dictionary(); + const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data; using CType = typename IndexArrowType::c_type; const CType* indices_data = span.GetValues(1); - auto index_length = span.length; int64_t null_count = 0; - for (int64_t i = 0; i < index_length; i++) { - if (indices_null_bit_map != nullptr && !bit_util::GetBit(indices_null_bit_map, i)) { + for (int64_t i = 0; i < span.length; i++) { + if (indices_null_bit_map != nullptr && + !bit_util::GetBit(indices_null_bit_map, i + span.offset)) { null_count++; continue; } CType current_index = indices_data[i]; - if (!bit_util::GetBit(dictionary_null_bit_map, current_index)) { + if (!bit_util::GetBit(dictionary_null_bit_map, + current_index + dictionary_span.offset)) { null_count++; } }