From 771536add5b0f9427c248b2ddeda65dd52989589 Mon Sep 17 00:00:00 2001
From: Ariana Villegas
Date: Wed, 8 Jun 2022 12:48:53 -0500
Subject: [PATCH 01/11] Draft sort indices on array dictionary

---
 cpp/src/arrow/array/array_dict.h | 4 ++
 .../compute/kernels/vector_array_sort.cc | 57 ++++++++++++++++++-
 cpp/src/arrow/compute/kernels/vector_sort.cc | 8 +--
 .../compute/kernels/vector_sort_internal.h | 8 +--
 .../arrow/compute/kernels/vector_sort_test.cc | 11 ++++
 5 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h
index 8791eaa07db..207cffe54c8 100644
--- a/cpp/src/arrow/array/array_dict.h
+++ b/cpp/src/arrow/array/array_dict.h
@@ -111,6 +111,10 @@ class ARROW_EXPORT DictionaryArray : public Array {
 
   const DictionaryType* dict_type() const { return dict_type_; }
 
+  bool IsNull(int64_t i) const {  // True when slot i of the indices is null, or the dictionary entry it refers to is null.
+    return indices_->IsNull(i) || dictionary_->IsNull(GetValueIndex(i));  // || short-circuits: GetValueIndex(i) is only evaluated for a non-null index slot.
+  }
+
  private:
   void SetData(const std::shared_ptr& data);
   const DictionaryType* dict_type_;
diff --git a/cpp/src/arrow/compute/kernels/vector_array_sort.cc b/cpp/src/arrow/compute/kernels/vector_array_sort.cc
index 324a435441f..8e65308602d 100644
--- a/cpp/src/arrow/compute/kernels/vector_array_sort.cc
+++ b/cpp/src/arrow/compute/kernels/vector_array_sort.cc
@@ -173,6 +173,52 @@ class ArrayCompareSorter {
   }
 };
 
+template <>  // NOTE(review): template arguments look stripped throughout this copy of the patch; presumably this is the dictionary-type specialization -- confirm against the real commit.
+class ArrayCompareSorter {  // Sorts the index range of a dictionary-encoded array by ranking the dictionary values first.
+ public:
+  NullPartitionResult operator()(uint64_t* indices_begin, uint64_t* indices_end,  // Reorders [indices_begin, indices_end) in place and returns the null partition computed below.
+                                 const Array& array, int64_t offset,  // `offset` is subtracted from each stored index before it is used to read the array (see the comparator).
+                                 const ArraySortOptions& options) {  // `options` supplies null_placement and is forwarded to the nested array_sort_indices call.
+    const auto& dict_array = checked_cast(array);
+
+    const auto& indices = checked_cast(*(dict_array.indices()));
+    const auto& values = dict_array.dictionary();
+
+    const auto p = PartitionNulls(  // Partition null entries first; only [p.non_nulls_begin, p.non_nulls_end) is sorted afterwards.
+        indices_begin, indices_end, dict_array, offset, options.null_placement);
+
+    auto indices_array =  // Sort order of the dictionary values themselves, obtained through the compute function registry.
+        CallFunction("array_sort_indices", {values}, &options).ValueOrDie().make_array();  // NOTE(review): ValueOrDie() presumably aborts instead of propagating an error status -- confirm this is acceptable here.
+    const auto& indices_values = checked_cast(*indices_array);
+
+    std::vector sort_order(indices_values.length());  // sort_order[k] receives the rank assigned to dictionary entry k by the loop below.
+    uint64_t cur = 0;  // Rank currently being handed out: the position where the current run of equal-comparing values started.
+    auto cur_idx = GetViewType::LogicalValue(indices_values.GetView(cur));  // NOTE(review): reads element 0 before the loop runs -- confirm behaviour when the dictionary is empty.
+    auto cur_val = values->GetScalar(cur_idx);
+    for (int i = 0; i < indices_values.length(); i++) {  // NOTE(review): int counter compared against length() -- confirm the width is sufficient.
+      auto tmp_idx = GetViewType::LogicalValue(indices_values.GetView(i));
+      auto tmp_val = values->GetScalar(tmp_idx);
+      if (cur_val != tmp_val) {  // NOTE(review): != is applied to GetScalar() results -- verify this compares scalar values and not handles/result wrappers.
+        cur = i;  // A differing value starts a new rank at its sorted position.
+        cur_val = tmp_val;
+      }
+      sort_order[tmp_idx] = cur;  // Entries that compare equal to the run's first value share that run's rank.
+    }
+
+    std::stable_sort(  // Stable, so entries with equal rank keep their relative order from the partition step.
+        p.non_nulls_begin, p.non_nulls_end,
+        [&indices, &sort_order, &offset](uint64_t left, uint64_t right) {  // Orders two stored indices by the rank of the dictionary entry each one refers to.
+          const auto lhs =
+              GetViewType::LogicalValue(indices.GetView(left - offset));
+          const auto rhs =
+              GetViewType::LogicalValue(indices.GetView(right - offset));
+          return sort_order[lhs] < sort_order[rhs];
+        });
+
+    return p;
+  }
+};
+
 template
 class ArrayCountSorter {
   using ArrayType = typename TypeTraits::ArrayType;
@@ -405,7 +451,8 @@ struct ArraySorter::value &&
 template
 struct ArraySorter<
     Type, enable_if_t::value || is_base_binary_type::value ||
-                          is_fixed_size_binary_type::value>> {
+                          is_fixed_size_binary_type::value ||
+                          is_dictionary_type::value>> {  // Dictionary types now also select the ArrayCompareSorter-backed ArraySorter.
   ArrayCompareSorter impl;
 };
 
@@ -507,6 +554,13 @@ void AddArraySortingKernels(VectorKernel base, VectorFunction* func) {
   DCHECK_OK(func->AddKernel(base));
 }
 
+template  // NOTE(review): the patch is cut off here in this view; the rest of this hunk and the remaining files are not visible.