From ce5953331363ee302f60161d02d45eb42aafc7be Mon Sep 17 00:00:00 2001 From: Ariana Villegas Date: Wed, 8 Jun 2022 12:48:53 -0500 Subject: [PATCH 01/11] Changes from original PR Draft sort indices on array dictionary Update sort indices Update sorter Address feedback Address feedback Add tests and benchmarks Some nits re:benchmarking Some nits Use the faster algorithm (rank-then-take-then-sort) Remove unused code More TODOs --- cpp/src/arrow/array/array_dict.h | 4 + .../compute/kernels/vector_array_sort.cc | 83 ++++++++++++++++++- cpp/src/arrow/compute/kernels/vector_sort.cc | 8 +- .../compute/kernels/vector_sort_benchmark.cc | 70 ++++++++++++++++ .../compute/kernels/vector_sort_internal.h | 8 +- .../arrow/compute/kernels/vector_sort_test.cc | 71 ++++++++++++++++ cpp/src/arrow/util/benchmark_util.h | 2 +- 7 files changed, 236 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index 8791eaa07db..676ab87b8f3 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -111,6 +111,10 @@ class ARROW_EXPORT DictionaryArray : public Array { const DictionaryType* dict_type() const { return dict_type_; } + bool IsNull(int64_t i) const { + return indices_->IsNull(i) || dictionary()->IsNull(GetValueIndex(i)); + } + private: void SetData(const std::shared_ptr& data); const DictionaryType* dict_type_; diff --git a/cpp/src/arrow/compute/kernels/vector_array_sort.cc b/cpp/src/arrow/compute/kernels/vector_array_sort.cc index ccf691669b8..0d5104e695e 100644 --- a/cpp/src/arrow/compute/kernels/vector_array_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_array_sort.cc @@ -173,6 +173,78 @@ class ArrayCompareSorter { } }; +template <> +class ArrayCompareSorter { + public: + NullPartitionResult operator()(uint64_t* indices_begin, uint64_t* indices_end, + const Array& array, int64_t offset, + const ArraySortOptions& options) { + const auto& dict_array = checked_cast(array); + const auto& dict_values = dict_array.dictionary(); + const auto& dict_indices = dict_array.indices(); + + // Algorithm: + // 1) Use the Rank function to get an exactly-equivalent-order array + // of the dictionary values, but with a datatype that's friendlier to + // sorting (uint64). + // 2) Act as if we were sorting a dictionary array with the same indices, + // but with the ranks as dictionary values. + // 2a) Dictionary-decode the ranks by calling Take. + // 2b) Sort the decoded ranks. Not only those are uint64, they are dense + // in a [0, k) range where k is the number of unique dictionary values. + // Therefore, unless the dictionary is very large, a fast counting sort + // will be used. + // + // The bottom line is that performance will usually be much better + // (potentially an order of magnitude faster) than by naively decoding + // the original dictionary and sorting the decoded version. + + // TODO special-case all-nulls arrays to avoid ranking and decoding them? + + // FIXME Should be able to use the caller's KernelContext for rank() and take() + + // FIXME Propagate errors instead of aborting + auto ranks = *RanksWithNulls(dict_values); + + auto decoded_ranks = *Take(*ranks, *dict_indices); + + auto rank_sorter = *GetArraySorter(*decoded_ranks->type() /* should be uint64 */); + return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, options); + } + + private: + Result> RanksWithNulls(const std::shared_ptr& array) { + // Notes: + // * The order is always ascending here, since the goal is to produce + // an exactly-equivalent-order of the dictionary values. + // * We're going to re-emit nulls in the output, so we can just always consider + // them "at the end". Note that choosing AtStart would merely shift other + // ranks by 1 if there are any nulls... + RankOptions rank_options(SortOrder::Ascending, NullPlacement::AtEnd, + RankOptions::Dense); + + // XXX Should this support Type::NA? + auto data = array->data(); + std::shared_ptr null_bitmap; + if (array->null_count() > 0) { + null_bitmap = array->null_bitmap(); + data = array->data()->Copy(); + data->buffers[0] = nullptr; + data->null_count = 0; + DCHECK_EQ(data->offset, 0); // FIXME + } + ARROW_ASSIGN_OR_RAISE(auto rank_datum, CallFunction("rank", {array}, &rank_options)); + auto rank_data = rank_datum.array(); + DCHECK_EQ(rank_data->GetNullCount(), 0); + // If there were nulls in the input, paste them in the output + if (null_bitmap) { + rank_data->buffers[0] = std::move(null_bitmap); + rank_data->null_count = array->null_count(); + } + return MakeArray(rank_data); + } +}; + template class ArrayCountSorter { using ArrayType = typename TypeTraits::ArrayType; @@ -405,7 +477,8 @@ struct ArraySorter::value && template struct ArraySorter< Type, enable_if_t::value || is_base_binary_type::value || - is_fixed_size_binary_type::value>> { + is_fixed_size_binary_type::value || + is_dictionary_type::value>> { ArrayCompareSorter impl; }; @@ -508,6 +581,13 @@ void AddArraySortingKernels(VectorKernel base, VectorFunction* func) { DCHECK_OK(func->AddKernel(base)); } +template