diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index 8791eaa07db..676ab87b8f3 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -111,6 +111,10 @@ class ARROW_EXPORT DictionaryArray : public Array { const DictionaryType* dict_type() const { return dict_type_; } + bool IsNull(int64_t i) const { + return indices_->IsNull(i) || dictionary()->IsNull(GetValueIndex(i)); + } + private: void SetData(const std::shared_ptr& data); const DictionaryType* dict_type_; diff --git a/cpp/src/arrow/compute/kernels/vector_array_sort.cc b/cpp/src/arrow/compute/kernels/vector_array_sort.cc index 324a435441f..8d55c645084 100644 --- a/cpp/src/arrow/compute/kernels/vector_array_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_array_sort.cc @@ -173,6 +173,78 @@ class ArrayCompareSorter { } }; +template <> +class ArrayCompareSorter { + public: + NullPartitionResult operator()(uint64_t* indices_begin, uint64_t* indices_end, + const Array& array, int64_t offset, + const ArraySortOptions& options) { + const auto& dict_array = checked_cast(array); + const auto& dict_values = dict_array.dictionary(); + const auto& dict_indices = dict_array.indices(); + + // Algorithm: + // 1) Use the Rank function to get an exactly-equivalent-order array + // of the dictionary values, but with a datatype that's friendlier to + // sorting (uint64). + // 2) Act as if we were sorting a dictionary array with the same indices, + // but with the ranks as dictionary values. + // 2a) Dictionary-decode the ranks by calling Take. + // 2b) Sort the decoded ranks. Not only those are uint64, they are dense + // in a [0, k) range where k is the number of unique dictionary values. + // Therefore, unless the dictionary is very large, a fast counting sort + // will be used. + // + // The bottom line is that performance will usually be much better + // (potentially an order of magnitude faster) than by naively decoding + // the original dictionary and sorting the decoded version. + + // TODO special-case all-nulls arrays to avoid ranking and decoding them? + + // FIXME Should be able to use the caller's KernelContext for rank() and take() + + // FIXME Propagate errors instead of aborting + auto ranks = *RanksWithNulls(dict_values); + + auto decoded_ranks = *Take(*ranks, *dict_indices); + + auto rank_sorter = *GetArraySorter(*decoded_ranks->type() /* should be uint64 */); + return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, options); + } + + private: + Result> RanksWithNulls(const std::shared_ptr& array) { + // Notes: + // * The order is always ascending here, since the goal is to produce + // an exactly-equivalent-order of the dictionary values. + // * We're going to re-emit nulls in the output, so we can just always consider + // them "at the end". Note that choosing AtStart would merely shift other + // ranks by 1 if there are any nulls... + RankOptions rank_options(SortOrder::Ascending, NullPlacement::AtEnd, + RankOptions::Dense); + + // XXX Should this support Type::NA? + auto data = array->data(); + std::shared_ptr null_bitmap; + if (array->null_count() > 0) { + null_bitmap = array->null_bitmap(); + data = array->data()->Copy(); + data->buffers[0] = nullptr; + data->null_count = 0; + DCHECK_EQ(data->offset, 0); // FIXME + } + ARROW_ASSIGN_OR_RAISE(auto rank_datum, CallFunction("rank", {array}, &rank_options)); + auto rank_data = rank_datum.array(); + DCHECK_EQ(rank_data->GetNullCount(), 0); + // If there were nulls in the input, paste them in the output + if (null_bitmap) { + rank_data->buffers[0] = std::move(null_bitmap); + rank_data->null_count = array->null_count(); + } + return MakeArray(rank_data); + } +}; + template class ArrayCountSorter { using ArrayType = typename TypeTraits::ArrayType; @@ -405,7 +477,8 @@ struct ArraySorter::value && template struct ArraySorter< Type, enable_if_t::value || is_base_binary_type::value || - is_fixed_size_binary_type::value>> { + is_fixed_size_binary_type::value || + is_dictionary_type::value>> { ArrayCompareSorter impl; }; @@ -507,6 +580,13 @@ void AddArraySortingKernels(VectorKernel base, VectorFunction* func) { DCHECK_OK(func->AddKernel(base)); } +template