diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 777adee41a5..25d950db93f 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -2177,29 +2177,52 @@ garrow_array_is_in_chunked_array(GArrowArray *left, } /** - * garrow_array_sort_to_indices: + * garrow_array_sort_indices: * @array: A #GArrowArray. + * @order: The order for sort. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: (nullable) (transfer full): The indices that would sort - * an array on success, %NULL on error. + * an array in the specified order on success, %NULL on error. * - * Since: 0.15.0 + * Since: 3.0.0 */ GArrowUInt64Array * -garrow_array_sort_to_indices(GArrowArray *array, - GError **error) +garrow_array_sort_indices(GArrowArray *array, + GArrowSortOrder order, + GError **error) { auto arrow_array = garrow_array_get_raw(array); auto arrow_array_raw = arrow_array.get(); - auto arrow_indices_array = arrow::compute::SortToIndices(*arrow_array_raw); - if (garrow::check(error, arrow_indices_array, "[array][sort-to-indices]")) { + auto arrow_order = static_cast(order); + auto arrow_indices_array = + arrow::compute::SortIndices(*arrow_array_raw, arrow_order); + if (garrow::check(error, arrow_indices_array, "[array][sort-indices]")) { return GARROW_UINT64_ARRAY(garrow_array_new_raw(&(*arrow_indices_array))); } else { return NULL; } } +/** + * garrow_array_sort_to_indices: + * @array: A #GArrowArray. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): The indices that would sort + * an array in ascending order on success, %NULL on error. + * + * Since: 0.15.0 + * + * Deprecated: 3.0.0: Use garrow_array_sort_indices() instead. + */ +GArrowUInt64Array * +garrow_array_sort_to_indices(GArrowArray *array, + GError **error) +{ + return garrow_array_sort_indices(array, GARROW_SORT_ORDER_ASCENDING, error); +} + /** * garrow_table_filter: * @table: A #GArrowTable. diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 48fdc3a1c1e..2e304d6e5a6 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -88,7 +88,7 @@ GArrowCastOptions *garrow_cast_options_new(void); * @GARROW_COUNT_ALL: Count all non-null values. * @GARROW_COUNT_NULL: Count all null values. * - * They are corresponding to `arrow::compute::CountOptions::mode` values. + * They are corresponding to `arrow::compute::CountOptions::Mode` values. */ typedef enum { GARROW_COUNT_ALL, @@ -377,10 +377,32 @@ GArrowBooleanArray * garrow_array_is_in_chunked_array(GArrowArray *left, GArrowChunkedArray *right, GError **error); + +/** + * GArrowSortOrder: + * @GARROW_SORT_ORDER_ASCENDING: Sort in ascending order. + * @GARROW_SORT_ORDER_DESCENDING: Sort in descending order. + * + * They are corresponding to `arrow::compute::SortOrder` values. + * + * Since: 3.0.0 + */ +typedef enum { + GARROW_SORT_ORDER_ASCENDING, + GARROW_SORT_ORDER_DESCENDING, +} GArrowSortOrder; + +GARROW_AVAILABLE_IN_3_0 +GArrowUInt64Array * +garrow_array_sort_indices(GArrowArray *array, + GArrowSortOrder order, + GError **error); +GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_sort_indices) GARROW_AVAILABLE_IN_0_15 GArrowUInt64Array * garrow_array_sort_to_indices(GArrowArray *array, GError **error); + GARROW_AVAILABLE_IN_0_16 GArrowTable * garrow_table_filter(GArrowTable *table, diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 0d2069fa626..4ad59afd1e3 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_3_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 3.0.0 + */ +#define GARROW_VERSION_3_0 G_ENCODE_VERSION(3, 0) + /** * GARROW_VERSION_2_0: * @@ -229,6 +238,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_3_0 +# define GARROW_DEPRECATED_IN_3_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_3_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_3_0 +# define GARROW_DEPRECATED_IN_3_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_3_0 +# define GARROW_AVAILABLE_IN_3_0 GARROW_UNAVAILABLE(3, 0) +#else +# define GARROW_AVAILABLE_IN_3_0 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_2_0 # define GARROW_DEPRECATED_IN_2_0 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_2_0_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 72a01f50e4f..bbb858eef7d 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -179,6 +179,10 @@ Index of deprecated API + + Index of new symbols in 3.0.0 + + Index of new symbols in 2.0.0 diff --git a/c_glib/test/test-sort-to-indices.rb b/c_glib/test/test-sort-indices.rb similarity index 85% rename from c_glib/test/test-sort-to-indices.rb rename to c_glib/test/test-sort-indices.rb index 355efd7e1d9..d7745c75f25 100644 --- a/c_glib/test/test-sort-to-indices.rb +++ b/c_glib/test/test-sort-indices.rb @@ -15,20 +15,20 @@ # specific language governing permissions and limitations # under the License. -class TestSortToIndices < Test::Unit::TestCase +class TestSortIndices < Test::Unit::TestCase include Helper::Buildable sub_test_case("Integer") do def test_no_null array = build_int16_array([1, 0, 4, -3]) assert_equal(build_uint64_array([3, 1, 0, 2]), - array.sort_to_indices) + array.sort_indices(:ascending)) end def test_null array = build_int16_array([nil, 1, 0, nil, 4, 3]) assert_equal(build_uint64_array([2, 1, 5, 4, 0, 3]), - array.sort_to_indices) + array.sort_indices(:ascending)) end end @@ -36,13 +36,13 @@ def test_null def test_no_null array = build_string_array(["hello", "world", "a", "z"]) assert_equal(build_uint64_array([2, 0, 1, 3]), - array.sort_to_indices) + array.sort_indices(:ascending)) end def test_null array = build_string_array([nil, "b", "a", nil, "c", "d"]) assert_equal(build_uint64_array([2, 1, 4, 5, 0, 3]), - array.sort_to_indices) + array.sort_indices(:ascending)) end end end diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml index 90cef3ea2d1..534ede6df1c 100644 --- a/ci/conda_env_cpp.yml +++ b/ci/conda_env_cpp.yml @@ -16,7 +16,7 @@ # under the License. aws-sdk-cpp -benchmark=1.4.1 +benchmark=1.5.2 boost-cpp>=1.68.0 brotli bzip2 diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 22531fcfc57..131e8ad6caa 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -184,9 +184,9 @@ macro(resolve_dependency DEPENDENCY_NAME) if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") if(ARG_REQUIRED_VERSION) - find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION} MODULE) + find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION}) else() - find_package(${DEPENDENCY_NAME} MODULE) + find_package(${DEPENDENCY_NAME}) endif() if(${${DEPENDENCY_NAME}_FOUND}) set(${DEPENDENCY_NAME}_SOURCE "SYSTEM") @@ -1797,7 +1797,21 @@ macro(build_benchmark) endmacro() if(ARROW_BUILD_BENCHMARKS) - resolve_dependency(benchmark) + # ArgsProduct() is available since 1.5.2 + set(BENCHMARK_REQUIRED_VERSION 1.5.2) + if("${ARROW_DEPENDENCY_SOURCE}" STREQUAL "CONDA" + AND "${benchmark_SOURCE}" STREQUAL "SYSTEM") + # TODO: Remove this workaround once + # https://github.com/google/benchmark/issues/1046 is resolved. + # + # benchmark doesn't set suitable version when we use released + # archive. So the benchmark package on conda-forge isn't report + # the real version. We accept all the benchmark package with + # conda. Conda users should install benchmark 1.5.2 or later by + # ci/conda_env_cpp.yml. + set(BENCHMARK_REQUIRED_VERSION 0.0.0) + endif() + resolve_dependency(benchmark REQUIRED_VERSION ${BENCHMARK_REQUIRED_VERSION}) # TODO: Don't use global includes but rather target_include_directories get_target_property(BENCHMARK_INCLUDE_DIR benchmark::benchmark INTERFACE_INCLUDE_DIRECTORIES) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 1f6972860ae..5eac72c9912 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -46,8 +46,26 @@ Result> NthToIndices(const Array& values, int64_t n, return result.make_array(); } -Result> SortToIndices(const Array& values, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("sort_indices", {Datum(values)}, ctx)); +Result> SortIndices(const Array& values, SortOrder order, + ExecContext* ctx) { + ArraySortOptions options(order); + ARROW_ASSIGN_OR_RAISE( + Datum result, CallFunction("array_sort_indices", {Datum(values)}, &options, ctx)); + return result.make_array(); +} + +Result> SortIndices(const ChunkedArray& chunked_array, + SortOrder order, ExecContext* ctx) { + SortOptions options({SortKey("not-used", order)}); + ARROW_ASSIGN_OR_RAISE( + Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx)); + return result.make_array(); +} + +Result> SortIndices(const Table& table, const SortOptions& options, + ExecContext* ctx) { + ARROW_ASSIGN_OR_RAISE(Datum result, + CallFunction("sort_indices", {Datum(table)}, &options, ctx)); return result.make_array(); } @@ -135,5 +153,9 @@ Result> Take(const Table& table, const ChunkedArray& indi return result.table(); } +Result> SortToIndices(const Array& values, ExecContext* ctx) { + return SortIndices(values, SortOrder::Ascending, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 2c77e8ee155..0ed3f4d36ff 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -58,6 +58,34 @@ struct ARROW_EXPORT TakeOptions : public FunctionOptions { static TakeOptions Defaults() { return BoundsCheck(); } }; +enum class SortOrder { + Ascending, + Descending, +}; + +/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices +struct ARROW_EXPORT SortKey { + explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending) + : name(name), order(order) {} + + /// The name of the sort column. + std::string name; + /// How to order by this sort key. + SortOrder order; +}; + +struct ARROW_EXPORT ArraySortOptions : public FunctionOptions { + explicit ArraySortOptions(SortOrder order = SortOrder::Ascending) : order(order) {} + + SortOrder order; +}; + +struct ARROW_EXPORT SortOptions : public FunctionOptions { + explicit SortOptions(std::vector sort_keys = {}) : sort_keys(sort_keys) {} + + std::vector sort_keys; +}; + /// \brief Partitioning options for NthToIndices struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions { explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {} @@ -152,21 +180,71 @@ ARROW_EXPORT Result> NthToIndices(const Array& values, int64_t n, ExecContext* ctx = NULLPTR); -/// \brief Returns the indices that would sort an array. +/// \brief Returns the indices that would sort an array in the +/// specified order. /// /// Perform an indirect sort of array. The output array will contain /// indices that would sort an array, which would be the same length -/// as input. Nulls will be stably partitioned to the end of the output. +/// as input. Nulls will be stably partitioned to the end of the output +/// regardless of order. /// -/// For example given values = [null, 1, 3.3, null, 2, 5.3], the output -/// will be [1, 4, 2, 5, 0, 3] +/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order +/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0, +/// 3]. /// -/// \param[in] values array to sort +/// \param[in] array array to sort +/// \param[in] order ascending or descending /// \param[in] ctx the function execution context, optional /// \return offsets indices that would sort an array ARROW_EXPORT -Result> SortToIndices(const Array& values, - ExecContext* ctx = NULLPTR); +Result> SortIndices(const Array& array, + SortOrder order = SortOrder::Ascending, + ExecContext* ctx = NULLPTR); + +/// \brief Returns the indices that would sort a chunked array in the +/// specified order. +/// +/// Perform an indirect sort of chunked array. The output array will +/// contain indices that would sort a chunked array, which would be +/// the same length as input. Nulls will be stably partitioned to the +/// end of the output regardless of order. +/// +/// For example given chunked_array = [[null, 1], [3.3], [null, 2, +/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2, +/// 4, 1, 0, 3]. +/// +/// \param[in] chunked_array chunked array to sort +/// \param[in] order ascending or descending +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort an array +ARROW_EXPORT +Result> SortIndices(const ChunkedArray& chunked_array, + SortOrder order = SortOrder::Ascending, + ExecContext* ctx = NULLPTR); + +/// \brief Returns the indices that would sort a table in the +/// specified order. +/// +/// Perform an indirect sort of table. The output array will contain +/// indices that would sort a table, which would be the same length as +/// input. Nulls will be stably partitioned to the end of the output +/// regardless of order. +/// +/// For example given table = { +/// "column1": [[null, 1], [ 3, null, 2, 1]], +/// "column2": [[ 5], [3, null, null, 5, 5]], +/// } and options = { +/// {"column1", SortOrder::Ascending}, +/// {"column2", SortOrder::Descending}, +/// }, the output will be [5, 1, 4, 2, 0, 3]. +/// +/// \param[in] table table to sort +/// \param[in] options options +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort a table +ARROW_EXPORT +Result> SortIndices(const Table& table, const SortOptions& options, + ExecContext* ctx = NULLPTR); /// \brief Compute unique elements from an array-like object /// @@ -254,5 +332,10 @@ Result> Take(const Table& table, const ChunkedArray& indi const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR); +ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()") +ARROW_EXPORT +Result> SortToIndices(const Array& values, + ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index c595736912d..7b181eaec6e 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -142,7 +142,7 @@ struct TakeBenchmark { indices_null_proportion); if (monotonic_indices) { - auto arg_sorter = *SortToIndices(*indices); + auto arg_sorter = *SortIndices(*indices); indices = *Take(*indices, *arg_sorter); } diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 989c75757f6..9f7788a25de 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -19,19 +19,61 @@ #include #include #include +#include #include #include "arrow/array/data.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common.h" +#include "arrow/table.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/optional.h" namespace arrow { + +using internal::checked_cast; + namespace compute { namespace internal { namespace { +// The target chunk in chunked array. +template +struct ResolvedChunk { + using ViewType = decltype(std::declval().GetView(0)); + + // The target array in chunked array. + const ArrayType* array; + // The index in the target array. + const int64_t index; + + ResolvedChunk(const ArrayType* array, int64_t index) : array(array), index(index) {} + + bool IsNull() const { return array->IsNull(index); } + + ViewType GetView() const { return array->GetView(index); } +}; + +// Finds the target chunk and index in the target chunk from an index +// in chunked array. `chunks` is not shared array of +// ChunkedArray::chunks() for performance. +template +ResolvedChunk ResolveChunk(const std::vector& chunks, + int64_t index) { + const auto num_chunks = chunks.size(); + int64_t offset = 0; + for (size_t i = 0; i < num_chunks; ++i) { + if (index < offset + chunks[i]->length()) { + return ResolvedChunk(checked_cast(chunks[i]), + index - offset); + } + offset += chunks[i]->length(); + } + // Never reach here. `index` must be validated in caller. + return ResolvedChunk(nullptr, 0); +} + // NOTE: std::partition is usually faster than std::stable_partition. struct NonStablePartitioner { @@ -49,37 +91,87 @@ struct StablePartitioner { } }; -// Move Nulls to end of array. Return where Null starts. +// Move nulls to end of array. Return where null starts. +// +// `offset` is used when this is called on a chunk of a chunked array template enable_if_t::value, uint64_t*> -PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values) { +PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values, + int64_t offset) { if (values.null_count() == 0) { return indices_end; } Partitioner partitioner; - return partitioner(indices_begin, indices_end, - [&values](uint64_t ind) { return !values.IsNull(ind); }); + return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) { + return !values.IsNull(ind - offset); + }); +} + +// For chunked array. +template +enable_if_t::value, uint64_t*> +PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end, + const std::vector& arrays, int64_t null_count) { + if (null_count == 0) { + return indices_end; + } + Partitioner partitioner; + return partitioner(indices_begin, indices_end, [&arrays](uint64_t ind) { + const auto chunk = ResolveChunk(arrays, ind); + return !chunk.IsNull(); + }); } -// Move NaNs and Nulls to end of array, Nulls after NaN. -// Return where NaN/Null starts. +// Move NaNs and nulls to end of array, nulls after NaN. Return where +// NaN/null starts. +// +// `offset` is used when this is called on a chunk of a chunked array template enable_if_t::value, uint64_t*> -PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values) { +PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values, + int64_t offset) { Partitioner partitioner; if (values.null_count() == 0) { - return partitioner(indices_begin, indices_end, [&values](uint64_t ind) { - return !std::isnan(values.GetView(ind)); + return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) { + return !std::isnan(values.GetView(ind - offset)); }); } uint64_t* nulls_begin = - partitioner(indices_begin, indices_end, [&values](uint64_t ind) { - return !values.IsNull(ind) && !std::isnan(values.GetView(ind)); + partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) { + return !values.IsNull(ind - offset) && !std::isnan(values.GetView(ind - offset)); }); - // move Nulls after NaN + // move nulls after NaN if (values.null_count() < static_cast(indices_end - nulls_begin)) { - partitioner(nulls_begin, indices_end, - [&values](uint64_t ind) { return !values.IsNull(ind); }); + partitioner(nulls_begin, indices_end, [&values, &offset](uint64_t ind) { + return !values.IsNull(ind - offset); + }); + } + return nulls_begin; +} + +// For chunked array. +template +enable_if_t::value, uint64_t*> +PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end, + const std::vector& arrays, int64_t null_count) { + Partitioner partitioner; + if (null_count == 0) { + return partitioner(indices_begin, indices_end, [&arrays](uint64_t ind) { + const auto chunk = ResolveChunk(arrays, ind); + return !std::isnan(chunk.GetView()); + }); + } + uint64_t* nulls_begin = + partitioner(indices_begin, indices_end, [&arrays](uint64_t ind) { + const auto chunk = ResolveChunk(arrays, ind); + return !chunk.IsNull() && !std::isnan(chunk.GetView()); + }); + // move nulls after NaN + if (null_count < static_cast(indices_end - nulls_begin)) { + partitioner(nulls_begin, indices_end, [&arrays](uint64_t ind) { + const auto chunk = ResolveChunk(arrays, ind); + return !chunk.IsNull(); + }); } return nulls_begin; } @@ -130,7 +222,7 @@ struct PartitionNthToIndices { return; } auto nulls_begin = - PartitionNulls(out_begin, out_end, arr); + PartitionNulls(out_begin, out_end, arr, 0); auto nth_begin = out_begin + pivot; if (nth_begin < nulls_begin) { std::nth_element(out_begin, nth_begin, nulls_begin, @@ -164,30 +256,43 @@ inline void VisitRawValuesInline(const ArrayType& values, } template -class CompareSorter { +class ArrayCompareSorter { using ArrayType = typename TypeTraits::ArrayType; public: - void Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values) { - std::iota(indices_begin, indices_end, 0); - auto nulls_begin = - PartitionNulls(indices_begin, indices_end, values); - std::stable_sort(indices_begin, nulls_begin, - [&values](uint64_t left, uint64_t right) { - return values.GetView(left) < values.GetView(right); - }); + // Returns where null starts. + // + // `offset` is used when this is called on a chunk of a chunked array + uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values, + int64_t offset, const ArraySortOptions& options) { + auto nulls_begin = PartitionNulls( + indices_begin, indices_end, values, offset); + if (options.order == SortOrder::Ascending) { + std::stable_sort( + indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) { + return values.GetView(left - offset) < values.GetView(right - offset); + }); + } else { + std::stable_sort( + indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) { + // We don't use 'left > right' here to reduce required operator. + // If we use 'right < left' here, '<' is only required. + return values.GetView(right - offset) < values.GetView(left - offset); + }); + } + return nulls_begin; } }; template -class CountSorter { +class ArrayCountSorter { using ArrayType = typename TypeTraits::ArrayType; using c_type = typename ArrowType::c_type; public: - CountSorter() = default; + ArrayCountSorter() = default; - explicit CountSorter(c_type min, c_type max) { SetMinMax(min, max); } + explicit ArrayCountSorter(c_type min, c_type max) { SetMinMax(min, max); } // Assume: max >= min && (max - min) < 4Gi void SetMinMax(c_type min, c_type max) { @@ -195,12 +300,14 @@ class CountSorter { value_range_ = static_cast(max - min) + 1; } - void Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values) { + // Returns where null starts. + uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values, + int64_t offset, const ArraySortOptions& options) { // 32bit counter performs much better than 64bit one if (values.length() < (1LL << 32)) { - SortInternal(indices_begin, indices_end, values); + return SortInternal(indices_begin, indices_end, values, offset, options); } else { - SortInternal(indices_begin, indices_end, values); + return SortInternal(indices_begin, indices_end, values, offset, options); } } @@ -208,25 +315,45 @@ class CountSorter { c_type min_{0}; uint32_t value_range_{0}; + // Returns where null starts. + // + // `offset` is used when this is called on a chunk of a chunked array template - void SortInternal(uint64_t* indices_begin, uint64_t* indices_end, - const ArrayType& values) { + uint64_t* SortInternal(uint64_t* indices_begin, uint64_t* indices_end, + const ArrayType& values, int64_t offset, + const ArraySortOptions& options) { const uint32_t value_range = value_range_; // first slot reserved for prefix sum std::vector counts(1 + value_range); - VisitRawValuesInline( - values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {}); - - for (uint32_t i = 1; i <= value_range; ++i) { - counts[i] += counts[i - 1]; + if (options.order == SortOrder::Ascending) { + VisitRawValuesInline( + values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {}); + for (uint32_t i = 1; i <= value_range; ++i) { + counts[i] += counts[i - 1]; + } + auto null_position = counts[value_range]; + auto nulls_begin = indices_begin + null_position; + int64_t index = offset; + VisitRawValuesInline( + values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; }, + [&]() { indices_begin[null_position++] = index++; }); + return nulls_begin; + } else { + VisitRawValuesInline( + values, [&](c_type v) { ++counts[v - min_]; }, []() {}); + for (uint32_t i = value_range; i >= 1; --i) { + counts[i - 1] += counts[i]; + } + auto null_position = counts[0]; + auto nulls_begin = indices_begin + null_position; + int64_t index = offset; + VisitRawValuesInline( + values, [&](c_type v) { indices_begin[counts[v - min_ + 1]++] = index++; }, + [&]() { indices_begin[null_position++] = index++; }); + return nulls_begin; } - - int64_t index = 0; - VisitRawValuesInline( - values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; }, - [&]() { indices_begin[counts[value_range]++] = index++; }); } }; @@ -234,12 +361,16 @@ class CountSorter { // - Use O(n) counting sort if values are in a small range // - Use O(nlogn) std::stable_sort otherwise template -class CountOrCompareSorter { +class ArrayCountOrCompareSorter { using ArrayType = typename TypeTraits::ArrayType; using c_type = typename ArrowType::c_type; public: - void Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values) { + // Returns where null starts. + // + // `offset` is used when this is called on a chunk of a chunked array + uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values, + int64_t offset, const ArraySortOptions& options) { if (values.length() >= countsort_min_len_ && values.length() > values.null_count()) { c_type min{std::numeric_limits::max()}; c_type max{std::numeric_limits::min()}; @@ -257,17 +388,16 @@ class CountOrCompareSorter { if (static_cast(max) - static_cast(min) <= countsort_max_range_) { count_sorter_.SetMinMax(min, max); - count_sorter_.Sort(indices_begin, indices_end, values); - return; + return count_sorter_.Sort(indices_begin, indices_end, values, offset, options); } } - compare_sorter_.Sort(indices_begin, indices_end, values); + return compare_sorter_.Sort(indices_begin, indices_end, values, offset, options); } private: - CompareSorter compare_sorter_; - CountSorter count_sorter_; + ArrayCompareSorter compare_sorter_; + ArrayCountSorter count_sorter_; // Cross point to prefer counting sort than stl::stable_sort(merge sort) // - array to be sorted is longer than "count_min_len_" @@ -283,36 +413,40 @@ class CountOrCompareSorter { }; template -struct Sorter; +struct ArraySorter; template <> -struct Sorter { - CountSorter impl; - Sorter() : impl(0, 255) {} +struct ArraySorter { + ArrayCountSorter impl; + ArraySorter() : impl(0, 255) {} }; template <> -struct Sorter { - CountSorter impl; - Sorter() : impl(-128, 127) {} +struct ArraySorter { + ArrayCountSorter impl; + ArraySorter() : impl(-128, 127) {} }; template -struct Sorter::value && - (sizeof(typename Type::c_type) > 1)>> { - CountOrCompareSorter impl; +struct ArraySorter::value && + (sizeof(typename Type::c_type) > 1)>> { + ArrayCountOrCompareSorter impl; }; template -struct Sorter::value || - is_base_binary_type::value>> { - CompareSorter impl; +struct ArraySorter::value || + is_base_binary_type::value>> { + ArrayCompareSorter impl; }; +using ArraySortIndicesState = internal::OptionsWrapper; + template -struct SortIndices { +struct ArraySortIndices { using ArrayType = typename TypeTraits::ArrayType; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& options = ArraySortIndicesState::Get(ctx); + std::shared_ptr arg0; KERNEL_RETURN_IF_ERROR( ctx, @@ -321,9 +455,10 @@ struct SortIndices { ArrayData* out_arr = out->mutable_array(); uint64_t* out_begin = out_arr->GetMutableValues(1); uint64_t* out_end = out_begin + arr.length(); + std::iota(out_begin, out_end, 0); - Sorter sorter; - sorter.impl.Sort(out_begin, out_end, arr); + ArraySorter sorter; + sorter.impl.Sort(out_begin, out_end, arr, 0, options); } }; @@ -346,14 +481,652 @@ void AddSortingKernels(VectorKernel base, VectorFunction* func) { } } +// Sort a chunked array directly without sorting each array in the +// chunked array. This is used for processing the second and following +// sort keys in TableRadixSorter. +// +// This uses the same algorithm as ArrayCompareSorter. +template +class ChunkedArrayCompareSorter { + using ArrayType = typename TypeTraits::ArrayType; + + public: + // Returns where null starts. + uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, + const std::vector& arrays, int64_t null_count, + const ArraySortOptions& options) { + auto nulls_begin = PartitionNulls( + indices_begin, indices_end, arrays, null_count); + if (options.order == SortOrder::Ascending) { + std::stable_sort(indices_begin, nulls_begin, + [&arrays](uint64_t left, uint64_t right) { + const auto chunk_left = ResolveChunk(arrays, left); + const auto chunk_right = ResolveChunk(arrays, right); + return chunk_left.GetView() < chunk_right.GetView(); + }); + } else { + std::stable_sort(indices_begin, nulls_begin, + [&arrays](uint64_t left, uint64_t right) { + const auto chunk_left = ResolveChunk(arrays, left); + const auto chunk_right = ResolveChunk(arrays, right); + // We don't use 'left > right' here to reduce required operator. + // If we use 'right < left' here, '<' is only required. + return chunk_right.GetView() < chunk_left.GetView(); + }); + } + return nulls_begin; + } +}; + +// Sort a chunked array by sorting each array in the chunked array. +// +// TODO: This is a naive implementation. We'll be able to improve +// performance of this. For example, we'll be able to use threads for +// sorting each array. +class ChunkedArraySorter : public TypeVisitor { + public: + ChunkedArraySorter(uint64_t* indices_begin, uint64_t* indices_end, + const ChunkedArray& chunked_array, const SortOrder order, + bool can_use_array_sorter = true) + : TypeVisitor(), + indices_begin_(indices_begin), + indices_end_(indices_end), + chunked_array_(chunked_array), + order_(order), + can_use_array_sorter_(can_use_array_sorter) {} + + Status Sort() { return chunked_array_.type()->Accept(this); } + +#define VISIT(TYPE) \ + Status Visit(const TYPE##Type& type) override { return SortInternal(); } + + VISIT(Int8) + VISIT(Int16) + VISIT(Int32) + VISIT(Int64) + VISIT(UInt8) + VISIT(UInt16) + VISIT(UInt32) + VISIT(UInt64) + VISIT(Float) + VISIT(Double) + VISIT(String) + VISIT(Binary) + VISIT(LargeString) + VISIT(LargeBinary) + +#undef VISIT + + private: + template + Status SortInternal() { + using ArrayType = typename TypeTraits::ArrayType; + ArraySortOptions options(order_); + const auto num_chunks = chunked_array_.num_chunks(); + const auto& shared_arrays = chunked_array_.chunks(); + std::vector arrays(num_chunks); + for (int i = 0; i < num_chunks; ++i) { + const auto& array = shared_arrays[i]; + arrays[i] = array.get(); + } + if (can_use_array_sorter_) { + // Sort each chunk from the beginning and merge to sorted indices. + // This is a naive implementation. + ArraySorter sorter; + int64_t begin_offset = 0; + int64_t end_offset = 0; + int64_t null_count = 0; + uint64_t* left_nulls_begin = indices_begin_; + for (int i = 0; i < num_chunks; ++i) { + const auto array = checked_cast(arrays[i]); + end_offset += array->length(); + null_count += array->null_count(); + uint64_t* right_nulls_begin; + right_nulls_begin = + sorter.impl.Sort(indices_begin_ + begin_offset, indices_begin_ + end_offset, + *array, begin_offset, options); + if (i > 0) { + left_nulls_begin = Merge( + indices_begin_, indices_begin_ + begin_offset, indices_begin_ + end_offset, + left_nulls_begin, right_nulls_begin, arrays, null_count, order_); + } else { + left_nulls_begin = right_nulls_begin; + } + begin_offset = end_offset; + } + } else { + // Sort the chunked array directory. + ChunkedArrayCompareSorter sorter; + sorter.Sort(indices_begin_, indices_end_, arrays, chunked_array_.null_count(), + options); + } + return Status::OK(); + } + + // Merges two sorted indices arrays and returns where nulls starts. + // Where nulls starts is used when the next merge to detect the + // sorted indices locations. + template + uint64_t* Merge(uint64_t* indices_begin, uint64_t* indices_middle, + uint64_t* indices_end, uint64_t* left_nulls_begin, + uint64_t* right_nulls_begin, const std::vector& arrays, + int64_t null_count, const SortOrder order) { + auto left_num_non_nulls = left_nulls_begin - indices_begin; + auto right_num_non_nulls = right_nulls_begin - indices_middle; + auto nulls_begin = PartitionNulls( + indices_begin, indices_end, arrays, null_count); + indices_middle = indices_begin + left_num_non_nulls; + indices_end = indices_middle + right_num_non_nulls; + if (order == SortOrder::Ascending) { + std::inplace_merge(indices_begin, indices_middle, indices_end, + [&arrays](uint64_t left, uint64_t right) { + const auto chunk_left = ResolveChunk(arrays, left); + const auto chunk_right = + ResolveChunk(arrays, right); + return chunk_left.GetView() < chunk_right.GetView(); + }); + } else { + std::inplace_merge(indices_begin, indices_middle, indices_end, + [&arrays](uint64_t left, uint64_t right) { + const auto chunk_left = ResolveChunk(arrays, left); + const auto chunk_right = + ResolveChunk(arrays, right); + // We don't use 'left > right' here to reduce required + // operator. If we use 'right < left' here, '<' is only + // required. + return chunk_right.GetView() < chunk_left.GetView(); + }); + } + return nulls_begin; + } + + uint64_t* indices_begin_; + uint64_t* indices_end_; + const ChunkedArray& chunked_array_; + const SortOrder order_; + const bool can_use_array_sorter_; +}; + +// Sort a table using a radix sort-like algorithm. +// A distinct stable sort is called for each sort key, from the last key to the first. +class TableRadixSorter { + public: + Status Sort(uint64_t* indices_begin, uint64_t* indices_end, const Table& table, + const SortOptions& options) { + for (auto i = options.sort_keys.size(); i > 0; --i) { + const auto& sort_key = options.sort_keys[i - 1]; + const auto& chunked_array = table.GetColumnByName(sort_key.name); + if (!chunked_array) { + return Status::Invalid("Nonexistent sort key column: ", sort_key.name); + } + // We can use ArraySorter only for the sort key that is + // processed first because ArraySorter doesn't care about + // existing indices. + const auto can_use_array_sorter = (i == 0); + ChunkedArraySorter sorter(indices_begin, indices_end, *chunked_array.get(), + sort_key.order, can_use_array_sorter); + ARROW_RETURN_NOT_OK(sorter.Sort()); + } + return Status::OK(); + } +}; + +// Sort a table using a single sort and multiple-key comparisons. +class MultipleKeyTableSorter : public TypeVisitor { + private: + // Preprocessed sort key. + struct ResolvedSortKey { + ResolvedSortKey(const ChunkedArray& chunked_array, const SortOrder order) + : order(order) { + type = chunked_array.type().get(); + null_count = chunked_array.null_count(); + num_chunks = chunked_array.num_chunks(); + for (const auto& chunk : chunked_array.chunks()) { + chunks.push_back(chunk.get()); + } + } + + // Finds the target chunk and index in the target chunk from an + // index in chunked array. + template + ResolvedChunk GetChunk(int64_t index) const { + return ResolveChunk(chunks, index); + } + + SortOrder order; + DataType* type; + int64_t null_count; + int num_chunks; + std::vector chunks; + }; + + // Compare two records in the same table. + class Comparer : public TypeVisitor { + public: + Comparer(const Table& table, const std::vector& sort_keys) + : TypeVisitor(), + status_(Status::OK()), + sort_keys_(ResolveSortKeys(table, sort_keys, &status_)) {} + + Status status() { return status_; } + + const std::vector& sort_keys() { return sort_keys_; } + + // Returns true if the left-th value should be ordered before the + // right-th value, false otherwise. The start_sort_key_index-th + // sort key and subsequent sort keys are used for comparison. + bool Compare(uint64_t left, uint64_t right, size_t start_sort_key_index) { + current_left_ = left; + current_right_ = right; + current_compared_ = 0; + auto num_sort_keys = sort_keys_.size(); + for (size_t i = start_sort_key_index; i < num_sort_keys; ++i) { + current_sort_key_index_ = i; + status_ = sort_keys_[i].type->Accept(this); + // If the left value equals to the right value, we need to + // continue to sort. + if (current_compared_ != 0) { + break; + } + } + return current_compared_ < 0; + } + +#define VISIT(TYPE) \ + Status Visit(const TYPE##Type& type) override { \ + current_compared_ = CompareType(); \ + return Status::OK(); \ + } + + VISIT(Int8) + VISIT(Int16) + VISIT(Int32) + VISIT(Int64) + VISIT(UInt8) + VISIT(UInt16) + VISIT(UInt32) + VISIT(UInt64) + VISIT(Float) + VISIT(Double) + VISIT(String) + VISIT(Binary) + VISIT(LargeString) + VISIT(LargeBinary) + +#undef VISIT + + private: + // Compares two records in the same table and returns -1, 0 or 1. + // + // -1: The left is less than the right. + // 0: The left equals to the right. + // 1: The left is greater than the right. + // + // This supports null and NaN. Null is processed in this and NaN + // is processed in CompareTypeValue(). + template + int32_t CompareType() { + using ArrayType = typename TypeTraits::ArrayType; + const auto& sort_key = sort_keys_[current_sort_key_index_]; + auto order = sort_key.order; + const auto chunk_left = sort_key.GetChunk(current_left_); + const auto chunk_right = sort_key.GetChunk(current_right_); + if (sort_key.null_count > 0) { + auto is_null_left = chunk_left.IsNull(); + auto is_null_right = chunk_right.IsNull(); + if (is_null_left && is_null_right) { + return 0; + } else if (is_null_left) { + return 1; + } else if (is_null_right) { + return -1; + } + } + return CompareTypeValue(chunk_left, chunk_right, order); + } + + // For non-float types. Value is never NaN. + template + enable_if_t::value, int32_t> CompareTypeValue( + const ResolvedChunk::ArrayType>& chunk_left, + const ResolvedChunk::ArrayType>& chunk_right, + const SortOrder order) { + const auto left = chunk_left.GetView(); + const auto right = chunk_right.GetView(); + int32_t compared; + if (left == right) { + compared = 0; + } else if (left > right) { + compared = 1; + } else { + compared = -1; + } + if (order == SortOrder::Descending) { + compared = -compared; + } + return compared; + } + + // For float types. Value may be NaN. + template + enable_if_t::value, int32_t> CompareTypeValue( + const ResolvedChunk::ArrayType>& chunk_left, + const ResolvedChunk::ArrayType>& chunk_right, + const SortOrder order) { + const auto left = chunk_left.GetView(); + const auto right = chunk_right.GetView(); + auto is_nan_left = std::isnan(left); + auto is_nan_right = std::isnan(right); + if (is_nan_left && is_nan_right) { + return 0; + } else if (is_nan_left) { + return 1; + } else if (is_nan_right) { + return -1; + } + int32_t compared; + if (left == right) { + compared = 0; + } else if (left > right) { + compared = 1; + } else { + compared = -1; + } + if (order == SortOrder::Descending) { + compared = -compared; + } + return compared; + } + + static std::vector ResolveSortKeys( + const Table& table, const std::vector& sort_keys, Status* status) { + std::vector resolved; + for (const auto& sort_key : sort_keys) { + const auto& chunked_array = table.GetColumnByName(sort_key.name); + if (!chunked_array) { + *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name); + break; + } + resolved.emplace_back(*chunked_array, sort_key.order); + } + return resolved; + } + + Status status_; + const std::vector sort_keys_; + int64_t current_left_; + int64_t current_right_; + size_t current_sort_key_index_; + int32_t current_compared_; + }; + + public: + MultipleKeyTableSorter(uint64_t* indices_begin, uint64_t* indices_end, + const Table& table, const SortOptions& options) + : indices_begin_(indices_begin), + indices_end_(indices_end), + comparer_(table, options.sort_keys) {} + + // This is optimized for the first sort key. The first sort key sort + // is processed in this class. The second and following sort keys + // are processed in Comparer. + Status Sort() { + ARROW_RETURN_NOT_OK(comparer_.status()); + return comparer_.sort_keys()[0].type->Accept(this); + } + +#define VISIT(TYPE) \ + Status Visit(const TYPE##Type& type) override { return SortInternal(); } + + VISIT(Int8) + VISIT(Int16) + VISIT(Int32) + VISIT(Int64) + VISIT(UInt8) + VISIT(UInt16) + VISIT(UInt32) + VISIT(UInt64) + VISIT(Float) + VISIT(Double) + VISIT(String) + VISIT(Binary) + VISIT(LargeString) + VISIT(LargeBinary) + +#undef VISIT + + private: + template + Status SortInternal() { + using ArrayType = typename TypeTraits::ArrayType; + + auto& comparer = comparer_; + const auto& first_sort_key = comparer.sort_keys()[0]; + auto nulls_begin = indices_end_; + nulls_begin = PartitionNullsInternal(first_sort_key); + std::stable_sort(indices_begin_, nulls_begin, + [&first_sort_key, &comparer](uint64_t left, uint64_t right) { + // Both values are never null nor NaN. + auto chunk_left = first_sort_key.GetChunk(left); + auto chunk_right = first_sort_key.GetChunk(right); + auto value_left = chunk_left.GetView(); + auto value_right = chunk_right.GetView(); + if (value_left == value_right) { + // If the left value equals to the right value, + // we need to compare the second and following + // sort keys. + return comparer.Compare(left, right, 1); + } else { + auto compared = value_left < value_right; + if (first_sort_key.order == SortOrder::Ascending) { + return compared; + } else { + return !compared; + } + } + }); + return Status::OK(); + } + + // Behaves like PatitionNulls() but this supports multiple sort keys. + // + // For non-float types. + template + enable_if_t::value, uint64_t*> PartitionNullsInternal( + const ResolvedSortKey& first_sort_key) { + using ArrayType = typename TypeTraits::ArrayType; + if (first_sort_key.null_count == 0) { + return indices_end_; + } + StablePartitioner partitioner; + auto nulls_begin = + partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) { + const auto chunk = first_sort_key.GetChunk(index); + return !chunk.IsNull(); + }); + auto& comparer = comparer_; + std::stable_sort(nulls_begin, indices_end_, + [&comparer](uint64_t left, uint64_t right) { + return comparer.Compare(left, right, 1); + }); + return nulls_begin; + } + + // Behaves like PatitionNulls() but this supports multiple sort keys. + // + // For float types. + template + enable_if_t::value, uint64_t*> PartitionNullsInternal( + const ResolvedSortKey& first_sort_key) { + using ArrayType = typename TypeTraits::ArrayType; + StablePartitioner partitioner; + if (first_sort_key.null_count == 0) { + return partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) { + const auto chunk = first_sort_key.GetChunk(index); + return !std::isnan(chunk.GetView()); + }); + } + auto nans_and_nulls_begin = + partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) { + const auto chunk = first_sort_key.GetChunk(index); + return !chunk.IsNull() && !std::isnan(chunk.GetView()); + }); + auto nulls_begin = nans_and_nulls_begin; + if (first_sort_key.null_count < static_cast(indices_end_ - nulls_begin)) { + // move nulls after NaN + nulls_begin = partitioner( + nans_and_nulls_begin, indices_end_, [&first_sort_key](uint64_t index) { + const auto chunk = first_sort_key.GetChunk(index); + return !chunk.IsNull(); + }); + } + auto& comparer = comparer_; + if (nans_and_nulls_begin != nulls_begin) { + // Sort all NaNs by the second and following sort keys. + std::stable_sort(nans_and_nulls_begin, nulls_begin, + [&comparer](uint64_t left, uint64_t right) { + return comparer.Compare(left, right, 1); + }); + } + // Sort all nulls by the second and following sort keys. + std::stable_sort(nulls_begin, indices_end_, + [&comparer](uint64_t left, uint64_t right) { + return comparer.Compare(left, right, 1); + }); + return nans_and_nulls_begin; + } + + uint64_t* indices_begin_; + uint64_t* indices_end_; + Comparer comparer_; +}; + const FunctionDoc sort_indices_doc( + "Return the indices that would sort an array, record batch or table", + ("This function computes an array of indices that define a stable sort\n" + "of the input array, record batch or table. Null values are considered\n" + "greater than any other value and are therefore sorted at the end of the\n" + "input. For floating-point types, NaNs are considered greater than any\n" + "other non-null value, but smaller than null values."), + {"input"}, "SortOptions"); + +class SortIndicesMetaFunction : public MetaFunction { + public: + SortIndicesMetaFunction() + : MetaFunction("sort_indices", Arity::Unary(), &sort_indices_doc) {} + + Result ExecuteImpl(const std::vector& args, + const FunctionOptions* options, + ExecContext* ctx) const override { + const SortOptions& sort_options = static_cast(*options); + switch (args[0].kind()) { + case Datum::ARRAY: + return SortIndices(*args[0].make_array(), sort_options, ctx); + break; + case Datum::CHUNKED_ARRAY: + return SortIndices(*args[0].chunked_array(), sort_options, ctx); + break; + case Datum::RECORD_BATCH: { + ARROW_ASSIGN_OR_RAISE(auto table, + Table::FromRecordBatches({args[0].record_batch()})); + return SortIndices(*table, sort_options, ctx); + } break; + case Datum::TABLE: + return SortIndices(*args[0].table(), sort_options, ctx); + break; + default: + break; + } + return Status::NotImplemented( + "Unsupported types for sort_indices operation: " + "values=", + args[0].ToString()); + } + + private: + Result SortIndices(const Array& values, const SortOptions& options, + ExecContext* ctx) const { + SortOrder order = SortOrder::Ascending; + if (!options.sort_keys.empty()) { + order = options.sort_keys[0].order; + } + ArraySortOptions array_options(order); + return CallFunction("array_sort_indices", {values}, &array_options, ctx); + } + + Result SortIndices(const ChunkedArray& chunked_array, const SortOptions& options, + ExecContext* ctx) const { + SortOrder order = SortOrder::Ascending; + if (!options.sort_keys.empty()) { + order = options.sort_keys[0].order; + } + + auto out_type = uint64(); + auto length = chunked_array.length(); + auto buffer_size = BitUtil::BytesForBits( + length * std::static_pointer_cast(out_type)->bit_width()); + std::vector> buffers(2); + ARROW_ASSIGN_OR_RAISE(buffers[1], + AllocateResizableBuffer(buffer_size, ctx->memory_pool())); + auto out = std::make_shared(out_type, length, buffers, 0); + auto out_begin = out->GetMutableValues(1); + auto out_end = out_begin + length; + std::iota(out_begin, out_end, 0); + + ChunkedArraySorter sorter(out_begin, out_end, chunked_array, order); + ARROW_RETURN_NOT_OK(sorter.Sort()); + return Datum(out); + } + + Result SortIndices(const Table& table, const SortOptions& options, + ExecContext* ctx) const { + auto n_sort_keys = options.sort_keys.size(); + if (n_sort_keys == 0) { + return Status::Invalid("Must specify one or more sort keys"); + } + if (n_sort_keys == 1) { + auto chunked_array = table.GetColumnByName(options.sort_keys[0].name); + if (!chunked_array) { + return Status::Invalid("Nonexistent sort key column: ", + options.sort_keys[0].name); + } + return SortIndices(*chunked_array, options, ctx); + } + + auto out_type = uint64(); + auto length = table.num_rows(); + auto buffer_size = BitUtil::BytesForBits( + length * std::static_pointer_cast(out_type)->bit_width()); + std::vector> buffers(2); + ARROW_ASSIGN_OR_RAISE(buffers[1], + AllocateResizableBuffer(buffer_size, ctx->memory_pool())); + auto out = std::make_shared(out_type, length, buffers, 0); + auto out_begin = out->GetMutableValues(1); + auto out_end = out_begin + length; + std::iota(out_begin, out_end, 0); + + // TODO: We should choose suitable sort implementation + // automatically. The current TableRadixSorter implementation is + // faster than MultipleKeyTableSorter only when the number of + // sort keys is 2 and counting sort is used. So we always + // MultipleKeyTableSorter for now. + // + // TableRadixSorter sorter; + // ARROW_RETURN_NOT_OK(sorter.Sort(out_begin, out_end, table, options)); + MultipleKeyTableSorter sorter(out_begin, out_end, table, options); + ARROW_RETURN_NOT_OK(sorter.Sort()); + return Datum(out); + } +}; + +const FunctionDoc array_sort_indices_doc( "Return the indices that would sort an array", ("This function computes an array of indices that define a stable sort\n" "of the input array. Null values are considered greater than any\n" "other value and are therefore sorted at the end of the array.\n" "For floating-point types, NaNs are considered greater than any\n" "other non-null value, but smaller than null values."), - {"array"}); + {"array"}, "ArraySortOptions"); const FunctionDoc partition_nth_indices_doc( "Return the indices that would partition an array around a pivot", @@ -380,10 +1153,13 @@ void RegisterVectorSort(FunctionRegistry* registry) { base.mem_allocation = MemAllocation::PREALLOCATE; base.null_handling = NullHandling::OUTPUT_NOT_NULL; - auto sort_indices = - std::make_shared("sort_indices", Arity::Unary(), &sort_indices_doc); - AddSortingKernels(base, sort_indices.get()); - DCHECK_OK(registry->AddFunction(std::move(sort_indices))); + auto array_sort_indices = std::make_shared( + "array_sort_indices", Arity::Unary(), &array_sort_indices_doc); + base.init = ArraySortIndicesState::Init; + AddSortingKernels(base, array_sort_indices.get()); + DCHECK_OK(registry->AddFunction(std::move(array_sort_indices))); + + DCHECK_OK(registry->AddFunction(std::make_shared())); // partition_nth_indices has a parameter so needs its init function auto part_indices = std::make_shared( diff --git a/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc index be541b2dad6..1a7eb031f20 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc @@ -19,6 +19,7 @@ #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/benchmark_util.h" @@ -27,51 +28,201 @@ namespace arrow { namespace compute { constexpr auto kSeed = 0x0ff1ce; -static void SortToIndicesBenchmark(benchmark::State& state, - const std::shared_ptr& values) { +static void ArraySortIndicesBenchmark(benchmark::State& state, + const std::shared_ptr& values) { for (auto _ : state) { - ABORT_NOT_OK(SortToIndices(*values).status()); + ABORT_NOT_OK(SortIndices(*values).status()); } state.SetItemsProcessed(state.iterations() * values->length()); } -static void SortToIndicesInt64Count(benchmark::State& state) { +static void ChunkedArraySortIndicesBenchmark( + benchmark::State& state, const std::shared_ptr& values) { + for (auto _ : state) { + ABORT_NOT_OK(SortIndices(*values).status()); + } + state.SetItemsProcessed(state.iterations() * values->length()); +} + +static void ArraySortIndicesInt64Benchmark(benchmark::State& state, int64_t min, + int64_t max) { RegressionArgs args(state); const int64_t array_size = args.size / sizeof(int64_t); auto rand = random::RandomArrayGenerator(kSeed); + auto values = rand.Int64(array_size, min, max, args.null_proportion); - auto values = rand.Int64(array_size, -100, 100, args.null_proportion); - - SortToIndicesBenchmark(state, values); + ArraySortIndicesBenchmark(state, values); } -static void SortToIndicesInt64Compare(benchmark::State& state) { +static void ChunkedArraySortIndicesInt64Benchmark(benchmark::State& state, int64_t min, + int64_t max) { RegressionArgs args(state); - const int64_t array_size = args.size / sizeof(int64_t); + const int64_t n_chunks = 10; + const int64_t array_size = args.size / n_chunks / sizeof(int64_t); auto rand = random::RandomArrayGenerator(kSeed); + ArrayVector chunks; + for (int64_t i = 0; i < n_chunks; ++i) { + chunks.push_back(rand.Int64(array_size, min, max, args.null_proportion)); + } - auto min = std::numeric_limits::min(); - auto max = std::numeric_limits::max(); - auto values = rand.Int64(array_size, min, max, args.null_proportion); + ChunkedArraySortIndicesBenchmark(state, std::make_shared(chunks)); +} + +static void ArraySortIndicesInt64Narrow(benchmark::State& state) { + ArraySortIndicesInt64Benchmark(state, -100, 100); +} + +static void ArraySortIndicesInt64Wide(benchmark::State& state) { + const auto min = std::numeric_limits::min(); + const auto max = std::numeric_limits::max(); + ArraySortIndicesInt64Benchmark(state, min, max); +} + +static void ChunkedArraySortIndicesInt64Narrow(benchmark::State& state) { + ChunkedArraySortIndicesInt64Benchmark(state, -100, 100); +} + +static void ChunkedArraySortIndicesInt64Wide(benchmark::State& state) { + const auto min = std::numeric_limits::min(); + const auto max = std::numeric_limits::max(); + ChunkedArraySortIndicesInt64Benchmark(state, min, max); +} + +static void TableSortIndicesBenchmark(benchmark::State& state, + const std::shared_ptr& table, + const SortOptions& options) { + for (auto _ : state) { + ABORT_NOT_OK(SortIndices(*table, options).status()); + } +} + +// Extract benchmark args from benchmark::State +struct TableSortIndicesArgs { + // the number of records + const int64_t num_records; + + // proportion of nulls in generated arrays + const double null_proportion; + + // the number of columns + const int64_t num_columns; + + // the number of chunks in each generated column + const int64_t num_chunks; + + // Extract args + explicit TableSortIndicesArgs(benchmark::State& state) + : num_records(state.range(0)), + null_proportion(ComputeNullProportion(state.range(1))), + num_columns(state.range(2)), + num_chunks(state.range(3)), + state_(state) {} + + ~TableSortIndicesArgs() { state_.SetItemsProcessed(state_.iterations() * num_records); } - SortToIndicesBenchmark(state, values); + private: + double ComputeNullProportion(int64_t inverse_null_proportion) { + if (inverse_null_proportion == 0) { + return 0.0; + } else { + return std::min(1., 1. / static_cast(inverse_null_proportion)); + } + } + + benchmark::State& state_; +}; + +static void TableSortIndicesInt64(benchmark::State& state, int64_t min, int64_t max) { + TableSortIndicesArgs args(state); + + auto rand = random::RandomArrayGenerator(kSeed); + std::vector> fields; + std::vector sort_keys; + std::vector> columns; + for (int64_t i = 0; i < args.num_columns; ++i) { + auto name = std::to_string(i); + fields.push_back(field(name, int64())); + auto order = (i % 2) == 0 ? SortOrder::Ascending : SortOrder::Descending; + sort_keys.emplace_back(name, order); + std::vector> arrays; + if ((args.num_records % args.num_chunks) != 0) { + Status::Invalid("The number of chunks (", args.num_chunks, + ") must be " + "a multiple of the number of records (", + args.num_records, ")") + .Abort(); + } + auto num_records_in_array = args.num_records / args.num_chunks; + for (int64_t j = 0; j < args.num_chunks; ++j) { + arrays.push_back(rand.Int64(num_records_in_array, min, max, args.null_proportion)); + } + ASSIGN_OR_ABORT(auto chunked_array, ChunkedArray::Make(arrays, int64())); + columns.push_back(chunked_array); + } + + auto table = Table::Make(schema(fields), columns, args.num_records); + SortOptions options(sort_keys); + TableSortIndicesBenchmark(state, table, options); } -BENCHMARK(SortToIndicesInt64Count) +static void TableSortIndicesInt64Narrow(benchmark::State& state) { + TableSortIndicesInt64(state, -100, 100); +} + +static void TableSortIndicesInt64Wide(benchmark::State& state) { + TableSortIndicesInt64(state, std::numeric_limits::min(), + std::numeric_limits::max()); +} + +BENCHMARK(ArraySortIndicesInt64Narrow) ->Apply(RegressionSetArgs) ->Args({1 << 20, 100}) ->Args({1 << 23, 100}) ->MinTime(1.0) ->Unit(benchmark::TimeUnit::kNanosecond); -BENCHMARK(SortToIndicesInt64Compare) +BENCHMARK(ArraySortIndicesInt64Wide) ->Apply(RegressionSetArgs) ->Args({1 << 20, 100}) ->Args({1 << 23, 100}) ->MinTime(1.0) ->Unit(benchmark::TimeUnit::kNanosecond); +BENCHMARK(ChunkedArraySortIndicesInt64Narrow) + ->Apply(RegressionSetArgs) + ->Args({1 << 20, 100}) + ->Args({1 << 23, 100}) + ->MinTime(1.0) + ->Unit(benchmark::TimeUnit::kNanosecond); + +BENCHMARK(ChunkedArraySortIndicesInt64Wide) + ->Apply(RegressionSetArgs) + ->Args({1 << 20, 100}) + ->Args({1 << 23, 100}) + ->MinTime(1.0) + ->Unit(benchmark::TimeUnit::kNanosecond); + +BENCHMARK(TableSortIndicesInt64Narrow) + ->ArgsProduct({ + {1 << 20}, // the number of records + {100, 0}, // inverse null proportion + {16, 8, 2, 1}, // the number of columns + {32, 4, 1}, // the number of chunks + }) + ->MinTime(1.0) + ->Unit(benchmark::TimeUnit::kNanosecond); + +BENCHMARK(TableSortIndicesInt64Wide) + ->ArgsProduct({ + {1 << 20}, // the number of records + {100, 0}, // inverse null proportion + {16, 8, 2, 1}, // the number of columns + {32, 4, 1}, // the number of chunks + }) + ->MinTime(1.0) + ->Unit(benchmark::TimeUnit::kNanosecond); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 448576da8ce..e70fbae2819 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -20,8 +20,9 @@ #include #include +#include "arrow/array/concatenate.h" #include "arrow/compute/api_vector.h" -#include "arrow/compute/kernels/test_util.h" +#include "arrow/table.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" @@ -30,6 +31,7 @@ namespace arrow { +using internal::checked_cast; using internal::checked_pointer_cast; namespace compute { @@ -51,7 +53,7 @@ class NthComparator { template class SortComparator { public: - bool operator()(const ArrayType& array, uint64_t lhs, uint64_t rhs) { + bool operator()(const ArrayType& array, SortOrder order, uint64_t lhs, uint64_t rhs) { if (array.IsNull(rhs) && array.IsNull(lhs)) return lhs < rhs; if (array.IsNull(rhs)) return true; if (array.IsNull(lhs)) return false; @@ -63,7 +65,11 @@ class SortComparator { if (lhs_isnan) return false; } if (array.GetView(lhs) == array.GetView(rhs)) return lhs < rhs; - return array.GetView(lhs) < array.GetView(rhs); + if (order == SortOrder::Ascending) { + return array.GetView(lhs) < array.GetView(rhs); + } else { + return array.GetView(lhs) > array.GetView(rhs); + } } }; @@ -228,124 +234,157 @@ TYPED_TEST(TestNthToIndicesRandom, RandomValues) { using arrow::internal::checked_pointer_cast; template -class TestSortToIndicesKernel : public TestBase { +class TestArraySortIndicesKernel : public TestBase { private: - void AssertSortToIndicesArrays(const std::shared_ptr values, - const std::shared_ptr expected) { - ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, SortToIndices(*values)); + void AssertArraysSortIndices(const std::shared_ptr values, SortOrder order, + const std::shared_ptr expected) { + ASSERT_OK_AND_ASSIGN(std::shared_ptr actual, SortIndices(*values, order)); ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } protected: - virtual void AssertSortToIndices(const std::string& values, - const std::string& expected) { + virtual void AssertSortIndices(const std::string& values, SortOrder order, + const std::string& expected) { auto type = TypeTraits::type_singleton(); - AssertSortToIndicesArrays(ArrayFromJSON(type, values), - ArrayFromJSON(uint64(), expected)); + AssertArraysSortIndices(ArrayFromJSON(type, values), order, + ArrayFromJSON(uint64(), expected)); + } + + virtual void AssertSortIndices(const std::string& values, const std::string& expected) { + AssertSortIndices(values, SortOrder::Ascending, expected); } }; template -class TestSortToIndicesKernelForReal : public TestSortToIndicesKernel {}; -TYPED_TEST_SUITE(TestSortToIndicesKernelForReal, RealArrowTypes); +class TestArraySortIndicesKernelForReal : public TestArraySortIndicesKernel {}; +TYPED_TEST_SUITE(TestArraySortIndicesKernelForReal, RealArrowTypes); template -class TestSortToIndicesKernelForIntegral : public TestSortToIndicesKernel {}; -TYPED_TEST_SUITE(TestSortToIndicesKernelForIntegral, IntegralArrowTypes); +class TestArraySortIndicesKernelForIntegral + : public TestArraySortIndicesKernel {}; +TYPED_TEST_SUITE(TestArraySortIndicesKernelForIntegral, IntegralArrowTypes); template -class TestSortToIndicesKernelForStrings : public TestSortToIndicesKernel {}; -TYPED_TEST_SUITE(TestSortToIndicesKernelForStrings, testing::Types); +class TestArraySortIndicesKernelForStrings + : public TestArraySortIndicesKernel {}; +TYPED_TEST_SUITE(TestArraySortIndicesKernelForStrings, testing::Types); + +TYPED_TEST(TestArraySortIndicesKernelForReal, SortReal) { + this->AssertSortIndices("[]", "[]"); + + this->AssertSortIndices("[3.4, 2.6, 6.3]", "[1, 0, 2]"); + this->AssertSortIndices("[1.1, 2.4, 3.5, 4.3, 5.1, 6.8, 7.3]", "[0, 1, 2, 3, 4, 5, 6]"); + this->AssertSortIndices("[7, 6, 5, 4, 3, 2, 1]", "[6, 5, 4, 3, 2, 1, 0]"); + this->AssertSortIndices("[10.4, 12, 4.2, 50, 50.3, 32, 11]", "[2, 0, 6, 1, 5, 3, 4]"); + + this->AssertSortIndices("[null, 1, 3.3, null, 2, 5.3]", SortOrder::Ascending, + "[1, 4, 2, 5, 0, 3]"); + this->AssertSortIndices("[null, 1, 3.3, null, 2, 5.3]", SortOrder::Descending, + "[5, 2, 4, 1, 0, 3]"); + + this->AssertSortIndices("[3, 4, NaN, 1, 2, null]", SortOrder::Ascending, + "[3, 4, 0, 1, 2, 5]"); + this->AssertSortIndices("[3, 4, NaN, 1, 2, null]", SortOrder::Descending, + "[1, 0, 4, 3, 2, 5]"); + this->AssertSortIndices("[NaN, 2, NaN, 3, 1]", SortOrder::Ascending, "[4, 1, 3, 0, 2]"); + this->AssertSortIndices("[NaN, 2, NaN, 3, 1]", SortOrder::Descending, + "[3, 1, 4, 0, 2]"); + this->AssertSortIndices("[null, NaN, NaN, null]", SortOrder::Ascending, "[1, 2, 0, 3]"); + this->AssertSortIndices("[null, NaN, NaN, null]", SortOrder::Descending, + "[1, 2, 0, 3]"); +} -TYPED_TEST(TestSortToIndicesKernelForReal, SortReal) { - this->AssertSortToIndices("[]", "[]"); +TYPED_TEST(TestArraySortIndicesKernelForIntegral, SortIntegral) { + this->AssertSortIndices("[]", "[]"); - this->AssertSortToIndices("[3.4, 2.6, 6.3]", "[1, 0, 2]"); - this->AssertSortToIndices("[1.1, 2.4, 3.5, 4.3, 5.1, 6.8, 7.3]", "[0,1,2,3,4,5,6]"); - this->AssertSortToIndices("[7, 6, 5, 4, 3, 2, 1]", "[6,5,4,3,2,1,0]"); - this->AssertSortToIndices("[10.4, 12, 4.2, 50, 50.3, 32, 11]", "[2,0,6,1,5,3,4]"); + this->AssertSortIndices("[3, 2, 6]", "[1, 0, 2]"); + this->AssertSortIndices("[1, 2, 3, 4, 5, 6, 7]", "[0, 1, 2, 3, 4, 5, 6]"); + this->AssertSortIndices("[7, 6, 5, 4, 3, 2, 1]", "[6, 5, 4, 3, 2, 1, 0]"); - this->AssertSortToIndices("[null, 1, 3.3, null, 2, 5.3]", "[1,4,2,5,0,3]"); + this->AssertSortIndices("[10, 12, 4, 50, 50, 32, 11]", SortOrder::Ascending, + "[2, 0, 6, 1, 5, 3, 4]"); + this->AssertSortIndices("[10, 12, 4, 50, 50, 32, 11]", SortOrder::Descending, + "[3, 4, 5, 1, 6, 0, 2]"); - this->AssertSortToIndices("[3, 4, NaN, 1, 2, null]", "[3,4,0,1,2,5]"); - this->AssertSortToIndices("[NaN, 2, NaN, 3, 1]", "[4,1,3,0,2]"); - this->AssertSortToIndices("[null, NaN, NaN, null]", "[1,2,0,3]"); + this->AssertSortIndices("[null, 1, 3, null, 2, 5]", SortOrder::Ascending, + "[1, 4, 2, 5, 0, 3]"); + this->AssertSortIndices("[null, 1, 3, null, 2, 5]", SortOrder::Descending, + "[5, 2, 4, 1, 0, 3]"); } -TYPED_TEST(TestSortToIndicesKernelForIntegral, SortIntegral) { - this->AssertSortToIndices("[]", "[]"); +TYPED_TEST(TestArraySortIndicesKernelForStrings, SortStrings) { + this->AssertSortIndices("[]", "[]"); - this->AssertSortToIndices("[3, 2, 6]", "[1, 0, 2]"); - this->AssertSortToIndices("[1, 2, 3, 4, 5, 6, 7]", "[0,1,2,3,4,5,6]"); - this->AssertSortToIndices("[7, 6, 5, 4, 3, 2, 1]", "[6,5,4,3,2,1,0]"); - this->AssertSortToIndices("[10, 12, 4, 50, 50, 32, 11]", "[2,0,6,1,5,3,4]"); + this->AssertSortIndices(R"(["a", "b", "c"])", "[0, 1, 2]"); + this->AssertSortIndices(R"(["foo", "bar", "baz"])", "[1,2,0]"); + this->AssertSortIndices(R"(["testing", "sort", "for", "strings"])", "[2, 1, 3, 0]"); - this->AssertSortToIndices("[null, 1, 3, null, 2, 5]", "[1,4,2,5,0,3]"); -} - -TYPED_TEST(TestSortToIndicesKernelForStrings, SortStrings) { - this->AssertSortToIndices("[]", "[]"); - - this->AssertSortToIndices(R"(["a", "b", "c"])", "[0, 1, 2]"); - this->AssertSortToIndices(R"(["foo", "bar", "baz"])", "[1,2,0]"); - this->AssertSortToIndices(R"(["testing", "sort", "for", "strings"])", "[2, 1, 3, 0]"); + this->AssertSortIndices(R"(["c", "b", "a", "b"])", SortOrder::Ascending, + "[2, 1, 3, 0]"); + this->AssertSortIndices(R"(["c", "b", "a", "b"])", SortOrder::Descending, + "[0, 1, 3, 2]"); } template -class TestSortToIndicesKernelForUInt8 : public TestSortToIndicesKernel {}; -TYPED_TEST_SUITE(TestSortToIndicesKernelForUInt8, UInt8Type); +class TestArraySortIndicesKernelForUInt8 : public TestArraySortIndicesKernel { +}; +TYPED_TEST_SUITE(TestArraySortIndicesKernelForUInt8, UInt8Type); template -class TestSortToIndicesKernelForInt8 : public TestSortToIndicesKernel {}; -TYPED_TEST_SUITE(TestSortToIndicesKernelForInt8, Int8Type); +class TestArraySortIndicesKernelForInt8 : public TestArraySortIndicesKernel {}; +TYPED_TEST_SUITE(TestArraySortIndicesKernelForInt8, Int8Type); -TYPED_TEST(TestSortToIndicesKernelForUInt8, SortUInt8) { - this->AssertSortToIndices("[255, null, 0, 255, 10, null, 128, 0]", "[2,7,4,6,0,3,1,5]"); +TYPED_TEST(TestArraySortIndicesKernelForUInt8, SortUInt8) { + this->AssertSortIndices("[255, null, 0, 255, 10, null, 128, 0]", + "[2, 7, 4, 6, 0, 3, 1, 5]"); } -TYPED_TEST(TestSortToIndicesKernelForInt8, SortInt8) { - this->AssertSortToIndices("[null, 10, 127, 0, -128, -128, null]", "[4,5,3,1,2,0,6]"); +TYPED_TEST(TestArraySortIndicesKernelForInt8, SortInt8) { + this->AssertSortIndices("[null, 10, 127, 0, -128, -128, null]", + "[4, 5, 3, 1, 2, 0, 6]"); } template -class TestSortToIndicesKernelRandom : public TestBase {}; +class TestArraySortIndicesKernelRandom : public TestBase {}; template -class TestSortToIndicesKernelRandomCount : public TestBase {}; +class TestArraySortIndicesKernelRandomCount : public TestBase {}; template -class TestSortToIndicesKernelRandomCompare : public TestBase {}; +class TestArraySortIndicesKernelRandomCompare : public TestBase {}; -using SortToIndicesableTypes = +using SortIndicesableTypes = ::testing::Types; template -void ValidateSorted(const ArrayType& array, UInt64Array& offsets) { +void ValidateSorted(const ArrayType& array, UInt64Array& offsets, SortOrder order) { ASSERT_OK(array.ValidateFull()); SortComparator compare; for (int i = 1; i < array.length(); i++) { uint64_t lhs = offsets.Value(i - 1); uint64_t rhs = offsets.Value(i); - ASSERT_TRUE(compare(array, lhs, rhs)); + ASSERT_TRUE(compare(array, order, lhs, rhs)); } } -TYPED_TEST_SUITE(TestSortToIndicesKernelRandom, SortToIndicesableTypes); +TYPED_TEST_SUITE(TestArraySortIndicesKernelRandom, SortIndicesableTypes); -TYPED_TEST(TestSortToIndicesKernelRandom, SortRandomValues) { +TYPED_TEST(TestArraySortIndicesKernelRandom, SortRandomValues) { using ArrayType = typename TypeTraits::ArrayType; Random rand(0x5487655); int times = 5; - int length = 1000; + int length = 100; for (int test = 0; test < times; test++) { for (auto null_probability : {0.0, 0.1, 0.5, 1.0}) { - auto array = rand.Generate(length, null_probability); - ASSERT_OK_AND_ASSIGN(std::shared_ptr offsets, SortToIndices(*array)); - ValidateSorted(*checked_pointer_cast(array), - *checked_pointer_cast(offsets)); + for (auto order : {SortOrder::Ascending, SortOrder::Descending}) { + auto array = rand.Generate(length, null_probability); + ASSERT_OK_AND_ASSIGN(std::shared_ptr offsets, SortIndices(*array, order)); + ValidateSorted(*checked_pointer_cast(array), + *checked_pointer_cast(offsets), order); + } } } } @@ -353,43 +392,408 @@ TYPED_TEST(TestSortToIndicesKernelRandom, SortRandomValues) { // Long array with small value range: counting sort // - length >= 1024(CountCompareSorter::countsort_min_len_) // - range <= 4096(CountCompareSorter::countsort_max_range_) -TYPED_TEST_SUITE(TestSortToIndicesKernelRandomCount, IntegralArrowTypes); +TYPED_TEST_SUITE(TestArraySortIndicesKernelRandomCount, IntegralArrowTypes); -TYPED_TEST(TestSortToIndicesKernelRandomCount, SortRandomValuesCount) { +TYPED_TEST(TestArraySortIndicesKernelRandomCount, SortRandomValuesCount) { using ArrayType = typename TypeTraits::ArrayType; RandomRange rand(0x5487656); int times = 5; - int length = 4000; + int length = 100; int range = 2000; for (int test = 0; test < times; test++) { for (auto null_probability : {0.0, 0.1, 0.5, 1.0}) { - auto array = rand.Generate(length, range, null_probability); - ASSERT_OK_AND_ASSIGN(std::shared_ptr offsets, SortToIndices(*array)); - ValidateSorted(*checked_pointer_cast(array), - *checked_pointer_cast(offsets)); + for (auto order : {SortOrder::Ascending, SortOrder::Descending}) { + auto array = rand.Generate(length, range, null_probability); + ASSERT_OK_AND_ASSIGN(std::shared_ptr offsets, SortIndices(*array, order)); + ValidateSorted(*checked_pointer_cast(array), + *checked_pointer_cast(offsets), order); + } } } } // Long array with big value range: std::stable_sort -TYPED_TEST_SUITE(TestSortToIndicesKernelRandomCompare, IntegralArrowTypes); +TYPED_TEST_SUITE(TestArraySortIndicesKernelRandomCompare, IntegralArrowTypes); -TYPED_TEST(TestSortToIndicesKernelRandomCompare, SortRandomValuesCompare) { +TYPED_TEST(TestArraySortIndicesKernelRandomCompare, SortRandomValuesCompare) { using ArrayType = typename TypeTraits::ArrayType; Random rand(0x5487657); int times = 5; - int length = 4000; + int length = 100; for (int test = 0; test < times; test++) { for (auto null_probability : {0.0, 0.1, 0.5, 1.0}) { - auto array = rand.Generate(length, null_probability); - ASSERT_OK_AND_ASSIGN(std::shared_ptr offsets, SortToIndices(*array)); - ValidateSorted(*checked_pointer_cast(array), - *checked_pointer_cast(offsets)); + for (auto order : {SortOrder::Ascending, SortOrder::Descending}) { + auto array = rand.Generate(length, null_probability); + ASSERT_OK_AND_ASSIGN(std::shared_ptr offsets, SortIndices(*array, order)); + ValidateSorted(*checked_pointer_cast(array), + *checked_pointer_cast(offsets), order); + } } } } +// Test basic cases for chunked array. +class TestChunkedArraySortIndices : public ::testing::Test { + protected: + void AssertSortIndices(const std::shared_ptr chunked_array, + SortOrder order, const std::shared_ptr expected) { + ASSERT_OK_AND_ASSIGN(auto actual, SortIndices(*chunked_array, order)); + AssertArraysEqual(*expected, *actual); + } + + void AssertSortIndices(const std::shared_ptr chunked_array, + SortOrder order, const std::string expected) { + AssertSortIndices(chunked_array, order, ArrayFromJSON(uint64(), expected)); + } +}; + +TEST_F(TestChunkedArraySortIndices, SortNull) { + auto chunked_array = ChunkedArrayFromJSON(uint8(), { + "[null, 1]", + "[3, null, 2]", + "[1]", + }); + this->AssertSortIndices(chunked_array, SortOrder::Ascending, "[1, 5, 4, 2, 0, 3]"); + this->AssertSortIndices(chunked_array, SortOrder::Descending, "[2, 4, 1, 5, 0, 3]"); +} + +TEST_F(TestChunkedArraySortIndices, SortNaN) { + auto chunked_array = ChunkedArrayFromJSON(float32(), { + "[null, 1]", + "[3, null, NaN]", + "[NaN, 1]", + }); + this->AssertSortIndices(chunked_array, SortOrder::Ascending, "[1, 6, 2, 4, 5, 0, 3]"); + this->AssertSortIndices(chunked_array, SortOrder::Descending, "[2, 1, 6, 4, 5, 0, 3]"); +} + +// Base class for testing against random chunked array. +template +class TestChunkedArrayRandomBase : public TestBase { + protected: + // Generates a chunk. This should be implemented in subclasses. + virtual std::shared_ptr GenerateArray(int length, double null_probability) = 0; + + // All tests uses this. + void TestSortIndices(int length) { + using ArrayType = typename TypeTraits::ArrayType; + // We can use INSTANTIATE_TEST_SUITE_P() instead of using fors in a test. + for (auto null_probability : {0.0, 0.1, 0.5, 0.9, 1.0}) { + for (auto order : {SortOrder::Ascending, SortOrder::Descending}) { + for (auto num_chunks : {1, 5, 10}) { + std::vector> arrays; + for (int i = 0; i < num_chunks; ++i) { + auto array = this->GenerateArray(length, null_probability); + arrays.push_back(array); + } + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make(arrays)); + ASSERT_OK_AND_ASSIGN(auto offsets, SortIndices(*chunked_array, order)); + // Concatenates chunks to use existing ValidateSorted() for array. + ASSERT_OK_AND_ASSIGN(auto concatenated_array, Concatenate(arrays)); + ValidateSorted(*checked_pointer_cast(concatenated_array), + *checked_pointer_cast(offsets), order); + } + } + } + } +}; + +// Long array with big value range: std::stable_sort +template +class TestChunkedArrayRandom : public TestChunkedArrayRandomBase { + public: + void SetUp() override { rand_ = new Random(0x5487655); } + + void TearDown() override { delete rand_; } + + protected: + std::shared_ptr GenerateArray(int length, double null_probability) override { + return rand_->Generate(length, null_probability); + } + + private: + Random* rand_; +}; +TYPED_TEST_SUITE(TestChunkedArrayRandom, SortIndicesableTypes); +TYPED_TEST(TestChunkedArrayRandom, SortIndices) { this->TestSortIndices(100); } + +// Long array with small value range: counting sort +// - length >= 1024(CountCompareSorter::countsort_min_len_) +// - range <= 4096(CountCompareSorter::countsort_max_range_) +template +class TestChunkedArrayRandomNarrow : public TestChunkedArrayRandomBase { + public: + void SetUp() override { + range_ = 2000; + rand_ = new RandomRange(0x5487655); + } + + void TearDown() override { delete rand_; } + + protected: + std::shared_ptr GenerateArray(int length, double null_probability) override { + return rand_->Generate(length, range_, null_probability); + } + + private: + int range_; + RandomRange* rand_; +}; +TYPED_TEST_SUITE(TestChunkedArrayRandomNarrow, IntegralArrowTypes); +TYPED_TEST(TestChunkedArrayRandomNarrow, SortIndices) { this->TestSortIndices(100); } + +// Test basic cases for table. +class TestTableSortIndices : public ::testing::Test { + protected: + void AssertSortIndices(const std::shared_ptr
table, const SortOptions& options, + const std::shared_ptr expected) { + ASSERT_OK_AND_ASSIGN(auto actual, SortIndices(*table, options)); + AssertArraysEqual(*expected, *actual); + } + + void AssertSortIndices(const std::shared_ptr
table, const SortOptions& options, + const std::string expected) { + AssertSortIndices(table, options, ArrayFromJSON(uint64(), expected)); + } +}; + +TEST_F(TestTableSortIndices, SortNull) { + auto table = TableFromJSON(schema({ + {field("a", uint8())}, + {field("b", uint8())}, + }), + {"[" + "{\"a\": null, \"b\": 5}," + "{\"a\": 1, \"b\": 3}," + "{\"a\": 3, \"b\": null}," + "{\"a\": null, \"b\": null}," + "{\"a\": 2, \"b\": 5}," + "{\"a\": 1, \"b\": 5}" + "]"}); + SortOptions options( + {SortKey("a", SortOrder::Ascending), SortKey("b", SortOrder::Descending)}); + this->AssertSortIndices(table, options, "[5, 1, 4, 2, 0, 3]"); +} + +TEST_F(TestTableSortIndices, SortNaN) { + auto table = TableFromJSON(schema({ + {field("a", float32())}, + {field("b", float32())}, + }), + {"[" + "{\"a\": null, \"b\": 5}," + "{\"a\": 1, \"b\": 3}," + "{\"a\": 3, \"b\": null}," + "{\"a\": null, \"b\": null}," + "{\"a\": NaN, \"b\": null}," + "{\"a\": NaN, \"b\": NaN}," + "{\"a\": NaN, \"b\": 5}," + "{\"a\": 1, \"b\": 5}" + "]"}); + SortOptions options( + {SortKey("a", SortOrder::Ascending), SortKey("b", SortOrder::Descending)}); + this->AssertSortIndices(table, options, "[7, 1, 2, 6, 5, 4, 0, 3]"); +} + +// For random table tests. +using RandomParam = std::tuple; +class TestTableSortIndicesRandom : public testing::TestWithParam { + // Compares two records in the same table. + class Comparator : public TypeVisitor { + public: + Comparator(const Table& table, const SortOptions& options) : options_(options) { + for (const auto& sort_key : options_.sort_keys) { + sort_columns_.emplace_back(table.GetColumnByName(sort_key.name).get(), + sort_key.order); + } + } + + // Returns true if the left record is less or equals to the right + // record, false otherwise. + // + // This supports null and NaN. + bool operator()(uint64_t lhs, uint64_t rhs) { + lhs_ = lhs; + rhs_ = rhs; + for (const auto& pair : sort_columns_) { + const auto& chunked_array = *pair.first; + lhs_array_ = FindTargetArray(chunked_array, lhs, &lhs_index_); + rhs_array_ = FindTargetArray(chunked_array, rhs, &rhs_index_); + if (rhs_array_->IsNull(rhs_index_) && lhs_array_->IsNull(lhs_index_)) continue; + if (rhs_array_->IsNull(rhs_index_)) return true; + if (lhs_array_->IsNull(lhs_index_)) return false; + status_ = lhs_array_->type()->Accept(this); + if (compared_ == 0) continue; + if (pair.second == SortOrder::Ascending) { + return compared_ < 0; + } else { + return compared_ > 0; + } + } + return lhs < rhs; + } + + Status status() const { return status_; } + +#define VISIT(TYPE) \ + Status Visit(const TYPE##Type& type) override { \ + compared_ = CompareType(); \ + return Status::OK(); \ + } + + VISIT(Int8) + VISIT(Int16) + VISIT(Int32) + VISIT(Int64) + VISIT(UInt8) + VISIT(UInt16) + VISIT(UInt32) + VISIT(UInt64) + VISIT(Float) + VISIT(Double) + VISIT(String) + +#undef VISIT + + private: + // Finds the target chunk and index in the target chunk from an + // index in chunked array. + const Array* FindTargetArray(const ChunkedArray& chunked_array, int64_t i, + int64_t* chunk_index) { + int64_t offset = 0; + for (const auto& chunk : chunked_array.chunks()) { + if (i < offset + chunk->length()) { + *chunk_index = i - offset; + return chunk.get(); + } + offset += chunk->length(); + } + return nullptr; + } + + // Compares two values in the same chunked array. Values are never + // null but may be NaN. + // + // Returns true if the left value is less or equals to the right + // value, false otherwise. + template + int CompareType() { + using ArrayType = typename TypeTraits::ArrayType; + auto lhs_value = checked_cast(lhs_array_)->GetView(lhs_index_); + auto rhs_value = checked_cast(rhs_array_)->GetView(rhs_index_); + if (is_floating_type::value) { + const bool lhs_isnan = lhs_value != lhs_value; + const bool rhs_isnan = rhs_value != rhs_value; + if (lhs_isnan && rhs_isnan) return 0; + if (rhs_isnan) return 1; + if (lhs_isnan) return -1; + } + if (lhs_value == rhs_value) { + return 0; + } else if (lhs_value > rhs_value) { + return 1; + } else { + return -1; + } + } + + const SortOptions& options_; + std::vector> sort_columns_; + int64_t lhs_; + const Array* lhs_array_; + int64_t lhs_index_; + int64_t rhs_; + const Array* rhs_array_; + int64_t rhs_index_; + int compared_; + Status status_; + }; + + public: + // Validates the sorted indexes are really sorted. + void Validate(const Table& table, const SortOptions& options, UInt64Array& offsets) { + ASSERT_OK(offsets.ValidateFull()); + Comparator comparator{table, options}; + for (int i = 1; i < table.num_rows(); i++) { + uint64_t lhs = offsets.Value(i - 1); + uint64_t rhs = offsets.Value(i); + ASSERT_TRUE(comparator(lhs, rhs)); + ASSERT_OK(comparator.status()); + } + } +}; + +TEST_P(TestTableSortIndicesRandom, Sort) { + const auto first_sort_key_name = std::get<0>(GetParam()); + const auto null_probability = std::get<1>(GetParam()); + const auto seed = 0x61549225; + std::vector column_names = { + "uint8", "uint16", "uint32", "uint64", "int8", "int16", + "int32", "int64", "float", "double", "string", + }; + std::vector> fields = { + {field(column_names[0], uint8())}, {field(column_names[1], uint16())}, + {field(column_names[2], uint32())}, {field(column_names[3], uint64())}, + {field(column_names[4], int8())}, {field(column_names[5], int16())}, + {field(column_names[6], int32())}, {field(column_names[7], int64())}, + {field(column_names[8], float32())}, {field(column_names[9], float64())}, + {field(column_names[10], utf8())}, + }; + const auto length = 100; + std::vector> columns = { + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + Random(seed).Generate(length, null_probability), + }; + const auto table = Table::Make(schema(fields), columns, length); + std::default_random_engine engine(seed); + std::uniform_int_distribution<> distribution(0); + const auto n_sort_keys = 5; + std::vector sort_keys; + const auto first_sort_key_order = + (distribution(engine) % 2) == 0 ? SortOrder::Ascending : SortOrder::Descending; + sort_keys.emplace_back(first_sort_key_name, first_sort_key_order); + for (int i = 1; i < n_sort_keys; ++i) { + const auto& column_name = column_names[distribution(engine) % column_names.size()]; + const auto order = + (distribution(engine) % 2) == 0 ? SortOrder::Ascending : SortOrder::Descending; + sort_keys.emplace_back(column_name, order); + } + SortOptions options(sort_keys); + ASSERT_OK_AND_ASSIGN(auto offsets, SortIndices(*table, options)); + Validate(*table, options, *checked_pointer_cast(offsets)); +} + +INSTANTIATE_TEST_SUITE_P(NoNull, TestTableSortIndicesRandom, + testing::Combine(testing::Values("uint8", "uint16", "uint32", + "uint64", "int8", "int16", + "int32", "int64", "float", + "double", "string"), + testing::Values(0.0))); + +INSTANTIATE_TEST_SUITE_P(MayNull, TestTableSortIndicesRandom, + testing::Combine(testing::Values("uint8", "uint16", "uint32", + "uint64", "int8", "int16", + "int32", "int64", "float", + "double", "string"), + testing::Values(0.1, 0.5))); + +INSTANTIATE_TEST_SUITE_P(AllNull, TestTableSortIndicesRandom, + testing::Combine(testing::Values("uint8", "uint16", "uint32", + "uint64", "int8", "int16", + "int32", "int64", "float", + "double", "string"), + testing::Values(1.0))); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/dataset/filter.cc b/cpp/src/arrow/dataset/filter.cc index bcfc8d745dc..caab28e3f7c 100644 --- a/cpp/src/arrow/dataset/filter.cc +++ b/cpp/src/arrow/dataset/filter.cc @@ -1718,7 +1718,7 @@ Result> MakeGroupings(const StructArray& by) { ARROW_ASSIGN_OR_RAISE(auto fused, StructDictionary::Encode(by.fields())); - ARROW_ASSIGN_OR_RAISE(auto sort_indices, compute::SortToIndices(*fused.indices)); + ARROW_ASSIGN_OR_RAISE(auto sort_indices, compute::SortIndices(*fused.indices)); ARROW_ASSIGN_OR_RAISE(Datum sorted, compute::Take(fused.indices, *sort_indices)); fused.indices = checked_pointer_cast(sorted.make_array()); diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 545943d30c6..969de58ea79 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -32,7 +32,7 @@ ARROW_BOOST_BUILD_VERSION=1.71.0 ARROW_BROTLI_BUILD_VERSION=v1.0.7 ARROW_BZIP2_BUILD_VERSION=1.0.8 ARROW_CARES_BUILD_VERSION=1.16.1 -ARROW_GBENCHMARK_BUILD_VERSION=v1.5.1 +ARROW_GBENCHMARK_BUILD_VERSION=v1.5.2 ARROW_GFLAGS_BUILD_VERSION=v2.2.2 ARROW_GLOG_BUILD_VERSION=v0.4.0 ARROW_GRPC_BUILD_VERSION=v1.29.1 diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index c1d3ac7e61b..9c10b95adc6 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -31,7 +31,7 @@ The generic Compute API Functions and function registry ------------------------------- -Functions represent compute operations over inputs of possibly varying +Functions represent compute operations over inputs of possibly varying types. Internally, a function is implemented by one or several "kernels", depending on the concrete input types (for example, a function adding values from two inputs can have different kernels depending on @@ -621,17 +621,21 @@ In these functions, nulls are considered greater than any other value Floating-point NaN values are considered greater than any other non-null value, but smaller than nulls. -+-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=======================+============+=========================+===================+================================+=============+ -| partition_nth_indices | Unary | Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(1) \(3) | -+-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ -| partition_nth_indices | Unary | Numeric | UInt64 | :struct:`PartitionNthOptions` | \(1) | -+-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ -| sort_indices | Unary | Binary- and String-like | UInt64 | | \(2) \(3) | -+-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ -| sort_indices | Unary | Numeric | UInt64 | | \(2) | -+-----------------------+------------+-------------------------+-------------------+--------------------------------+-------------+ ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================+===================+================================+================+ +| partition_nth_indices | Unary | Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(1) \(3) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ +| partition_nth_indices | Unary | Numeric | UInt64 | :struct:`PartitionNthOptions` | \(1) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ +| array_sort_indices | Unary | Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(2) \(3) \(4) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ +| array_sort_indices | Unary | Numeric | UInt64 | :struct:`ArraySortOptions` | \(2) \(4) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ +| sort_indices | Unary | Binary- and String-like | UInt64 | :struct:`SortOptions` | \(2) \(3) \(5) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ +| sort_indices | Unary | Numeric | UInt64 | :struct:`SortOptions` | \(2) \(5) | ++-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+ * \(1) The output is an array of indices into the input array, that define a partial non-stable sort such that the *N*'th index points to the *N*'th @@ -640,12 +644,17 @@ value, but smaller than nulls. :func:`std::nth_element`). *N* is given in :member:`PartitionNthOptions::pivot`. -* \(2) The output is an array of indices into the input array, that define - a stable sort of the input array. +* \(2) The output is an array of indices into the input, that define a + stable sort of the input. * \(3) Input values are ordered lexicographically as bytestrings (even for String arrays). +* \(4) The input must be an array. The default order is ascending. + +* \(5) The input can be an array, chunked array, record batch or + table. If the input is a record batch or table, one or more sort + keys must be specified. Structural transforms ~~~~~~~~~~~~~~~~~~~~~