Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
85bed4d
ARROW-8199: [C++] Add support for multi-column sort indices on Table
kou Nov 6, 2020
50dd97f
Remove unused variables
kou Nov 9, 2020
121b51b
Remove unused variables
kou Nov 9, 2020
e4e5d68
Remove unused assignments
kou Nov 9, 2020
b5d234f
Use auto
kou Nov 9, 2020
8c9c7c0
Add missing Status check
kou Nov 9, 2020
96d0e52
Use random::SeedType explicitly
kou Nov 9, 2020
7b4b7ae
Don't use >
kou Nov 9, 2020
67a3d40
Don't use max
kou Nov 9, 2020
3bce727
Add more tests
kou Nov 9, 2020
9289fdc
Adjust benchmark parameters
kou Nov 11, 2020
3285131
Add support for NaN
kou Nov 11, 2020
4a76c39
Update document
kou Nov 11, 2020
efa3ee6
Use enum class
kou Nov 17, 2020
2c4d236
Use Capital for enum name to avoid conflict
kou Nov 17, 2020
0d097db
Fix wrong description
kou Nov 17, 2020
493313a
Remove SortIndices(Array, ExecContext)
kou Nov 17, 2020
628e3ec
Add more description about null and order
kou Nov 17, 2020
629e291
Use std::pair to return two value
kou Nov 17, 2020
ff81915
Fix type
kou Nov 17, 2020
8170331
Add a comment to Compare()
kou Nov 17, 2020
ac0f654
Add more benchmark patterns
kou Nov 18, 2020
f45664e
Add chunked array sorter and radix sort based table sorter
kou Nov 19, 2020
3631766
Format JSON
kou Nov 19, 2020
eca3902
Add more comments
kou Nov 19, 2020
af9c880
Don't use shared_ptr
kou Nov 19, 2020
aa368eb
Format
kou Nov 19, 2020
8deb74c
Use benchmark 1.5.2 for ArgsProduct()
kou Nov 19, 2020
e9c03d2
Remove benchmark version check with conda
kou Nov 24, 2020
f84f748
Add missing override
kou Nov 24, 2020
a7eb62f
Fix format
kou Nov 24, 2020
e9b481c
Don't use unique_ptr
kou Nov 24, 2020
bbda240
Fix format
kou Nov 24, 2020
e3ed89a
Make tests faster
pitrou Nov 24, 2020
5ca9e8d
Fix typos in doc
pitrou Nov 24, 2020
fecd557
More small changes
pitrou Nov 24, 2020
8e5026a
Remove unused variable
pitrou Nov 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions c_glib/arrow-glib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2177,29 +2177,52 @@ garrow_array_is_in_chunked_array(GArrowArray *left,
}

/**
* garrow_array_sort_to_indices:
* garrow_array_sort_indices:
* @array: A #GArrowArray.
* @order: The order for sort.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: (nullable) (transfer full): The indices that would sort
* an array on success, %NULL on error.
* an array in the specified order on success, %NULL on error.
*
* Since: 0.15.0
* Since: 3.0.0
*/
GArrowUInt64Array *
garrow_array_sort_to_indices(GArrowArray *array,
GError **error)
garrow_array_sort_indices(GArrowArray *array,
GArrowSortOrder order,
GError **error)
{
auto arrow_array = garrow_array_get_raw(array);
auto arrow_array_raw = arrow_array.get();
auto arrow_indices_array = arrow::compute::SortToIndices(*arrow_array_raw);
if (garrow::check(error, arrow_indices_array, "[array][sort-to-indices]")) {
auto arrow_order = static_cast<arrow::compute::SortOrder>(order);
auto arrow_indices_array =
arrow::compute::SortIndices(*arrow_array_raw, arrow_order);
if (garrow::check(error, arrow_indices_array, "[array][sort-indices]")) {
return GARROW_UINT64_ARRAY(garrow_array_new_raw(&(*arrow_indices_array)));
} else {
return NULL;
}
}

/**
 * garrow_array_sort_to_indices:
 * @array: A #GArrowArray.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Legacy entry point that forwards to garrow_array_sort_indices()
 * with %GARROW_SORT_ORDER_ASCENDING.
 *
 * Returns: (nullable) (transfer full): The indices that would sort
 *   an array in ascending order on success, %NULL on error.
 *
 * Since: 0.15.0
 *
 * Deprecated: 3.0.0: Use garrow_array_sort_indices() instead.
 */
GArrowUInt64Array *
garrow_array_sort_to_indices(GArrowArray *array,
                             GError **error)
{
  /* The historical behavior of this function is an ascending sort. */
  const GArrowSortOrder legacy_order = GARROW_SORT_ORDER_ASCENDING;
  return garrow_array_sort_indices(array, legacy_order, error);
}

/**
* garrow_table_filter:
* @table: A #GArrowTable.
Expand Down
24 changes: 23 additions & 1 deletion c_glib/arrow-glib/compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ GArrowCastOptions *garrow_cast_options_new(void);
* @GARROW_COUNT_ALL: Count all non-null values.
* @GARROW_COUNT_NULL: Count all null values.
*
* They are corresponding to `arrow::compute::CountOptions::mode` values.
* They correspond to `arrow::compute::CountOptions::Mode` values.
*/
typedef enum {
GARROW_COUNT_ALL,
Expand Down Expand Up @@ -377,10 +377,32 @@ GArrowBooleanArray *
garrow_array_is_in_chunked_array(GArrowArray *left,
GArrowChunkedArray *right,
GError **error);

/**
 * GArrowSortOrder:
 * @GARROW_SORT_ORDER_ASCENDING: Sort in ascending order.
 * @GARROW_SORT_ORDER_DESCENDING: Sort in descending order.
 *
 * They correspond to `arrow::compute::SortOrder` values.
 *
 * Since: 3.0.0
 */
typedef enum {
  GARROW_SORT_ORDER_ASCENDING = 0,
  GARROW_SORT_ORDER_DESCENDING = 1,
} GArrowSortOrder;

GARROW_AVAILABLE_IN_3_0
GArrowUInt64Array *
garrow_array_sort_indices(GArrowArray *array,
GArrowSortOrder order,
GError **error);
GARROW_DEPRECATED_IN_3_0_FOR(garrow_array_sort_indices)
GARROW_AVAILABLE_IN_0_15
GArrowUInt64Array *
garrow_array_sort_to_indices(GArrowArray *array,
GError **error);

GARROW_AVAILABLE_IN_0_16
GArrowTable *
garrow_table_filter(GArrowTable *table,
Expand Down
23 changes: 23 additions & 0 deletions c_glib/arrow-glib/version.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,15 @@
# define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor)
#endif

/**
* GARROW_VERSION_3_0:
*
* You can use this macro value for compile time API version check.
*
* Since: 3.0.0
*/
#define GARROW_VERSION_3_0 G_ENCODE_VERSION(3, 0)

/**
* GARROW_VERSION_2_0:
*
Expand Down Expand Up @@ -229,6 +238,20 @@

#define GARROW_AVAILABLE_IN_ALL

/* Deprecation markers for API deprecated in 3.0: they expand to the
 * GARROW_DEPRECATED variants only when GARROW_VERSION_MIN_REQUIRED is
 * at least GARROW_VERSION_3_0, and to nothing otherwise. */
#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_3_0
# define GARROW_DEPRECATED_IN_3_0 GARROW_DEPRECATED
# define GARROW_DEPRECATED_IN_3_0_FOR(function) GARROW_DEPRECATED_FOR(function)
#else
# define GARROW_DEPRECATED_IN_3_0
# define GARROW_DEPRECATED_IN_3_0_FOR(function)
#endif

/* Availability marker for API added in 3.0: it expands to
 * GARROW_UNAVAILABLE(3, 0) when GARROW_VERSION_MAX_ALLOWED is older
 * than GARROW_VERSION_3_0, and to nothing otherwise. */
#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_3_0
# define GARROW_AVAILABLE_IN_3_0 GARROW_UNAVAILABLE(3, 0)
#else
# define GARROW_AVAILABLE_IN_3_0
#endif

#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_2_0
# define GARROW_DEPRECATED_IN_2_0 GARROW_DEPRECATED
# define GARROW_DEPRECATED_IN_2_0_FOR(function) GARROW_DEPRECATED_FOR(function)
Expand Down
4 changes: 4 additions & 0 deletions c_glib/doc/arrow-glib/arrow-glib-docs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@
<title>Index of deprecated API</title>
<xi:include href="xml/api-index-deprecated.xml"><xi:fallback /></xi:include>
</index>
<index id="api-index-3-0-0" role="3.0.0">
<title>Index of new symbols in 3.0.0</title>
<xi:include href="xml/api-index-3.0.0.xml"><xi:fallback /></xi:include>
</index>
<index id="api-index-2-0-0" role="2.0.0">
<title>Index of new symbols in 2.0.0</title>
<xi:include href="xml/api-index-2.0.0.xml"><xi:fallback /></xi:include>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,34 @@
# specific language governing permissions and limitations
# under the License.

class TestSortToIndices < Test::Unit::TestCase
class TestSortIndices < Test::Unit::TestCase
include Helper::Buildable

sub_test_case("Integer") do
def test_no_null
array = build_int16_array([1, 0, 4, -3])
assert_equal(build_uint64_array([3, 1, 0, 2]),
array.sort_to_indices)
array.sort_indices(:ascending))
end

def test_null
array = build_int16_array([nil, 1, 0, nil, 4, 3])
assert_equal(build_uint64_array([2, 1, 5, 4, 0, 3]),
array.sort_to_indices)
array.sort_indices(:ascending))
end
end

sub_test_case("String") do
def test_no_null
array = build_string_array(["hello", "world", "a", "z"])
assert_equal(build_uint64_array([2, 0, 1, 3]),
array.sort_to_indices)
array.sort_indices(:ascending))
end

def test_null
array = build_string_array([nil, "b", "a", nil, "c", "d"])
assert_equal(build_uint64_array([2, 1, 4, 5, 0, 3]),
array.sort_to_indices)
array.sort_indices(:ascending))
end
end
end
2 changes: 1 addition & 1 deletion ci/conda_env_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# under the License.

aws-sdk-cpp
benchmark=1.4.1
benchmark=1.5.2
boost-cpp>=1.68.0
brotli
bzip2
Expand Down
20 changes: 17 additions & 3 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,9 @@ macro(resolve_dependency DEPENDENCY_NAME)

if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO")
if(ARG_REQUIRED_VERSION)
find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION} MODULE)
find_package(${DEPENDENCY_NAME} ${ARG_REQUIRED_VERSION})
else()
find_package(${DEPENDENCY_NAME} MODULE)
find_package(${DEPENDENCY_NAME})
endif()
if(${${DEPENDENCY_NAME}_FOUND})
set(${DEPENDENCY_NAME}_SOURCE "SYSTEM")
Expand Down Expand Up @@ -1797,7 +1797,21 @@ macro(build_benchmark)
endmacro()

if(ARROW_BUILD_BENCHMARKS)
resolve_dependency(benchmark)
# ArgsProduct() is available since 1.5.2
set(BENCHMARK_REQUIRED_VERSION 1.5.2)
if("${ARROW_DEPENDENCY_SOURCE}" STREQUAL "CONDA"
AND "${benchmark_SOURCE}" STREQUAL "SYSTEM")
# TODO: Remove this workaround once
# https://github.com/google/benchmark/issues/1046 is resolved.
#
# benchmark doesn't set a suitable version when it is built from a
# released archive, so the benchmark package on conda-forge doesn't
# report the real version. We accept any benchmark package with
# conda. Conda users should install benchmark 1.5.2 or later by
# using ci/conda_env_cpp.yml.
set(BENCHMARK_REQUIRED_VERSION 0.0.0)
endif()
resolve_dependency(benchmark REQUIRED_VERSION ${BENCHMARK_REQUIRED_VERSION})
# TODO: Don't use global includes but rather target_include_directories
get_target_property(BENCHMARK_INCLUDE_DIR benchmark::benchmark
INTERFACE_INCLUDE_DIRECTORIES)
Expand Down
26 changes: 24 additions & 2 deletions cpp/src/arrow/compute/api_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,26 @@ Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
return result.make_array();
}

Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("sort_indices", {Datum(values)}, ctx));
// Array overload: wraps `order` in ArraySortOptions, calls the
// "array_sort_indices" function and converts the resulting Datum to an Array.
Result<std::shared_ptr<Array>> SortIndices(const Array& values, SortOrder order,
                                           ExecContext* ctx) {
  ArraySortOptions sort_options(order);
  const Datum input(values);
  ARROW_ASSIGN_OR_RAISE(
      Datum indices, CallFunction("array_sort_indices", {input}, &sort_options, ctx));
  return indices.make_array();
}

// ChunkedArray overload: builds a SortOptions holding a single SortKey and
// calls the "sort_indices" function.
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
                                           SortOrder order, ExecContext* ctx) {
  // NOTE(review): the key name "not-used" looks like a placeholder, presumably
  // ignored for chunked-array input -- confirm against the kernel.
  const SortKey placeholder_key("not-used", order);
  SortOptions sort_options({placeholder_key});
  const Datum input(chunked_array);
  ARROW_ASSIGN_OR_RAISE(Datum indices,
                        CallFunction("sort_indices", {input}, &sort_options, ctx));
  return indices.make_array();
}

// Table overload: forwards the caller's SortOptions to the "sort_indices"
// function and converts the resulting Datum to an Array.
Result<std::shared_ptr<Array>> SortIndices(const Table& table, const SortOptions& options,
                                           ExecContext* ctx) {
  const Datum input(table);
  ARROW_ASSIGN_OR_RAISE(Datum indices,
                        CallFunction("sort_indices", {input}, &options, ctx));
  return indices.make_array();
}

Expand Down Expand Up @@ -135,5 +153,9 @@ Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indi
return result.table();
}

// Deprecated spelling: forwards to SortIndices() with an ascending order.
Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
  const SortOrder legacy_order = SortOrder::Ascending;
  return SortIndices(values, legacy_order, ctx);
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This hunk is for SortToIndices() -> SortIndices() rename.

} // namespace compute
} // namespace arrow
97 changes: 90 additions & 7 deletions cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,34 @@ struct ARROW_EXPORT TakeOptions : public FunctionOptions {
static TakeOptions Defaults() { return BoundsCheck(); }
};

/// \brief Direction used when ordering values in the sort functions.
enum class SortOrder {
  /// Sort in ascending order.
  Ascending = 0,
  /// Sort in descending order.
  Descending = 1,
};

/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
struct ARROW_EXPORT SortKey {
explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
: name(name), order(order) {}

/// The name of the sort column.
std::string name;
/// How to order by this sort key.
SortOrder order;
};

/// \brief Function options carrying only a sort direction.
struct ARROW_EXPORT ArraySortOptions : public FunctionOptions {
  /// Build options with the given direction (ascending by default).
  explicit ArraySortOptions(SortOrder order = SortOrder::Ascending) : order{order} {}

  /// Direction of the sort.
  SortOrder order;
};

struct ARROW_EXPORT SortOptions : public FunctionOptions {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps name this "TableSortOptions"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to keep this because this is used by all inputs (Array, ChunkedArray, RecordBatch and Table) with the sort_indices() compute kernel.
So I think that no Table prefix is suitable.

explicit SortOptions(std::vector<SortKey> sort_keys = {}) : sort_keys(sort_keys) {}

std::vector<SortKey> sort_keys;
};

/// \brief Partitioning options for NthToIndices
struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {}
Expand Down Expand Up @@ -152,21 +180,71 @@ ARROW_EXPORT
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
ExecContext* ctx = NULLPTR);

/// \brief Returns the indices that would sort an array.
/// \brief Returns the indices that would sort an array in the
/// specified order.
///
/// Perform an indirect sort of array. The output array will contain
/// indices that would sort an array, which would be the same length
/// as input. Nulls will be stably partitioned to the end of the output.
/// as input. Nulls will be stably partitioned to the end of the output
/// regardless of order.
///
/// For example given values = [null, 1, 3.3, null, 2, 5.3], the output
/// will be [1, 4, 2, 5, 0, 3]
/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
/// = SortOrder::Descending, the output will be [5, 2, 4, 1, 0,
/// 3].
///
/// \param[in] values array to sort
/// \param[in] array array to sort
/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
ExecContext* ctx = NULLPTR);
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
SortOrder order = SortOrder::Ascending,
ExecContext* ctx = NULLPTR);

/// \brief Returns the indices that would sort a chunked array in the
/// specified order.
///
/// Perform an indirect sort of chunked array. The output array will
/// contain indices that would sort a chunked array, which would be
/// the same length as input. Nulls will be stably partitioned to the
/// end of the output regardless of order.
///
/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
/// 5.3]] and order = SortOrder::Descending, the output will be [5, 2,
/// 4, 1, 0, 3].
///
/// \param[in] chunked_array chunked array to sort
/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort a chunked array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
SortOrder order = SortOrder::Ascending,
ExecContext* ctx = NULLPTR);

/// \brief Returns the indices that would sort a table in the
/// specified order.
///
/// Perform an indirect sort of table. The output array will contain
/// indices that would sort a table, which would be the same length as
/// input. Nulls will be stably partitioned to the end of the output
/// regardless of order.
///
/// For example given table = {
/// "column1": [[null, 1], [ 3, null, 2, 1]],
/// "column2": [[ 5], [3, null, null, 5, 5]],
/// } and options = {
/// {"column1", SortOrder::Ascending},
/// {"column2", SortOrder::Descending},
/// }, the output will be [5, 1, 4, 2, 0, 3].
///
/// \param[in] table table to sort
/// \param[in] options options
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort a table
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const Table& table, const SortOptions& options,
ExecContext* ctx = NULLPTR);

/// \brief Compute unique elements from an array-like object
///
Expand Down Expand Up @@ -254,5 +332,10 @@ Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indi
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* context = NULLPTR);

ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
ExecContext* ctx = NULLPTR);

} // namespace compute
} // namespace arrow
Loading