Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_sort.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include "arrow/table.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bitmap.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/optional.h"
#include "arrow/visitor_inline.h"
Expand All @@ -42,6 +44,7 @@ namespace internal {

// Visit all physical types for which sorting is implemented.
#define VISIT_PHYSICAL_TYPES(VISIT) \
VISIT(BooleanType) \
VISIT(Int8Type) \
VISIT(Int16Type) \
VISIT(Int32Type) \
Expand Down Expand Up @@ -370,6 +373,24 @@ inline void VisitRawValuesInline(const ArrayType& values,
[&](int64_t i) { visitor_not_null(data[i]); }, [&]() { visitor_null(); });
}

template <typename VisitorNotNull, typename VisitorNull>
inline void VisitRawValuesInline(const BooleanArray& values,
VisitorNotNull&& visitor_not_null,
VisitorNull&& visitor_null) {
if (values.null_count() != 0) {
const uint8_t* data = values.data()->GetValues<uint8_t>(1, 0);
VisitBitBlocksVoid(
values.null_bitmap(), values.offset(), values.length(),
[&](int64_t i) { visitor_not_null(BitUtil::GetBit(data, values.offset() + i)); },
[&]() { visitor_null(); });
} else {
// Can avoid GetBit() overhead in the no-nulls case
VisitBitBlocksVoid(
values.data()->buffers[1], values.offset(), values.length(),
[&](int64_t i) { visitor_not_null(true); }, [&]() { visitor_not_null(false); });
}
}

template <typename ArrowType>
class ArrayCompareSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
Expand Down Expand Up @@ -477,6 +498,42 @@ class ArrayCountSorter {
}
};

using ::arrow::internal::Bitmap;

template <>
class ArrayCountSorter<BooleanType> {
public:
ArrayCountSorter() = default;

// Returns where null starts.
// `offset` is used when this is called on a chunk of a chunked array
uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
const BooleanArray& values, int64_t offset,
const ArraySortOptions& options) {
std::array<int64_t, 2> counts{0, 0};

const int64_t nulls = values.null_count();
const int64_t ones = values.true_count();
const int64_t zeros = values.length() - ones - nulls;

int64_t null_position = values.length() - nulls;
int64_t index = offset;
const auto nulls_begin = indices_begin + null_position;

if (options.order == SortOrder::Ascending) {
// ones start after zeros
counts[1] = zeros;
} else {
// zeros start after ones
counts[0] = ones;
}
VisitRawValuesInline(
values, [&](bool v) { indices_begin[counts[v]++] = index++; },
[&]() { indices_begin[null_position++] = index++; });
return nulls_begin;
}
};

// Sort integers with counting sort or comparison based sorting algorithm
// - Use O(n) counting sort if values are in a small range
// - Use O(nlogn) std::stable_sort otherwise
Expand Down Expand Up @@ -527,6 +584,11 @@ class ArrayCountOrCompareSorter {
template <typename Type, typename Enable = void>
struct ArraySorter;

template <>
struct ArraySorter<BooleanType> {
ArrayCountSorter<BooleanType> impl;
};

template <>
struct ArraySorter<UInt8Type> {
ArrayCountSorter<UInt8Type> impl;
Expand Down Expand Up @@ -576,11 +638,17 @@ struct ArraySortIndices {

// Sort indices kernels implemented for
//
// * Boolean type
// * Number types
// * Base binary types

template <template <typename...> class ExecTemplate>
void AddSortingKernels(VectorKernel base, VectorFunction* func) {
// bool type
base.signature = KernelSignature::Make({InputType::Array(boolean())}, uint64());
base.exec = ExecTemplate<UInt64Type, BooleanType>::Exec;
DCHECK_OK(func->AddKernel(base));

for (const auto& ty : NumericTypes()) {
auto physical_type = GetPhysicalType(ty);
base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ static void ArraySortIndicesInt64Wide(benchmark::State& state) {
ArraySortIndicesInt64Benchmark(state, min, max);
}

static void ArraySortIndicesBool(benchmark::State& state) {
RegressionArgs args(state);

const int64_t array_size = args.size * 8;
auto rand = random::RandomArrayGenerator(kSeed);
auto values = rand.Boolean(array_size, 0.5, args.null_proportion);

ArraySortIndicesBenchmark(state, values);
}

static void ChunkedArraySortIndicesInt64Narrow(benchmark::State& state) {
ChunkedArraySortIndicesInt64Benchmark(state, -100, 100);
}
Expand Down Expand Up @@ -235,6 +245,12 @@ BENCHMARK(ArraySortIndicesInt64Wide)
->Args({1 << 23, 100})
->Unit(benchmark::TimeUnit::kNanosecond);

BENCHMARK(ArraySortIndicesBool)
->Apply(RegressionSetArgs)
->Args({1 << 20, 100})
->Args({1 << 23, 100})
->Unit(benchmark::TimeUnit::kNanosecond);

BENCHMARK(ChunkedArraySortIndicesInt64Narrow)
->Apply(RegressionSetArgs)
->Args({1 << 20, 100})
Expand Down
77 changes: 76 additions & 1 deletion cpp/src/arrow/compute/kernels/vector_sort_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ template <typename ArrowType>
class TestNthToIndicesForIntegral : public TestNthToIndices<ArrowType> {};
TYPED_TEST_SUITE(TestNthToIndicesForIntegral, IntegralArrowTypes);

template <typename ArrowType>
class TestNthToIndicesForBool : public TestNthToIndices<ArrowType> {};
TYPED_TEST_SUITE(TestNthToIndicesForBool, ::testing::Types<BooleanType>);

template <typename ArrowType>
class TestNthToIndicesForTemporal : public TestNthToIndices<ArrowType> {};
TYPED_TEST_SUITE(TestNthToIndicesForTemporal, TemporalArrowTypes);
Expand Down Expand Up @@ -223,6 +227,13 @@ TYPED_TEST(TestNthToIndicesForIntegral, Integral) {
this->AssertNthToIndicesJson("[null, 1, 3, null, 2, 5]", 6);
}

TYPED_TEST(TestNthToIndicesForBool, Bool) {
this->AssertNthToIndicesJson("[null, false, true, null, false, true]", 0);
this->AssertNthToIndicesJson("[null, false, true, null, false, true]", 2);
this->AssertNthToIndicesJson("[null, false, true, null, false, true]", 5);
this->AssertNthToIndicesJson("[null, false, true, null, false, true]", 6);
}

TYPED_TEST(TestNthToIndicesForTemporal, Temporal) {
this->AssertNthToIndicesJson("[null, 1, 3, null, 2, 5]", 0);
this->AssertNthToIndicesJson("[null, 1, 3, null, 2, 5]", 2);
Expand Down Expand Up @@ -402,6 +413,10 @@ template <typename ArrowType>
class TestArraySortIndicesForReal : public TestArraySortIndices<ArrowType> {};
TYPED_TEST_SUITE(TestArraySortIndicesForReal, RealArrowTypes);

template <typename ArrowType>
class TestArraySortIndicesForBool : public TestArraySortIndices<ArrowType> {};
TYPED_TEST_SUITE(TestArraySortIndicesForBool, ::testing::Types<BooleanType>);

template <typename ArrowType>
class TestArraySortIndicesForIntegral : public TestArraySortIndices<ArrowType> {};
TYPED_TEST_SUITE(TestArraySortIndicesForIntegral, IntegralArrowTypes);
Expand Down Expand Up @@ -464,6 +479,26 @@ TYPED_TEST(TestArraySortIndicesForIntegral, SortIntegral) {
"[5, 2, 4, 1, 0, 3]");
}

TYPED_TEST(TestArraySortIndicesForBool, SortBool) {
this->AssertSortIndices("[]", "[]");

this->AssertSortIndices("[true, true, false]", "[2, 0, 1]");
this->AssertSortIndices("[false, false, false, true, true, true, true]",
"[0, 1, 2, 3, 4, 5, 6]");
this->AssertSortIndices("[true, true, true, true, false, false, false]",
"[4, 5, 6, 0, 1, 2, 3]");

this->AssertSortIndices("[false, true, false, true, true, false, false]",
SortOrder::Ascending, "[0, 2, 5, 6, 1, 3, 4]");
this->AssertSortIndices("[false, true, false, true, true, false, false]",
SortOrder::Descending, "[1, 3, 4, 0, 2, 5, 6]");

this->AssertSortIndices("[null, true, false, null, false, true]", SortOrder::Ascending,
"[2, 4, 1, 5, 0, 3]");
this->AssertSortIndices("[null, true, false, null, false, true]", SortOrder::Descending,
"[1, 5, 2, 4, 0, 3]");
}

TYPED_TEST(TestArraySortIndicesForTemporal, SortTemporal) {
this->AssertSortIndices("[]", "[]");

Expand Down Expand Up @@ -546,7 +581,7 @@ class TestArraySortIndicesRandomCompare : public TestBase {};
using SortIndicesableTypes =
::testing::Types<UInt8Type, UInt16Type, UInt32Type, UInt64Type, Int8Type, Int16Type,
Int32Type, Int64Type, FloatType, DoubleType, StringType,
Decimal128Type>;
Decimal128Type, BooleanType>;

template <typename ArrayType>
void ValidateSorted(const ArrayType& array, UInt64Array& offsets, SortOrder order) {
Expand Down Expand Up @@ -842,6 +877,27 @@ TEST_F(TestRecordBatchSortIndices, NaNAndNull) {
AssertSortIndices(batch, options, "[7, 1, 2, 6, 5, 4, 0, 3]");
}

TEST_F(TestRecordBatchSortIndices, Boolean) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's worth testing the non-null case separately. This will make less test code to maintain.

auto schema = ::arrow::schema({
{field("a", boolean())},
{field("b", boolean())},
});
SortOptions options(
{SortKey("a", SortOrder::Ascending), SortKey("b", SortOrder::Descending)});

auto batch = RecordBatchFromJSON(schema,
R"([{"a": true, "b": null},
{"a": false, "b": null},
{"a": true, "b": true},
{"a": false, "b": true},
{"a": true, "b": false},
{"a": null, "b": false},
{"a": false, "b": null},
{"a": null, "b": true}
])");
AssertSortIndices(batch, options, "[3, 1, 6, 2, 4, 0, 7, 5]");
}

TEST_F(TestRecordBatchSortIndices, MoreTypes) {
auto schema = ::arrow::schema({
{field("a", timestamp(TimeUnit::MICRO))},
Expand Down Expand Up @@ -980,6 +1036,25 @@ TEST_F(TestTableSortIndices, NaNAndNull) {
AssertSortIndices(table, options, "[7, 1, 2, 6, 5, 4, 0, 3]");
}

TEST_F(TestTableSortIndices, Boolean) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

auto schema = ::arrow::schema({
{field("a", boolean())},
{field("b", boolean())},
});
SortOptions options(
{SortKey("a", SortOrder::Ascending), SortKey("b", SortOrder::Descending)});
auto table = TableFromJSON(schema, {R"([{"a": true, "b": null},
{"a": false, "b": null},
{"a": true, "b": true},
{"a": false, "b": true}])",
R"([{"a": true, "b": false},
{"a": null, "b": false},
{"a": false, "b": null},
{"a": null, "b": true}
])"});
AssertSortIndices(table, options, "[3, 1, 6, 2, 4, 0, 7, 5]");
}

TEST_F(TestTableSortIndices, BinaryLike) {
auto schema = ::arrow::schema({
{field("a", large_utf8())},
Expand Down
30 changes: 15 additions & 15 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1163,21 +1163,21 @@ In these functions, nulls are considered greater than any other value
Floating-point NaN values are considered greater than any other non-null
value, but smaller than nulls.

+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+=======================+============+=========================+===================+================================+================+
| partition_nth_indices | Unary | Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(1) \(3) |
+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
| partition_nth_indices | Unary | Numeric | UInt64 | :struct:`PartitionNthOptions` | \(1) |
+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
| array_sort_indices | Unary | Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(2) \(3) \(4) |
+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
| array_sort_indices | Unary | Numeric | UInt64 | :struct:`ArraySortOptions` | \(2) \(4) |
+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
| sort_indices | Unary | Binary- and String-like | UInt64 | :struct:`SortOptions` | \(2) \(3) \(5) |
+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
| sort_indices | Unary | Numeric | UInt64 | :struct:`SortOptions` | \(2) \(5) |
+-----------------------+------------+-------------------------+-------------------+--------------------------------+----------------+
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+=======================+============+=============================+===================+================================+================+
| partition_nth_indices | Unary | Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(1) \(3) |
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+
| partition_nth_indices | Unary | Boolean, Numeric, Temporal | UInt64 | :struct:`PartitionNthOptions` | \(1) |
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+
| array_sort_indices | Unary | Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(2) \(3) \(4) |
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+
| array_sort_indices | Unary | Boolean, Numeric, Temporal | UInt64 | :struct:`ArraySortOptions` | \(2) \(4) |
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+
| sort_indices | Unary | Binary- and String-like | UInt64 | :struct:`SortOptions` | \(2) \(3) \(5) |
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+
| sort_indices | Unary | Boolean, Numeric, Temporal | UInt64 | :struct:`SortOptions` | \(2) \(5) |
+-----------------------+------------+-----------------------------+-------------------+--------------------------------+----------------+

* \(1) The output is an array of indices into the input array, that define
a partial non-stable sort such that the *N*'th index points to the *N*'th
Expand Down
2 changes: 1 addition & 1 deletion r/tests/testthat/helper-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ example_data_for_sorting <- tibble::tibble(
int = c(-.Machine$integer.max, -101L, -100L, 0L, 0L, 1L, 100L, 1000L, .Machine$integer.max, NA_integer_),
dbl = c(-Inf, -.Machine$double.xmax, -.Machine$double.xmin, 0, .Machine$double.xmin, pi, .Machine$double.xmax, Inf, NaN, NA_real_),
chr = c("", "", "\"", "&", "ABC", "NULL", "a", "abc", "zzz", NA_character_),
lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA), # bool is not supported (ARROW-12016)
lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA),
dttm = lubridate::ymd_hms(c(
"0000-01-01 00:00:00",
"1919-05-29 13:08:55",
Expand Down
1 change: 0 additions & 1 deletion r/tests/testthat/test-dplyr-arrange.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@ test_that("arrange() on datetime columns", {
})

test_that("arrange() on logical columns", {
skip("Sorting by bool columns is not supported (ARROW-12016)")
expect_dplyr_equal(
input %>%
arrange(lgl, int) %>%
Expand Down