From c53d9cecfec0de260b84c7c6ab3f23f532087130 Mon Sep 17 00:00:00 2001 From: Frank Du Date: Tue, 12 May 2020 06:48:39 +0000 Subject: [PATCH] [C++] Unroll aggregate dense loop for better speculative execution 1. Expand SumKernel benchmark to more types (Float, Double, Int8, Int16, Int32, Int64). 2. Unroll the dense part of the aggregate kernel so that partial sums are added speculatively in parallel. Signed-off-by: Frank Du --- .../arrow/compute/kernels/aggregate_basic.cc | 19 ++++++++++++++++- .../compute/kernels/aggregate_benchmark.cc | 21 ++++++++++++++----- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 14f9be3f93e..2f9f3d7a61e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -137,9 +137,26 @@ struct SumState { ThisType local; const auto values = array.raw_values(); const int64_t length = array.length(); - for (int64_t i = 0; i < length; i++) { + + constexpr int64_t kRoundFactor = 8; + const int64_t length_rounded = BitUtil::RoundDown(length, kRoundFactor); + typename SumType::c_type sum_rounded[kRoundFactor] = {0}; + + // Unroll the loop to add the results in parallel + for (int64_t i = 0; i < length_rounded; i += kRoundFactor) { + for (int64_t k = 0; k < kRoundFactor; k++) { + sum_rounded[k] += values[i + k]; + } + } + for (int64_t k = 0; k < kRoundFactor; k++) { + local.sum += sum_rounded[k]; + } + + // The trailing part + for (int64_t i = length_rounded; i < length; ++i) { local.sum += values[i]; } + local.count = length; return local; } diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index 46d726d4071..101284ec6b0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -305,12 +305,14 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) 
->Apply(BenchmarkSetArgs); #endif // ARROW_WITH_BENCHMARKS_REFERENCE +template static void SumKernel(benchmark::State& state) { - const int64_t array_size = state.range(0) / sizeof(int64_t); + using CType = typename TypeTraits::CType; + + const int64_t array_size = state.range(0) / sizeof(CType); const double null_percent = static_cast(state.range(1)) / 100.0; auto rand = random::RandomArrayGenerator(1923); - auto array = std::static_pointer_cast>( - rand.Int64(array_size, -100, 100, null_percent)); + auto array = rand.Numeric(array_size, -100, 100, null_percent); for (auto _ : state) { ABORT_NOT_OK(Sum(array).status()); @@ -318,10 +320,19 @@ static void SumKernel(benchmark::State& state) { state.counters["size"] = static_cast(state.range(0)); state.counters["null_percent"] = static_cast(state.range(1)); - state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); + state.SetBytesProcessed(state.iterations() * array_size * sizeof(CType)); } -BENCHMARK(SumKernel)->Apply(RegressionSetArgs); +#define SUM_KERNEL_BENCHMARK(FuncName, Type) \ + static void FuncName(benchmark::State& state) { SumKernel(state); } \ + BENCHMARK(FuncName)->Apply(RegressionSetArgs) + +SUM_KERNEL_BENCHMARK(SumKernelFloat, FloatType); +SUM_KERNEL_BENCHMARK(SumKernelDouble, DoubleType); +SUM_KERNEL_BENCHMARK(SumKernelInt8, Int8Type); +SUM_KERNEL_BENCHMARK(SumKernelInt16, Int16Type); +SUM_KERNEL_BENCHMARK(SumKernelInt32, Int32Type); +SUM_KERNEL_BENCHMARK(SumKernelInt64, Int64Type); } // namespace compute } // namespace arrow