From f84ce882d152308c2c1f4d4ac054a46c443fe113 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 20 Jan 2021 18:41:42 -0600 Subject: [PATCH 1/6] Also access memory addresses --- cpp/src/arrow/compute/CMakeLists.txt | 1 + cpp/src/arrow/compute/internals_benchmark.cc | 86 ++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 cpp/src/arrow/compute/internals_benchmark.cc diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 897dc32f357..da3b2b46b2c 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -66,6 +66,7 @@ add_arrow_compute_test(internals_test registry_test.cc) add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(internals_benchmark PREFIX "arrow-compute") add_subdirectory(kernels) diff --git a/cpp/src/arrow/compute/internals_benchmark.cc b/cpp/src/arrow/compute/internals_benchmark.cc new file mode 100644 index 00000000000..dc93c20ccd7 --- /dev/null +++ b/cpp/src/arrow/compute/internals_benchmark.cc @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include "arrow/compute/exec_internal.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { +namespace compute { +namespace detail { + +constexpr int32_t kSeed = 0xfede4a7e; + +void BM_ExecBatchIterator(benchmark::State& state) { + // Measure overhead related to deconstructing vector into a sequence of ExecBatch + random::RandomArrayGenerator rag(kSeed); + + const int64_t length = 1 << 20; + const int num_fields = 10; + + std::vector args(num_fields); + for (int i = 0; i < num_fields; ++i) { + args[i] = rag.Int64(length, 0, 100)->data(); + } + + for (auto _ : state) { + std::unique_ptr it = + *ExecBatchIterator::Make(args, state.range(0)); + ExecBatch batch; + while (it->Next(&batch)) { + for (int i = 0; i < num_fields; ++i) { + auto data = batch.values[i].array()->buffers[1]->data(); + benchmark::DoNotOptimize(data); + } + continue; + } + benchmark::DoNotOptimize(batch); + } + + state.SetItemsProcessed(state.iterations()); +} + +void BM_DatumSlice(benchmark::State& state) { + // Measure overhead related to deconstructing vector into a sequence of ExecBatch + random::RandomArrayGenerator rag(kSeed); + + const int64_t length = 1000; + + int num_datums = 1000; + std::vector datums(num_datums); + for (int i = 0; i < num_datums; ++i) { + datums[i] = rag.Int64(length, 0, 100)->data(); + } + + for (auto _ : state) { + for (const Datum& datum : datums) { + auto slice = datum.array()->Slice(16, 64); + benchmark::DoNotOptimize(slice); + } + } + state.SetItemsProcessed(state.iterations() * num_datums); +} + +BENCHMARK(BM_DatumSlice); +BENCHMARK(BM_ExecBatchIterator)->RangeMultiplier(2)->Range(256, 32768); + +} // namespace detail +} // namespace compute +} // namespace arrow From 81c9be225e2108541ea18a7e7079cd94aef0db5d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 1 Mar 2021 11:19:40 -0600 Subject: [PATCH 2/6] Put benchmark in function_benchmark.cc --- cpp/src/arrow/compute/CMakeLists.txt | 1 - cpp/src/arrow/compute/function_benchmark.cc | 30 +++++++ cpp/src/arrow/compute/internals_benchmark.cc | 86 -------------------- 3 files changed, 30 insertions(+), 87 deletions(-) delete mode 100644 cpp/src/arrow/compute/internals_benchmark.cc diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index da3b2b46b2c..897dc32f357 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -66,7 +66,6 @@ add_arrow_compute_test(internals_test registry_test.cc) add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(internals_benchmark PREFIX "arrow-compute") add_subdirectory(kernels) diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index daf03754984..be8b5fce394 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -174,11 +174,41 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * N); } +void BM_ExecBatchIterator(benchmark::State& state) { + // Measure overhead related to deconstructing vector into a sequence of ExecBatch + random::RandomArrayGenerator rag(kSeed); + + const int64_t length = 1 << 20; + const int num_fields = 10; + + std::vector args(num_fields); + for (int i = 0; i < num_fields; ++i) { + args[i] = rag.Int64(length, 0, 100)->data(); + } + + for (auto _ : state) { + std::unique_ptr it = + *ExecBatchIterator::Make(args, state.range(0)); + ExecBatch batch; + while (it->Next(&batch)) { + for (int i = 0; i < num_fields; ++i) { + auto data = batch.values[i].array()->buffers[1]->data(); + benchmark::DoNotOptimize(data); + } + continue; + } + benchmark::DoNotOptimize(batch); + } + + state.SetItemsProcessed(state.iterations()); +} + BENCHMARK(BM_CastDispatch); BENCHMARK(BM_CastDispatchBaseline); BENCHMARK(BM_AddDispatch); BENCHMARK(BM_ExecuteScalarFunctionOnScalar); BENCHMARK(BM_ExecuteScalarKernelOnScalar); +BENCHMARK(BM_ExecBatchIterator)->RangeMultiplier(2)->Range(256, 32768); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/internals_benchmark.cc b/cpp/src/arrow/compute/internals_benchmark.cc deleted file mode 100644 index dc93c20ccd7..00000000000 --- a/cpp/src/arrow/compute/internals_benchmark.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "benchmark/benchmark.h" - -#include "arrow/compute/exec_internal.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/random.h" -#include "arrow/util/benchmark_util.h" - -namespace arrow { -namespace compute { -namespace detail { - -constexpr int32_t kSeed = 0xfede4a7e; - -void BM_ExecBatchIterator(benchmark::State& state) { - // Measure overhead related to deconstructing vector into a sequence of ExecBatch - random::RandomArrayGenerator rag(kSeed); - - const int64_t length = 1 << 20; - const int num_fields = 10; - - std::vector args(num_fields); - for (int i = 0; i < num_fields; ++i) { - args[i] = rag.Int64(length, 0, 100)->data(); - } - - for (auto _ : state) { - std::unique_ptr it = - *ExecBatchIterator::Make(args, state.range(0)); - ExecBatch batch; - while (it->Next(&batch)) { - for (int i = 0; i < num_fields; ++i) { - auto data = batch.values[i].array()->buffers[1]->data(); - benchmark::DoNotOptimize(data); - } - continue; - } - benchmark::DoNotOptimize(batch); - } - - state.SetItemsProcessed(state.iterations()); -} - -void BM_DatumSlice(benchmark::State& state) { - // Measure overhead related to deconstructing vector into a sequence of ExecBatch - random::RandomArrayGenerator rag(kSeed); - - const int64_t length = 1000; - - int num_datums = 1000; - std::vector datums(num_datums); - for (int i = 0; i < num_datums; ++i) { - datums[i] = rag.Int64(length, 0, 100)->data(); - } - - for (auto _ : state) { - for (const Datum& datum : datums) { - auto slice = datum.array()->Slice(16, 64); - benchmark::DoNotOptimize(slice); - } - } - state.SetItemsProcessed(state.iterations() * num_datums); -} - -BENCHMARK(BM_DatumSlice); -BENCHMARK(BM_ExecBatchIterator)->RangeMultiplier(2)->Range(256, 32768); - -} // namespace detail -} // namespace compute -} // namespace arrow From 8c890fbf0416acf9c8aef529c03140bde880a98c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 27 Jul 2021 19:28:34 -0500 Subject: [PATCH 3/6] A few more fields to make effect more pronounced --- cpp/src/arrow/compute/function_benchmark.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index be8b5fce394..80c5b7dfbc0 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -19,6 +19,7 @@ #include "arrow/array/array_base.h" #include "arrow/compute/api.h" +#include "arrow/compute/exec_internal.h" #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/testing/gtest_util.h" @@ -179,16 +180,17 @@ void BM_ExecBatchIterator(benchmark::State& state) { random::RandomArrayGenerator rag(kSeed); const int64_t length = 1 << 20; - const int num_fields = 10; + const int num_fields = 32; std::vector args(num_fields); for (int i = 0; i < num_fields; ++i) { args[i] = rag.Int64(length, 0, 100)->data(); } + const int64_t blocksize = state.range(0); for (auto _ : state) { - std::unique_ptr it = - *ExecBatchIterator::Make(args, state.range(0)); + std::unique_ptr it = + *detail::ExecBatchIterator::Make(args, blocksize); ExecBatch batch; while (it->Next(&batch)) { for (int i = 0; i < num_fields; ++i) { From 80d62873f3237307ba1a96ec705ed683f7947204 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 28 Jul 2021 20:56:17 -0500 Subject: [PATCH 4/6] lint --- cpp/src/arrow/compute/function_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index 80c5b7dfbc0..a557e4864b6 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -190,7 +190,7 @@ void BM_ExecBatchIterator(benchmark::State& state) { const int64_t blocksize = state.range(0); for (auto _ : state) { std::unique_ptr it = - *detail::ExecBatchIterator::Make(args, blocksize); + *detail::ExecBatchIterator::Make(args, blocksize); ExecBatch batch; while (it->Next(&batch)) { for (int i = 0; i < num_fields; ++i) { From e87518d45c925188ab8e63d26196ab4b6b3e4152 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 29 Jul 2021 12:36:25 -0500 Subject: [PATCH 5/6] Address comments --- cpp/src/arrow/compute/function_benchmark.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index a557e4864b6..c95321e9b44 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -176,7 +176,8 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { } void BM_ExecBatchIterator(benchmark::State& state) { - // Measure overhead related to deconstructing vector into a sequence of ExecBatch + // Measure overhead related to splitting ExecBatch into smaller ExecBatches + // for parallelism or more optimal CPU cache affinity random::RandomArrayGenerator rag(kSeed); const int64_t length = 1 << 20; @@ -197,11 +198,12 @@ void BM_ExecBatchIterator(benchmark::State& state) { auto data = batch.values[i].array()->buffers[1]->data(); benchmark::DoNotOptimize(data); } - continue; } benchmark::DoNotOptimize(batch); } - + // Provides comparability across blocksizes by looking at the iterations per + // second. So 1000 iterations/second means that input splitting associated + // with ExecBatchIterator takes up 1ms every time. state.SetItemsProcessed(state.iterations()); } From a57cf979e6b5bbebc8725e649ff4498a9701c4c2 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 2 Aug 2021 16:39:07 +0200 Subject: [PATCH 6/6] Refine input parameters --- cpp/src/arrow/compute/function_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index c95321e9b44..a29a766be79 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -212,7 +212,7 @@ BENCHMARK(BM_CastDispatchBaseline); BENCHMARK(BM_AddDispatch); BENCHMARK(BM_ExecuteScalarFunctionOnScalar); BENCHMARK(BM_ExecuteScalarKernelOnScalar); -BENCHMARK(BM_ExecBatchIterator)->RangeMultiplier(2)->Range(256, 32768); +BENCHMARK(BM_ExecBatchIterator)->RangeMultiplier(4)->Range(1024, 64 * 1024); } // namespace compute } // namespace arrow