From 16d5f2ab676f7f07f996cf73d4b3811b57f4a164 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Wed, 20 Sep 2023 15:52:05 +0200 Subject: [PATCH 01/14] Add support for other multi-threading APIs Support the benchmarking of code, which relies on other multi-threading APIs, e.g. OpenMP. --- docs/user_guide.md | 58 ++++++++++++ include/benchmark/benchmark.h | 29 +++++- src/benchmark.cc | 37 +++++++- src/benchmark_api_internal.cc | 18 +++- src/benchmark_api_internal.h | 8 ++ src/benchmark_register.cc | 1 + src/benchmark_runner.cc | 51 +++++++--- src/benchmark_runner.h | 1 + src/thread_manager.h | 1 + test/CMakeLists.txt | 3 + test/manual_threading_test.cc | 169 ++++++++++++++++++++++++++++++++++ 11 files changed, 357 insertions(+), 19 deletions(-) create mode 100644 test/manual_threading_test.cc diff --git a/docs/user_guide.md b/docs/user_guide.md index 2ceb13eb59..b2dafe1092 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -830,6 +830,64 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); Without `UseRealTime`, CPU time is used by default. +### Manual Multithreaded Benchmarks + +Google/benchmark uses `std::thread` as its multithreading environment by default. +If you want to use another multithreading environment (e.g. OpenMP), you can +turn off the automatic creation of threads using the `ManualThreading` function. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code here. + for (auto _ : state) { +#pragma omp parallel num_threads(state.threads()) + // Run the multithreaded test. + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +The above example creates a parallel region in each iteration. +This includes the setup and teardown of the parallel region in the time measurement, and it +adds an implicit barrier at the end of each iteration. +You can avoid these effects if you run the whole loop in parallel. 
+In that case you must not use the `state` object directly; instead, create a `ThreadState` object in each thread. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel num_threads(state.threads()) + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +If you use the `ThreadState` object and explicitly specify the number of threads, then you must +use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads. +However, if you use `ThreadState` without explicitly specifying the number of threads, +then the number of threads is determined by the number of created `ThreadState` objects. +Specifying `ManualThreading` is optional in this case. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded); // measures with omp_get_max_threads() threads. 
+``` + ## CPU Timers diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index e3857e717f..3207ed3f6b 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -930,6 +930,9 @@ class BENCHMARK_EXPORT State { return max_iterations - total_iterations_ + batch_leftover_; } + BENCHMARK_ALWAYS_INLINE + int GetNumThreadStates() const { return num_thread_states_; } + BENCHMARK_ALWAYS_INLINE std::string name() const { return name_; } @@ -976,12 +979,29 @@ class BENCHMARK_EXPORT State { const std::string name_; const int thread_index_; const int threads_; + int num_thread_states_; - internal::ThreadTimer* const timer_; internal::ThreadManager* const manager_; - internal::PerfCountersMeasurement* const perf_counters_measurement_; friend class internal::BenchmarkInstance; + + protected: + void MergeThreadStateToParent(State& parent) const; + bool started() const { return started_; } + + internal::ThreadTimer* timer_; + internal::PerfCountersMeasurement* perf_counters_measurement_; +}; + +// ThreadState can be used in a manually multithreaded benchmark loop. +class ThreadState : public State { + public: + explicit ThreadState(State& s); + ~ThreadState(); + private: + State* parent_; + + ThreadState(const ThreadState&); }; inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { @@ -1274,6 +1294,9 @@ class BENCHMARK_EXPORT Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); + // Don't create threads. Let the user evaluate state.threads and/or use ThreadState. 
+ Benchmark* ManualThreading() { manual_threading_ = true; return this; } + virtual void Run(State& state) = 0; TimeUnit GetTimeUnit() const; @@ -1286,6 +1309,7 @@ class BENCHMARK_EXPORT Benchmark { const char* GetName() const; int ArgsCnt() const; const char* GetArgName(int arg) const; + bool GetExplicitThreading() const { return !thread_counts_.empty(); } private: friend class BenchmarkFamilies; @@ -1307,6 +1331,7 @@ class BENCHMARK_EXPORT Benchmark { bool measure_process_cpu_time_; bool use_real_time_; bool use_manual_time_; + bool manual_threading_; BigO complexity_; BigOFunc* complexity_lambda_; std::vector statistics_; diff --git a/src/benchmark.cc b/src/benchmark.cc index 6139e59d05..b202d109b9 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -172,8 +172,9 @@ State::State(std::string name, IterationCount max_iters, name_(std::move(name)), thread_index_(thread_i), threads_(n_threads), - timer_(timer), + num_thread_states_(0), manager_(manager), + timer_(timer), perf_counters_measurement_(perf_counters_measurement) { BM_CHECK(max_iterations != 0) << "At least one iteration must be run"; BM_CHECK_LT(thread_index_, threads_) @@ -309,6 +310,40 @@ void State::FinishKeepRunning() { manager_->StartStopBarrier(); } +void State::MergeThreadStateToParent(State& parent) const { + MutexLock l(manager_->GetBenchmarkMutex()); + internal::MergeResults(*this, timer_, manager_); + assert(parent.total_iterations_ == 0 || + parent.total_iterations_ == total_iterations_); + assert(parent.batch_leftover_ == 0 || + parent.batch_leftover_ == batch_leftover_); + parent.total_iterations_ = total_iterations_; + parent.batch_leftover_ = batch_leftover_; + parent.started_ = parent.started_ || started_; + parent.finished_ = parent.finished_ || finished_; + parent.skipped_ = + (parent.error_occurred() || error_occurred()) + ? internal::SkippedWithError + : (parent.skipped() || skipped() ? 
internal::SkippedWithMessage + : internal::NotSkipped); + parent.num_thread_states_++; +} + +ThreadState::ThreadState(State& s) : State(s), parent_(&s) { + BM_CHECK(!started()) + << "Don't create a ThreadState object after measurement has started"; + timer_ = new internal::ThreadTimer(*timer_); + perf_counters_measurement_ = new internal::PerfCountersMeasurement( + perf_counters_measurement_->names()); +} + +ThreadState::~ThreadState() { + BM_CHECK(error_occurred() || iterations() >= max_iterations) + << "Benchmark returned before ThreadState::KeepRunning() returned false!"; + MergeThreadStateToParent(*parent_); + delete timer_; +} + namespace internal { namespace { diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 286f986530..9db2232b89 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -2,7 +2,10 @@ #include +#include "counter.h" #include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" namespace benchmark { namespace internal { @@ -27,7 +30,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx, min_time_(benchmark_.min_time_), min_warmup_time_(benchmark_.min_warmup_time_), iterations_(benchmark_.iterations_), - threads_(thread_count) { + threads_(thread_count), + manual_threading_(benchmark_.manual_threading_), + explicit_threading_(benchmark_.GetExplicitThreading()) { name_.function_name = benchmark_.name_; size_t arg_i = 0; @@ -114,5 +119,16 @@ void BenchmarkInstance::Teardown() const { teardown_(st); } } + +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager) { + ThreadManager::Result& results = manager->results; + results.iterations += st.iterations(); + results.cpu_time_used += timer->cpu_time_used(); + results.real_time_used += timer->real_time_used(); + results.manual_time_used += timer->manual_time_used(); + results.complexity_n += st.complexity_length_n(); + Increment(&results.counters, st.counters); +} } // 
namespace internal } // namespace benchmark diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 94f516531b..d0813dd7a5 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -41,6 +41,8 @@ class BenchmarkInstance { int threads() const { return threads_; } void Setup() const; void Teardown() const; + bool explicit_threading() const { return explicit_threading_; } + bool manual_threading() const { return manual_threading_; } State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, internal::ThreadManager* manager, @@ -66,6 +68,9 @@ class BenchmarkInstance { double min_warmup_time_; IterationCount iterations_; int threads_; // Number of concurrent threads to us + bool manual_threading_; + bool explicit_threading_; // true: Number of threads come from a Threads() + // call typedef void (*callback_function)(const benchmark::State&); callback_function setup_ = nullptr; @@ -78,6 +83,9 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager); + BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc index e447c9a2d3..7c091c4ff9 100644 --- a/src/benchmark_register.cc +++ b/src/benchmark_register.cc @@ -217,6 +217,7 @@ Benchmark::Benchmark(const std::string& name) measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), + manual_threading_(false), complexity_(oNone), complexity_lambda_(nullptr), setup_(nullptr), diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc index f7ae424397..66f87b571b 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -86,7 +86,7 @@ BenchmarkReporter::Run CreateRunReport( // This is the total iterations across all threads. 
report.iterations = results.iterations; report.time_unit = b.time_unit(); - report.threads = b.threads(); + report.threads = results.thread_count; report.repetition_index = repetition_index; report.repetitions = repeats; @@ -130,17 +130,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters, State st = b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); - BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; + + assert(b->explicit_threading() || b->threads() == 1); + + if (st.GetNumThreadStates() > 0) { + BM_CHECK((!b->explicit_threading()) || b->manual_threading()) + << "Benchmark " << b->name().str() + << " run with managed threading. It must not create ThreadStates!"; + BM_CHECK((!b->explicit_threading()) || + st.GetNumThreadStates() == b->threads()) + << "The number of ThreadStates created by Benchmark " << b->name().str() + << " doesn't match the number of threads!"; + } else { + BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + } + { MutexLock l(manager->GetBenchmarkMutex()); internal::ThreadManager::Result& results = manager->results; - results.iterations += st.iterations(); - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); + if (st.GetNumThreadStates() > 0) { + // State values as well as thread state values are summed up for + // complexity_n and user counters: + results.complexity_n += st.complexity_length_n(); + internal::Increment(&results.counters, st.counters); + results.thread_count = + b->explicit_threading() ? 
b->threads() : st.GetNumThreadStates(); + } else { + internal::MergeResults(st, &timer, manager); + results.thread_count = b->threads(); + } } manager->NotifyThreadComplete(); } @@ -234,7 +253,8 @@ BenchmarkRunner::BenchmarkRunner( has_explicit_iteration_count(b.iterations() != 0 || parsed_benchtime_flag.tag == BenchTimeType::ITERS), - pool(b.threads() - 1), + num_managed_threads(b.manual_threading() ? 1 : b.threads()), + pool(num_managed_threads - 1), iters(has_explicit_iteration_count ? ComputeIters(b_, parsed_benchtime_flag) : 1), @@ -260,7 +280,7 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n"; std::unique_ptr manager; - manager.reset(new internal::ThreadManager(b.threads())); + manager.reset(new internal::ThreadManager(num_managed_threads)); // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { @@ -287,17 +307,18 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { manager.reset(); // Adjust real/manual time stats since they were reported per thread. - i.results.real_time_used /= b.threads(); - i.results.manual_time_used /= b.threads(); + i.results.real_time_used /= i.results.thread_count; + i.results.manual_time_used /= i.results.thread_count; // If we were measuring whole-process CPU usage, adjust the CPU time too. - if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads(); + if (b.measure_process_cpu_time()) + i.results.cpu_time_used /= i.results.thread_count; BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" << i.results.real_time_used << "\n"; // By using KeepRunningBatch a benchmark can iterate more times than // requested, so take the iteration count from i.results. - i.iters = i.results.iterations / b.threads(); + i.iters = i.results.iterations / i.results.thread_count; // Base decisions off of real time if requested by this benchmark. 
i.seconds = i.results.cpu_time_used; diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h index db2fa04396..32e91b76cc 100644 --- a/src/benchmark_runner.h +++ b/src/benchmark_runner.h @@ -93,6 +93,7 @@ class BenchmarkRunner { bool warmup_done; const int repeats; const bool has_explicit_iteration_count; + const int num_managed_threads; // must be before pool int num_repetitions_done = 0; diff --git a/src/thread_manager.h b/src/thread_manager.h index 819b3c44db..612e61e081 100644 --- a/src/thread_manager.h +++ b/src/thread_manager.h @@ -45,6 +45,7 @@ class ThreadManager { std::string report_label_; std::string skip_message_; internal::Skipped skipped_ = internal::NotSkipped; + int thread_count = 0; UserCounters counters; }; GUARDED_BY(GetBenchmarkMutex()) Result results; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fd88131988..5c2b2e347f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -167,6 +167,9 @@ add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time compile_output_test(internal_threading_test) add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s) +compile_output_test(manual_threading_test) +add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s) + compile_output_test(report_aggregates_only_test) add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s) diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc new file mode 100644 index 0000000000..b0ec2fd1ac --- /dev/null +++ b/test/manual_threading_test.cc @@ -0,0 +1,169 @@ + +#undef NDEBUG + +#include +#include +#include + +#include "../src/timers.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +namespace { + +static const std::chrono::duration time_frame(50); +static const double time_frame_in_sec( + std::chrono::duration_cast>>( + time_frame) + .count()); + +void MyBusySpinwait() { + 
const auto start = benchmark::ChronoClockNow(); + + while (true) { + const auto now = benchmark::ChronoClockNow(); + const auto elapsed = now - start; + + if (std::chrono::duration(elapsed) >= + time_frame) + return; + } +} + +} + +// ========================================================================= // +// --------------------------- TEST CASES BEGIN ---------------------------- // +// ========================================================================= // + +// ========================================================================= // +// BM_ManualThreadingInLoop +// Measurements include the creation and joining of threads. + +void BM_ManualThreadingInLoop(benchmark::State& state) { + int numWorkerThreads = state.threads() - 1; + std::vector pool (numWorkerThreads); + + for (auto _ : state) { + + for (int i = 0; i < numWorkerThreads; ++i) + { + pool[i] = std::thread(MyBusySpinwait); + } + MyBusySpinwait(); + for (int i = 0; i < numWorkerThreads; ++i) + { + pool[i].join(); + } + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2); 
+BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_ManualThreadingBeforeLoop +// Creation of threads is done before the start of the measurement, joining after the finish of the measurement. + +void BM_ManualThreadingBeforeLoop(benchmark::State& state) { + + std::promise thread_starter; + auto starter_future = thread_starter.get_future(); + + auto threadedLoop = [&]() { + starter_future.wait(); + benchmark::ThreadState ts(state); + for (auto _ : ts) { + MyBusySpinwait(); + ts.SetIterationTime(time_frame_in_sec); + } + }; + + std::vector pool (state.threads()); + for (int i = 0; i < state.threads(); ++i) + { + pool[i] = std::thread(threadedLoop); + } + thread_starter.set_value(); + for (int i = 0; i < state.threads(); ++i) + { + pool[i].join(); + } + + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + 
->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// ---------------------------- TEST CASES END ----------------------------- // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } From dd29368350685810b42a99adcf4fd943bd5641e4 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Thu, 5 Oct 2023 14:00:07 +0200 Subject: [PATCH 02/14] Fixed linking and threading analyzer issues. --- include/benchmark/benchmark.h | 2 +- src/benchmark_api_internal.cc | 2 +- src/benchmark_api_internal.h | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 3207ed3f6b..37626a6444 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -994,7 +994,7 @@ class BENCHMARK_EXPORT State { }; // ThreadState can be used in a manually multithreaded benchmark loop. 
-class ThreadState : public State { +class BENCHMARK_EXPORT ThreadState : public State { public: explicit ThreadState(State& s); ~ThreadState(); diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 9db2232b89..801d2a8aa6 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -121,7 +121,7 @@ void BenchmarkInstance::Teardown() const { } void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) { + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS { ThreadManager::Result& results = manager->results; results.iterations += st.iterations(); results.cpu_time_used += timer->cpu_time_used(); diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index d0813dd7a5..7ebaa4bf39 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -10,6 +10,7 @@ #include "benchmark/benchmark.h" #include "commandlineflags.h" +#include "mutex.h" namespace benchmark { namespace internal { @@ -83,8 +84,8 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); -void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager); +void MergeResults(const State& st, const ThreadTimer* timer, // only call while holding benchmark_mutex_ + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); From 63e64f23a9a0f474999492c949862a89fd0ac470 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Thu, 5 Oct 2023 14:31:56 +0200 Subject: [PATCH 03/14] Fixed memory leak and formatting issues. 
--- include/benchmark/benchmark.h | 11 +++- src/benchmark.cc | 8 ++- src/benchmark_api_internal.h | 3 +- test/manual_threading_test.cc | 116 +++++++++++++++++++++++++--------- 4 files changed, 102 insertions(+), 36 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 37626a6444..b53f5fce2b 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -930,7 +930,7 @@ class BENCHMARK_EXPORT State { return max_iterations - total_iterations_ + batch_leftover_; } - BENCHMARK_ALWAYS_INLINE + BENCHMARK_ALWAYS_INLINE int GetNumThreadStates() const { return num_thread_states_; } BENCHMARK_ALWAYS_INLINE @@ -998,6 +998,7 @@ class BENCHMARK_EXPORT ThreadState : public State { public: explicit ThreadState(State& s); ~ThreadState(); + private: State* parent_; @@ -1294,8 +1295,12 @@ class BENCHMARK_EXPORT Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); - // Don't create threads. Let the user evaluate state.threads and/or use ThreadState. - Benchmark* ManualThreading() { manual_threading_ = true; return this; } + // Don't create threads. Let the user evaluate state.threads and/or use + // ThreadState. 
+ Benchmark* ManualThreading() { + manual_threading_ = true; + return this; + } virtual void Run(State& state) = 0; diff --git a/src/benchmark.cc b/src/benchmark.cc index b202d109b9..23498e2d2b 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -333,8 +333,11 @@ ThreadState::ThreadState(State& s) : State(s), parent_(&s) { BM_CHECK(!started()) << "Don't create a ThreadState object after measurement has started"; timer_ = new internal::ThreadTimer(*timer_); - perf_counters_measurement_ = new internal::PerfCountersMeasurement( - perf_counters_measurement_->names()); + if (perf_counters_measurement_) + { + perf_counters_measurement_ = new internal::PerfCountersMeasurement( + perf_counters_measurement_->names()); + } } ThreadState::~ThreadState() { @@ -342,6 +345,7 @@ ThreadState::~ThreadState() { << "Benchmark returned before ThreadState::KeepRunning() returned false!"; MergeThreadStateToParent(*parent_); delete timer_; + delete perf_counters_measurement_; } namespace internal { diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 7ebaa4bf39..8aa0a2bccb 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -84,7 +84,8 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); -void MergeResults(const State& st, const ThreadTimer* timer, // only call while holding benchmark_mutex_ +// only call while holding benchmark_mutex_: +void MergeResults(const State& st, const ThreadTimer* timer, ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; BENCHMARK_EXPORT diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc index b0ec2fd1ac..556ef51fdd 100644 --- a/test/manual_threading_test.cc +++ b/test/manual_threading_test.cc @@ -2,8 +2,8 @@ #undef NDEBUG #include -#include #include +#include #include "../src/timers.h" #include "benchmark/benchmark.h" @@ -30,7 +30,7 @@ void MyBusySpinwait() { } } -} +} // namespace // 
========================================================================= // // --------------------------- TEST CASES BEGIN ---------------------------- // @@ -42,17 +42,15 @@ void MyBusySpinwait() { void BM_ManualThreadingInLoop(benchmark::State& state) { int numWorkerThreads = state.threads() - 1; - std::vector pool (numWorkerThreads); + std::vector pool(numWorkerThreads); for (auto _ : state) { - for (int i = 0; i < numWorkerThreads; ++i) - { + for (int i = 0; i < numWorkerThreads; ++i) { pool[i] = std::thread(MyBusySpinwait); } MyBusySpinwait(); - for (int i = 0; i < numWorkerThreads; ++i) - { + for (int i = 0; i < numWorkerThreads; ++i) { pool[i].join(); } state.SetIterationTime(time_frame_in_sec); @@ -61,10 +59,25 @@ void BM_ManualThreadingInLoop(benchmark::State& state) { benchmark::Counter{1, benchmark::Counter::kIsRate}; } -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingInLoop) ->Iterations(1) ->ManualThreading() @@ -78,10 +91,25 @@ BENCHMARK(BM_ManualThreadingInLoop) ->MeasureProcessCPUTime() ->UseManualTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2); 
-BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingInLoop) ->Iterations(1) ->ManualThreading() @@ -97,10 +125,10 @@ BENCHMARK(BM_ManualThreadingInLoop) // ========================================================================= // // BM_ManualThreadingBeforeLoop -// Creation of threads is done before the start of the measurement, joining after the finish of the measurement. +// Creation of threads is done before the start of the measurement, +// joining after the finish of the measurement. 
void BM_ManualThreadingBeforeLoop(benchmark::State& state) { - std::promise thread_starter; auto starter_future = thread_starter.get_future(); @@ -113,14 +141,12 @@ void BM_ManualThreadingBeforeLoop(benchmark::State& state) { } }; - std::vector pool (state.threads()); - for (int i = 0; i < state.threads(); ++i) - { + std::vector pool(state.threads()); + for (int i = 0; i < state.threads(); ++i) { pool[i] = std::thread(threadedLoop); } thread_starter.set_value(); - for (int i = 0; i < state.threads(); ++i) - { + for (int i = 0; i < state.threads(); ++i) { pool[i].join(); } @@ -128,10 +154,25 @@ void BM_ManualThreadingBeforeLoop(benchmark::State& state) { benchmark::Counter{1, benchmark::Counter::kIsRate}; } -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingBeforeLoop) ->Iterations(1) ->ManualThreading() @@ -145,10 +186,25 @@ BENCHMARK(BM_ManualThreadingBeforeLoop) ->MeasureProcessCPUTime() ->UseManualTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); 
-BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingBeforeLoop) ->Iterations(1) ->ManualThreading() From b8f41ce3a32d2e9489f4e894564ba532c635893d Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Thu, 5 Oct 2023 14:59:24 +0200 Subject: [PATCH 04/14] More formatting issues. --- src/benchmark.cc | 3 +-- test/manual_threading_test.cc | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/benchmark.cc b/src/benchmark.cc index 23498e2d2b..d25662ff0b 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -333,8 +333,7 @@ ThreadState::ThreadState(State& s) : State(s), parent_(&s) { BM_CHECK(!started()) << "Don't create a ThreadState object after measurement has started"; timer_ = new internal::ThreadTimer(*timer_); - if (perf_counters_measurement_) - { + if (perf_counters_measurement_) { perf_counters_measurement_ = new internal::PerfCountersMeasurement( perf_counters_measurement_->names()); } diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc index 556ef51fdd..b8c249b9eb 100644 --- a/test/manual_threading_test.cc +++ b/test/manual_threading_test.cc @@ -45,7 +45,6 @@ void BM_ManualThreadingInLoop(benchmark::State& state) { std::vector pool(numWorkerThreads); for (auto _ : state) { - for (int i = 0; i < numWorkerThreads; ++i) { pool[i] = std::thread(MyBusySpinwait); } From 
795cd5a66432f6215af97f4cd9c7ea7c9b1d7771 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Fri, 6 Oct 2023 14:32:51 +0200 Subject: [PATCH 05/14] Construct the base class State of a ThreadState like a usual State object instead of copying it. This fixes a bug, which otherwise happens, if one thread has already finished the benchmark loop and merged its results to the parent state, while another one hasn't started yet. --- include/benchmark/benchmark.h | 1 + src/benchmark.cc | 19 ++++++++++--------- src/thread_timer.h | 3 +++ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index b53f5fce2b..03b63850ad 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -984,6 +984,7 @@ class BENCHMARK_EXPORT State { internal::ThreadManager* const manager_; friend class internal::BenchmarkInstance; + friend class ThreadState; protected: void MergeThreadStateToParent(State& parent) const; diff --git a/src/benchmark.cc b/src/benchmark.cc index d25662ff0b..aefde5b410 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -329,15 +329,16 @@ void State::MergeThreadStateToParent(State& parent) const { parent.num_thread_states_++; } -ThreadState::ThreadState(State& s) : State(s), parent_(&s) { - BM_CHECK(!started()) - << "Don't create a ThreadState object after measurement has started"; - timer_ = new internal::ThreadTimer(*timer_); - if (perf_counters_measurement_) { - perf_counters_measurement_ = new internal::PerfCountersMeasurement( - perf_counters_measurement_->names()); - } -} +ThreadState::ThreadState(State& s) + : State(s.name(), s.max_iterations, s.range_, s.thread_index(), s.threads(), + new internal::ThreadTimer( + internal::ThreadTimer::CreateFromTimer(*s.timer_)), + s.manager_, + s.perf_counters_measurement_ + ? 
new internal::PerfCountersMeasurement( + s.perf_counters_measurement_->names()) + : 0), + parent_(&s) {} ThreadState::~ThreadState() { BM_CHECK(error_occurred() || iterations() >= max_iterations) diff --git a/src/thread_timer.h b/src/thread_timer.h index eb23f59561..5b226f22cd 100644 --- a/src/thread_timer.h +++ b/src/thread_timer.h @@ -18,6 +18,9 @@ class ThreadTimer { static ThreadTimer CreateProcessCpuTime() { return ThreadTimer(/*measure_process_cpu_time_=*/true); } + static ThreadTimer CreateFromTimer(const ThreadTimer& timer) { + return ThreadTimer(timer.measure_process_cpu_time); + } // Called by each thread void StartTimer() { From b027d02284839bdf1db1e823d0d575cd6479f78c Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Mon, 9 Oct 2023 09:58:55 +0200 Subject: [PATCH 06/14] Add support for other multi-threading APIs Support the benchmarking of code, which relies on other multi-threading APIs, e.g. OpenMP. --- docs/user_guide.md | 58 ++++++++++++ include/benchmark/benchmark.h | 29 +++++- src/benchmark.cc | 37 +++++++- src/benchmark_api_internal.cc | 18 +++- src/benchmark_api_internal.h | 8 ++ src/benchmark_register.cc | 1 + src/benchmark_runner.cc | 51 +++++++--- src/benchmark_runner.h | 1 + src/thread_manager.h | 1 + test/CMakeLists.txt | 3 + test/manual_threading_test.cc | 169 ++++++++++++++++++++++++++++++++++ 11 files changed, 357 insertions(+), 19 deletions(-) create mode 100644 test/manual_threading_test.cc diff --git a/docs/user_guide.md b/docs/user_guide.md index 2ceb13eb59..b2dafe1092 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -830,6 +830,64 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); Without `UseRealTime`, CPU time is used by default. +### Manual Multithreaded Benchmarks + +Google/benchmark uses `std::thread` as multithreading environment per default. +If you want to use another multithreading environment (e.g. OpenMP), you can +turn off the automatic creation of threads using the `ManualThreading` function. 
+```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code here. + for (auto _ : state) { +#pragma omp parallel num_threads(state.threads()) + // Run the multithreaded test. + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +The above example creates a parallel region in each iteration. +This includes the setup and teardown of the parallel region in the time measurement, and it +adds an implicit barrier at the end of each iteration. +You can avoid these effects if you run the whole loop in parallel. +Then you must not use the `state` object directly, but create a `ThreadState` object in each thread. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel num_threads(state.threads()) + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +If you use the `ThreadState` object and explicitly specify the number of threads, then you must +use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads. +However, if you use `ThreadState` without explicitly specifying the number of threads, +then the number of threads is determined by the number of created `ThreadState` objects. +Specifying `ManualThreading` is optional in this case. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded); // measures omp_get_max_threads number of threads. 
+``` + ## CPU Timers diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 23103571bb..bd2faab11a 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -930,6 +930,9 @@ class BENCHMARK_EXPORT State { return max_iterations - total_iterations_ + batch_leftover_; } + BENCHMARK_ALWAYS_INLINE + int GetNumThreadStates() const { return num_thread_states_; } + BENCHMARK_ALWAYS_INLINE std::string name() const { return name_; } @@ -976,12 +979,29 @@ class BENCHMARK_EXPORT State { const std::string name_; const int thread_index_; const int threads_; + int num_thread_states_; - internal::ThreadTimer* const timer_; internal::ThreadManager* const manager_; - internal::PerfCountersMeasurement* const perf_counters_measurement_; friend class internal::BenchmarkInstance; + + protected: + void MergeThreadStateToParent(State& parent) const; + bool started() const { return started_; } + + internal::ThreadTimer* timer_; + internal::PerfCountersMeasurement* perf_counters_measurement_; +}; + +// ThreadState can be used in a manually multithreaded benchmark loop. +class ThreadState : public State { + public: + explicit ThreadState(State& s); + ~ThreadState(); + private: + State* parent_; + + ThreadState(const ThreadState&); }; inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { @@ -1274,6 +1294,9 @@ class BENCHMARK_EXPORT Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); + // Don't create threads. Let the user evaluate state.threads and/or use ThreadState. 
+ Benchmark* ManualThreading() { manual_threading_ = true; return this; } + virtual void Run(State& state) = 0; TimeUnit GetTimeUnit() const; @@ -1286,6 +1309,7 @@ class BENCHMARK_EXPORT Benchmark { const char* GetName() const; int ArgsCnt() const; const char* GetArgName(int arg) const; + bool GetExplicitThreading() const { return !thread_counts_.empty(); } private: friend class BenchmarkFamilies; @@ -1307,6 +1331,7 @@ class BENCHMARK_EXPORT Benchmark { bool measure_process_cpu_time_; bool use_real_time_; bool use_manual_time_; + bool manual_threading_; BigO complexity_; BigOFunc* complexity_lambda_; std::vector statistics_; diff --git a/src/benchmark.cc b/src/benchmark.cc index 6139e59d05..b202d109b9 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -172,8 +172,9 @@ State::State(std::string name, IterationCount max_iters, name_(std::move(name)), thread_index_(thread_i), threads_(n_threads), - timer_(timer), + num_thread_states_(0), manager_(manager), + timer_(timer), perf_counters_measurement_(perf_counters_measurement) { BM_CHECK(max_iterations != 0) << "At least one iteration must be run"; BM_CHECK_LT(thread_index_, threads_) @@ -309,6 +310,40 @@ void State::FinishKeepRunning() { manager_->StartStopBarrier(); } +void State::MergeThreadStateToParent(State& parent) const { + MutexLock l(manager_->GetBenchmarkMutex()); + internal::MergeResults(*this, timer_, manager_); + assert(parent.total_iterations_ == 0 || + parent.total_iterations_ == total_iterations_); + assert(parent.batch_leftover_ == 0 || + parent.batch_leftover_ == batch_leftover_); + parent.total_iterations_ = total_iterations_; + parent.batch_leftover_ = batch_leftover_; + parent.started_ = parent.started_ || started_; + parent.finished_ = parent.finished_ || finished_; + parent.skipped_ = + (parent.error_occurred() || error_occurred()) + ? internal::SkippedWithError + : (parent.skipped() || skipped() ? 
internal::SkippedWithMessage + : internal::NotSkipped); + parent.num_thread_states_++; +} + +ThreadState::ThreadState(State& s) : State(s), parent_(&s) { + BM_CHECK(!started()) + << "Don't create a ThreadState object after measurement has started"; + timer_ = new internal::ThreadTimer(*timer_); + perf_counters_measurement_ = new internal::PerfCountersMeasurement( + perf_counters_measurement_->names()); +} + +ThreadState::~ThreadState() { + BM_CHECK(error_occurred() || iterations() >= max_iterations) + << "Benchmark returned before ThreadState::KeepRunning() returned false!"; + MergeThreadStateToParent(*parent_); + delete timer_; +} + namespace internal { namespace { diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 286f986530..9db2232b89 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -2,7 +2,10 @@ #include +#include "counter.h" #include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" namespace benchmark { namespace internal { @@ -27,7 +30,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx, min_time_(benchmark_.min_time_), min_warmup_time_(benchmark_.min_warmup_time_), iterations_(benchmark_.iterations_), - threads_(thread_count) { + threads_(thread_count), + manual_threading_(benchmark_.manual_threading_), + explicit_threading_(benchmark_.GetExplicitThreading()) { name_.function_name = benchmark_.name_; size_t arg_i = 0; @@ -114,5 +119,16 @@ void BenchmarkInstance::Teardown() const { teardown_(st); } } + +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager) { + ThreadManager::Result& results = manager->results; + results.iterations += st.iterations(); + results.cpu_time_used += timer->cpu_time_used(); + results.real_time_used += timer->real_time_used(); + results.manual_time_used += timer->manual_time_used(); + results.complexity_n += st.complexity_length_n(); + Increment(&results.counters, st.counters); +} } // 
namespace internal } // namespace benchmark diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 94f516531b..d0813dd7a5 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -41,6 +41,8 @@ class BenchmarkInstance { int threads() const { return threads_; } void Setup() const; void Teardown() const; + bool explicit_threading() const { return explicit_threading_; } + bool manual_threading() const { return manual_threading_; } State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, internal::ThreadManager* manager, @@ -66,6 +68,9 @@ class BenchmarkInstance { double min_warmup_time_; IterationCount iterations_; int threads_; // Number of concurrent threads to us + bool manual_threading_; + bool explicit_threading_; // true: Number of threads come from a Threads() + // call typedef void (*callback_function)(const benchmark::State&); callback_function setup_ = nullptr; @@ -78,6 +83,9 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager); + BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc index e447c9a2d3..7c091c4ff9 100644 --- a/src/benchmark_register.cc +++ b/src/benchmark_register.cc @@ -217,6 +217,7 @@ Benchmark::Benchmark(const std::string& name) measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), + manual_threading_(false), complexity_(oNone), complexity_lambda_(nullptr), setup_(nullptr), diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc index f7ae424397..66f87b571b 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -86,7 +86,7 @@ BenchmarkReporter::Run CreateRunReport( // This is the total iterations across all threads. 
report.iterations = results.iterations; report.time_unit = b.time_unit(); - report.threads = b.threads(); + report.threads = results.thread_count; report.repetition_index = repetition_index; report.repetitions = repeats; @@ -130,17 +130,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters, State st = b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); - BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; + + assert(b->explicit_threading() || b->threads() == 1); + + if (st.GetNumThreadStates() > 0) { + BM_CHECK((!b->explicit_threading()) || b->manual_threading()) + << "Benchmark " << b->name().str() + << " run with managed threading. It must not create ThreadStates!"; + BM_CHECK((!b->explicit_threading()) || + st.GetNumThreadStates() == b->threads()) + << "The number of ThreadStates created by Benchmark " << b->name().str() + << " doesn't match the number of threads!"; + } else { + BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + } + { MutexLock l(manager->GetBenchmarkMutex()); internal::ThreadManager::Result& results = manager->results; - results.iterations += st.iterations(); - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); + if (st.GetNumThreadStates() > 0) { + // State values as well as thread state values are summed up for + // complexity_n and user counters: + results.complexity_n += st.complexity_length_n(); + internal::Increment(&results.counters, st.counters); + results.thread_count = + b->explicit_threading() ? 
b->threads() : st.GetNumThreadStates(); + } else { + internal::MergeResults(st, &timer, manager); + results.thread_count = b->threads(); + } } manager->NotifyThreadComplete(); } @@ -234,7 +253,8 @@ BenchmarkRunner::BenchmarkRunner( has_explicit_iteration_count(b.iterations() != 0 || parsed_benchtime_flag.tag == BenchTimeType::ITERS), - pool(b.threads() - 1), + num_managed_threads(b.manual_threading() ? 1 : b.threads()), + pool(num_managed_threads - 1), iters(has_explicit_iteration_count ? ComputeIters(b_, parsed_benchtime_flag) : 1), @@ -260,7 +280,7 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n"; std::unique_ptr manager; - manager.reset(new internal::ThreadManager(b.threads())); + manager.reset(new internal::ThreadManager(num_managed_threads)); // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { @@ -287,17 +307,18 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { manager.reset(); // Adjust real/manual time stats since they were reported per thread. - i.results.real_time_used /= b.threads(); - i.results.manual_time_used /= b.threads(); + i.results.real_time_used /= i.results.thread_count; + i.results.manual_time_used /= i.results.thread_count; // If we were measuring whole-process CPU usage, adjust the CPU time too. - if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads(); + if (b.measure_process_cpu_time()) + i.results.cpu_time_used /= i.results.thread_count; BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" << i.results.real_time_used << "\n"; // By using KeepRunningBatch a benchmark can iterate more times than // requested, so take the iteration count from i.results. - i.iters = i.results.iterations / b.threads(); + i.iters = i.results.iterations / i.results.thread_count; // Base decisions off of real time if requested by this benchmark. 
i.seconds = i.results.cpu_time_used; diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h index db2fa04396..32e91b76cc 100644 --- a/src/benchmark_runner.h +++ b/src/benchmark_runner.h @@ -93,6 +93,7 @@ class BenchmarkRunner { bool warmup_done; const int repeats; const bool has_explicit_iteration_count; + const int num_managed_threads; // must be before pool int num_repetitions_done = 0; diff --git a/src/thread_manager.h b/src/thread_manager.h index 819b3c44db..612e61e081 100644 --- a/src/thread_manager.h +++ b/src/thread_manager.h @@ -45,6 +45,7 @@ class ThreadManager { std::string report_label_; std::string skip_message_; internal::Skipped skipped_ = internal::NotSkipped; + int thread_count = 0; UserCounters counters; }; GUARDED_BY(GetBenchmarkMutex()) Result results; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ac1a00f582..833bb8bdb4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -167,6 +167,9 @@ add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time compile_output_test(internal_threading_test) add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s) +compile_output_test(manual_threading_test) +add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s) + compile_output_test(report_aggregates_only_test) add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s) diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc new file mode 100644 index 0000000000..b0ec2fd1ac --- /dev/null +++ b/test/manual_threading_test.cc @@ -0,0 +1,169 @@ + +#undef NDEBUG + +#include +#include +#include + +#include "../src/timers.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +namespace { + +static const std::chrono::duration time_frame(50); +static const double time_frame_in_sec( + std::chrono::duration_cast>>( + time_frame) + .count()); + +void MyBusySpinwait() { + 
const auto start = benchmark::ChronoClockNow(); + + while (true) { + const auto now = benchmark::ChronoClockNow(); + const auto elapsed = now - start; + + if (std::chrono::duration(elapsed) >= + time_frame) + return; + } +} + +} + +// ========================================================================= // +// --------------------------- TEST CASES BEGIN ---------------------------- // +// ========================================================================= // + +// ========================================================================= // +// BM_ManualThreadingInLoop +// Measurements include the creation and joining of threads. + +void BM_ManualThreadingInLoop(benchmark::State& state) { + int numWorkerThreads = state.threads() - 1; + std::vector pool (numWorkerThreads); + + for (auto _ : state) { + + for (int i = 0; i < numWorkerThreads; ++i) + { + pool[i] = std::thread(MyBusySpinwait); + } + MyBusySpinwait(); + for (int i = 0; i < numWorkerThreads; ++i) + { + pool[i].join(); + } + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2); 
+BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_ManualThreadingBeforeLoop +// Creation of threads is done before the start of the measurement, joining after the finish of the measurement. + +void BM_ManualThreadingBeforeLoop(benchmark::State& state) { + + std::promise thread_starter; + auto starter_future = thread_starter.get_future(); + + auto threadedLoop = [&]() { + starter_future.wait(); + benchmark::ThreadState ts(state); + for (auto _ : ts) { + MyBusySpinwait(); + ts.SetIterationTime(time_frame_in_sec); + } + }; + + std::vector pool (state.threads()); + for (int i = 0; i < state.threads(); ++i) + { + pool[i] = std::thread(threadedLoop); + } + thread_starter.set_value(); + for (int i = 0; i < state.threads(); ++i) + { + pool[i].join(); + } + + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + 
->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// ---------------------------- TEST CASES END ----------------------------- // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } From afa85ecb975983999772707d2b3215b231720b7a Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Mon, 9 Oct 2023 09:58:55 +0200 Subject: [PATCH 07/14] Fixed linking and threading analyzer issues. --- include/benchmark/benchmark.h | 2 +- src/benchmark_api_internal.cc | 2 +- src/benchmark_api_internal.h | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index bd2faab11a..1fe2aa23de 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -994,7 +994,7 @@ class BENCHMARK_EXPORT State { }; // ThreadState can be used in a manually multithreaded benchmark loop. 
-class ThreadState : public State { +class BENCHMARK_EXPORT ThreadState : public State { public: explicit ThreadState(State& s); ~ThreadState(); diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 9db2232b89..801d2a8aa6 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -121,7 +121,7 @@ void BenchmarkInstance::Teardown() const { } void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) { + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS { ThreadManager::Result& results = manager->results; results.iterations += st.iterations(); results.cpu_time_used += timer->cpu_time_used(); diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index d0813dd7a5..7ebaa4bf39 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -10,6 +10,7 @@ #include "benchmark/benchmark.h" #include "commandlineflags.h" +#include "mutex.h" namespace benchmark { namespace internal { @@ -83,8 +84,8 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); -void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager); +void MergeResults(const State& st, const ThreadTimer* timer, // only call while holding benchmark_mutex_ + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); From febafa2bee1c55e5f36fff586a365a90269030b2 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Mon, 9 Oct 2023 09:58:55 +0200 Subject: [PATCH 08/14] Fixed memory leak and formatting issues. 
--- include/benchmark/benchmark.h | 11 +++- src/benchmark.cc | 8 ++- src/benchmark_api_internal.h | 3 +- test/manual_threading_test.cc | 116 +++++++++++++++++++++++++--------- 4 files changed, 102 insertions(+), 36 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 1fe2aa23de..bb65c16cc7 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -930,7 +930,7 @@ class BENCHMARK_EXPORT State { return max_iterations - total_iterations_ + batch_leftover_; } - BENCHMARK_ALWAYS_INLINE + BENCHMARK_ALWAYS_INLINE int GetNumThreadStates() const { return num_thread_states_; } BENCHMARK_ALWAYS_INLINE @@ -998,6 +998,7 @@ class BENCHMARK_EXPORT ThreadState : public State { public: explicit ThreadState(State& s); ~ThreadState(); + private: State* parent_; @@ -1294,8 +1295,12 @@ class BENCHMARK_EXPORT Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); - // Don't create threads. Let the user evaluate state.threads and/or use ThreadState. - Benchmark* ManualThreading() { manual_threading_ = true; return this; } + // Don't create threads. Let the user evaluate state.threads and/or use + // ThreadState. 
+ Benchmark* ManualThreading() { + manual_threading_ = true; + return this; + } virtual void Run(State& state) = 0; diff --git a/src/benchmark.cc b/src/benchmark.cc index b202d109b9..23498e2d2b 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -333,8 +333,11 @@ ThreadState::ThreadState(State& s) : State(s), parent_(&s) { BM_CHECK(!started()) << "Don't create a ThreadState object after measurement has started"; timer_ = new internal::ThreadTimer(*timer_); - perf_counters_measurement_ = new internal::PerfCountersMeasurement( - perf_counters_measurement_->names()); + if (perf_counters_measurement_) + { + perf_counters_measurement_ = new internal::PerfCountersMeasurement( + perf_counters_measurement_->names()); + } } ThreadState::~ThreadState() { @@ -342,6 +345,7 @@ ThreadState::~ThreadState() { << "Benchmark returned before ThreadState::KeepRunning() returned false!"; MergeThreadStateToParent(*parent_); delete timer_; + delete perf_counters_measurement_; } namespace internal { diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 7ebaa4bf39..8aa0a2bccb 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -84,7 +84,8 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); -void MergeResults(const State& st, const ThreadTimer* timer, // only call while holding benchmark_mutex_ +// only call while holding benchmark_mutex_: +void MergeResults(const State& st, const ThreadTimer* timer, ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; BENCHMARK_EXPORT diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc index b0ec2fd1ac..556ef51fdd 100644 --- a/test/manual_threading_test.cc +++ b/test/manual_threading_test.cc @@ -2,8 +2,8 @@ #undef NDEBUG #include -#include #include +#include #include "../src/timers.h" #include "benchmark/benchmark.h" @@ -30,7 +30,7 @@ void MyBusySpinwait() { } } -} +} // namespace // 
========================================================================= // // --------------------------- TEST CASES BEGIN ---------------------------- // @@ -42,17 +42,15 @@ void MyBusySpinwait() { void BM_ManualThreadingInLoop(benchmark::State& state) { int numWorkerThreads = state.threads() - 1; - std::vector pool (numWorkerThreads); + std::vector pool(numWorkerThreads); for (auto _ : state) { - for (int i = 0; i < numWorkerThreads; ++i) - { + for (int i = 0; i < numWorkerThreads; ++i) { pool[i] = std::thread(MyBusySpinwait); } MyBusySpinwait(); - for (int i = 0; i < numWorkerThreads; ++i) - { + for (int i = 0; i < numWorkerThreads; ++i) { pool[i].join(); } state.SetIterationTime(time_frame_in_sec); @@ -61,10 +59,25 @@ void BM_ManualThreadingInLoop(benchmark::State& state) { benchmark::Counter{1, benchmark::Counter::kIsRate}; } -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingInLoop) ->Iterations(1) ->ManualThreading() @@ -78,10 +91,25 @@ BENCHMARK(BM_ManualThreadingInLoop) ->MeasureProcessCPUTime() ->UseManualTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2); 
-BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); -BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingInLoop) ->Iterations(1) ->ManualThreading() @@ -97,10 +125,10 @@ BENCHMARK(BM_ManualThreadingInLoop) // ========================================================================= // // BM_ManualThreadingBeforeLoop -// Creation of threads is done before the start of the measurement, joining after the finish of the measurement. +// Creation of threads is done before the start of the measurement, +// joining after the finish of the measurement. 
void BM_ManualThreadingBeforeLoop(benchmark::State& state) { - std::promise thread_starter; auto starter_future = thread_starter.get_future(); @@ -113,14 +141,12 @@ void BM_ManualThreadingBeforeLoop(benchmark::State& state) { } }; - std::vector pool (state.threads()); - for (int i = 0; i < state.threads(); ++i) - { + std::vector pool(state.threads()); + for (int i = 0; i < state.threads(); ++i) { pool[i] = std::thread(threadedLoop); } thread_starter.set_value(); - for (int i = 0; i < state.threads(); ++i) - { + for (int i = 0; i < state.threads(); ++i) { pool[i].join(); } @@ -128,10 +154,25 @@ void BM_ManualThreadingBeforeLoop(benchmark::State& state) { benchmark::Counter{1, benchmark::Counter::kIsRate}; } -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingBeforeLoop) ->Iterations(1) ->ManualThreading() @@ -145,10 +186,25 @@ BENCHMARK(BM_ManualThreadingBeforeLoop) ->MeasureProcessCPUTime() ->UseManualTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime(); 
-BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime(); -BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime(); BENCHMARK(BM_ManualThreadingBeforeLoop) ->Iterations(1) ->ManualThreading() From 96569c63f75c2f61121ba726038ca3446626b7db Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Mon, 9 Oct 2023 09:58:55 +0200 Subject: [PATCH 09/14] More formatting issues. --- src/benchmark.cc | 3 +-- test/manual_threading_test.cc | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/benchmark.cc b/src/benchmark.cc index 23498e2d2b..d25662ff0b 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -333,8 +333,7 @@ ThreadState::ThreadState(State& s) : State(s), parent_(&s) { BM_CHECK(!started()) << "Don't create a ThreadState object after measurement has started"; timer_ = new internal::ThreadTimer(*timer_); - if (perf_counters_measurement_) - { + if (perf_counters_measurement_) { perf_counters_measurement_ = new internal::PerfCountersMeasurement( perf_counters_measurement_->names()); } diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc index 556ef51fdd..b8c249b9eb 100644 --- a/test/manual_threading_test.cc +++ b/test/manual_threading_test.cc @@ -45,7 +45,6 @@ void BM_ManualThreadingInLoop(benchmark::State& state) { std::vector pool(numWorkerThreads); for (auto _ : state) { - for (int i = 0; i < numWorkerThreads; ++i) { pool[i] = std::thread(MyBusySpinwait); } From 
82ad95f3717b89dbc6dd56b849792f2a57d93080 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Mon, 9 Oct 2023 09:58:55 +0200 Subject: [PATCH 10/14] Construct the base class State of a ThreadState like a usual State object instead of copying it. This fixes a bug, which otherwise happens, if one thread has already finished the benchmark loop and merged its results to the parent state, while another one hasn't started yet. --- include/benchmark/benchmark.h | 1 + src/benchmark.cc | 19 ++++++++++--------- src/thread_timer.h | 3 +++ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index bb65c16cc7..2057fd6475 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -984,6 +984,7 @@ class BENCHMARK_EXPORT State { internal::ThreadManager* const manager_; friend class internal::BenchmarkInstance; + friend class ThreadState; protected: void MergeThreadStateToParent(State& parent) const; diff --git a/src/benchmark.cc b/src/benchmark.cc index d25662ff0b..aefde5b410 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -329,15 +329,16 @@ void State::MergeThreadStateToParent(State& parent) const { parent.num_thread_states_++; } -ThreadState::ThreadState(State& s) : State(s), parent_(&s) { - BM_CHECK(!started()) - << "Don't create a ThreadState object after measurement has started"; - timer_ = new internal::ThreadTimer(*timer_); - if (perf_counters_measurement_) { - perf_counters_measurement_ = new internal::PerfCountersMeasurement( - perf_counters_measurement_->names()); - } -} +ThreadState::ThreadState(State& s) + : State(s.name(), s.max_iterations, s.range_, s.thread_index(), s.threads(), + new internal::ThreadTimer( + internal::ThreadTimer::CreateFromTimer(*s.timer_)), + s.manager_, + s.perf_counters_measurement_ + ? 
new internal::PerfCountersMeasurement( + s.perf_counters_measurement_->names()) + : 0), + parent_(&s) {} ThreadState::~ThreadState() { BM_CHECK(error_occurred() || iterations() >= max_iterations) diff --git a/src/thread_timer.h b/src/thread_timer.h index eb23f59561..5b226f22cd 100644 --- a/src/thread_timer.h +++ b/src/thread_timer.h @@ -18,6 +18,9 @@ class ThreadTimer { static ThreadTimer CreateProcessCpuTime() { return ThreadTimer(/*measure_process_cpu_time_=*/true); } + static ThreadTimer CreateFromTimer(const ThreadTimer& timer) { + return ThreadTimer(timer.measure_process_cpu_time); + } // Called by each thread void StartTimer() { From d9a71d01f41f7ecacf2d3d6f6ebf0923e040ae2c Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Wed, 18 Oct 2023 16:25:19 +0200 Subject: [PATCH 11/14] Improved thread sanitizer statements --- src/benchmark_api_internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 8aa0a2bccb..54dfc237df 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -84,9 +84,8 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); -// only call while holding benchmark_mutex_: void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; + ThreadManager* manager) REQUIRES(benchmark_mutex_); BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); From a8fc4af2945984b68884cb89f2d18ecf795744b2 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Wed, 18 Oct 2023 16:41:24 +0200 Subject: [PATCH 12/14] Second thread sanitzer try --- src/benchmark_api_internal.cc | 2 +- src/benchmark_api_internal.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 801d2a8aa6..db763a18b6 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ 
-121,7 +121,7 @@ void BenchmarkInstance::Teardown() const { } void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS { + ThreadManager* manager) REQUIRES(manager->benchmark_mutex_) { ThreadManager::Result& results = manager->results; results.iterations += st.iterations(); results.cpu_time_used += timer->cpu_time_used(); diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 54dfc237df..87c00f461c 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -10,6 +10,7 @@ #include "benchmark/benchmark.h" #include "commandlineflags.h" +#include "thread_manager.h" #include "mutex.h" namespace benchmark { @@ -85,7 +86,7 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) REQUIRES(benchmark_mutex_); + ThreadManager* manager) REQUIRES(manager->benchmark_mutex_); BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); From c333d804ad7ead494730db434c47075d535398e6 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Thu, 19 Oct 2023 14:18:15 +0200 Subject: [PATCH 13/14] Revert "Second thread sanitzer try" and "Improve thread sanitzer statements" --- src/benchmark_api_internal.cc | 2 +- src/benchmark_api_internal.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index db763a18b6..801d2a8aa6 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -121,7 +121,7 @@ void BenchmarkInstance::Teardown() const { } void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) REQUIRES(manager->benchmark_mutex_) { + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS { ThreadManager::Result& results = manager->results; results.iterations += st.iterations(); results.cpu_time_used += timer->cpu_time_used(); 
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 87c00f461c..8aa0a2bccb 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -10,7 +10,6 @@ #include "benchmark/benchmark.h" #include "commandlineflags.h" -#include "thread_manager.h" #include "mutex.h" namespace benchmark { @@ -85,8 +84,9 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +// only call while holding benchmark_mutex_: void MergeResults(const State& st, const ThreadTimer* timer, - ThreadManager* manager) REQUIRES(manager->benchmark_mutex_); + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); From 6514086e19aca9c1f6770d9d1bd0da34fec4908d Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Thu, 19 Oct 2023 14:59:37 +0200 Subject: [PATCH 14/14] Add support for other multi-threading APIs Support the benchmarking of code, which relies on other multi-threading APIs, e.g. OpenMP. --- docs/user_guide.md | 58 +++++++++ include/benchmark/benchmark.h | 35 +++++- src/benchmark.cc | 41 ++++++- src/benchmark_api_internal.cc | 18 ++- src/benchmark_api_internal.h | 10 ++ src/benchmark_register.cc | 1 + src/benchmark_runner.cc | 51 +++++--- src/benchmark_runner.h | 1 + src/thread_manager.h | 1 + src/thread_timer.h | 3 + test/CMakeLists.txt | 3 + test/manual_threading_test.cc | 224 ++++++++++++++++++++++++++++++++++ 12 files changed, 427 insertions(+), 19 deletions(-) create mode 100644 test/manual_threading_test.cc diff --git a/docs/user_guide.md b/docs/user_guide.md index 2ceb13eb59..b2dafe1092 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -830,6 +830,64 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); Without `UseRealTime`, CPU time is used by default. +### Manual Multithreaded Benchmarks + +Google/benchmark uses `std::thread` as multithreading environment per default. 
+If you want to use another multithreading environment (e.g. OpenMP), you can +turn off the automatic creation of threads using the `ManualThreading` function. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code here. + for (auto _ : state) { +#pragma omp parallel num_threads(state.threads()) + // Run the multithreaded test. + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +The above example creates a parallel region in each iteration. +This includes the setup and teardown of the parallel region in the time measurement, and it +adds an implicit barrier at the end of each iteration. +You can avoid these effects if you run the whole loop in parallel. +Then you must not use the `state` object directly, but create a `ThreadState` object in each thread. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel num_threads(state.threads()) + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +If you use the `ThreadState` object and explicitly specify the number of threads, then you must +use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads. +However, if you use `ThreadState` without explicitly specifying the number of threads, +then the number of threads is determined by the number of created `ThreadState` objects. +Specifying `ManualThreading` is optional in this case. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here.
+} + +BENCHMARK(BM_MultiThreaded); // measures omp_get_max_threads number of threads. +``` + ## CPU Timers diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 23103571bb..2057fd6475 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -930,6 +930,9 @@ class BENCHMARK_EXPORT State { return max_iterations - total_iterations_ + batch_leftover_; } + BENCHMARK_ALWAYS_INLINE + int GetNumThreadStates() const { return num_thread_states_; } + BENCHMARK_ALWAYS_INLINE std::string name() const { return name_; } @@ -976,12 +979,31 @@ class BENCHMARK_EXPORT State { const std::string name_; const int thread_index_; const int threads_; + int num_thread_states_; - internal::ThreadTimer* const timer_; internal::ThreadManager* const manager_; - internal::PerfCountersMeasurement* const perf_counters_measurement_; friend class internal::BenchmarkInstance; + friend class ThreadState; + + protected: + void MergeThreadStateToParent(State& parent) const; + bool started() const { return started_; } + + internal::ThreadTimer* timer_; + internal::PerfCountersMeasurement* perf_counters_measurement_; +}; + +// ThreadState can be used in a manually multithreaded benchmark loop. +class BENCHMARK_EXPORT ThreadState : public State { + public: + explicit ThreadState(State& s); + ~ThreadState(); + + private: + State* parent_; + + ThreadState(const ThreadState&); }; inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { @@ -1274,6 +1296,13 @@ class BENCHMARK_EXPORT Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); + // Don't create threads. Let the user evaluate state.threads and/or use + // ThreadState. 
+ Benchmark* ManualThreading() { + manual_threading_ = true; + return this; + } + virtual void Run(State& state) = 0; TimeUnit GetTimeUnit() const; @@ -1286,6 +1315,7 @@ class BENCHMARK_EXPORT Benchmark { const char* GetName() const; int ArgsCnt() const; const char* GetArgName(int arg) const; + bool GetExplicitThreading() const { return !thread_counts_.empty(); } private: friend class BenchmarkFamilies; @@ -1307,6 +1337,7 @@ class BENCHMARK_EXPORT Benchmark { bool measure_process_cpu_time_; bool use_real_time_; bool use_manual_time_; + bool manual_threading_; BigO complexity_; BigOFunc* complexity_lambda_; std::vector statistics_; diff --git a/src/benchmark.cc b/src/benchmark.cc index 6139e59d05..aefde5b410 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -172,8 +172,9 @@ State::State(std::string name, IterationCount max_iters, name_(std::move(name)), thread_index_(thread_i), threads_(n_threads), - timer_(timer), + num_thread_states_(0), manager_(manager), + timer_(timer), perf_counters_measurement_(perf_counters_measurement) { BM_CHECK(max_iterations != 0) << "At least one iteration must be run"; BM_CHECK_LT(thread_index_, threads_) @@ -309,6 +310,44 @@ void State::FinishKeepRunning() { manager_->StartStopBarrier(); } +void State::MergeThreadStateToParent(State& parent) const { + MutexLock l(manager_->GetBenchmarkMutex()); + internal::MergeResults(*this, timer_, manager_); + assert(parent.total_iterations_ == 0 || + parent.total_iterations_ == total_iterations_); + assert(parent.batch_leftover_ == 0 || + parent.batch_leftover_ == batch_leftover_); + parent.total_iterations_ = total_iterations_; + parent.batch_leftover_ = batch_leftover_; + parent.started_ = parent.started_ || started_; + parent.finished_ = parent.finished_ || finished_; + parent.skipped_ = + (parent.error_occurred() || error_occurred()) + ? internal::SkippedWithError + : (parent.skipped() || skipped() ? 
internal::SkippedWithMessage + : internal::NotSkipped); + parent.num_thread_states_++; +} + +ThreadState::ThreadState(State& s) + : State(s.name(), s.max_iterations, s.range_, s.thread_index(), s.threads(), + new internal::ThreadTimer( + internal::ThreadTimer::CreateFromTimer(*s.timer_)), + s.manager_, + s.perf_counters_measurement_ + ? new internal::PerfCountersMeasurement( + s.perf_counters_measurement_->names()) + : 0), + parent_(&s) {} + +ThreadState::~ThreadState() { + BM_CHECK(error_occurred() || iterations() >= max_iterations) + << "Benchmark returned before ThreadState::KeepRunning() returned false!"; + MergeThreadStateToParent(*parent_); + delete timer_; + delete perf_counters_measurement_; +} + namespace internal { namespace { diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 286f986530..801d2a8aa6 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -2,7 +2,10 @@ #include +#include "counter.h" #include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" namespace benchmark { namespace internal { @@ -27,7 +30,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx, min_time_(benchmark_.min_time_), min_warmup_time_(benchmark_.min_warmup_time_), iterations_(benchmark_.iterations_), - threads_(thread_count) { + threads_(thread_count), + manual_threading_(benchmark_.manual_threading_), + explicit_threading_(benchmark_.GetExplicitThreading()) { name_.function_name = benchmark_.name_; size_t arg_i = 0; @@ -114,5 +119,16 @@ void BenchmarkInstance::Teardown() const { teardown_(st); } } + +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS { + ThreadManager::Result& results = manager->results; + results.iterations += st.iterations(); + results.cpu_time_used += timer->cpu_time_used(); + results.real_time_used += timer->real_time_used(); + results.manual_time_used += timer->manual_time_used(); + 
results.complexity_n += st.complexity_length_n(); + Increment(&results.counters, st.counters); +} } // namespace internal } // namespace benchmark diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 94f516531b..8aa0a2bccb 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -10,6 +10,7 @@ #include "benchmark/benchmark.h" #include "commandlineflags.h" +#include "mutex.h" namespace benchmark { namespace internal { @@ -41,6 +42,8 @@ class BenchmarkInstance { int threads() const { return threads_; } void Setup() const; void Teardown() const; + bool explicit_threading() const { return explicit_threading_; } + bool manual_threading() const { return manual_threading_; } State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, internal::ThreadManager* manager, @@ -66,6 +69,9 @@ class BenchmarkInstance { double min_warmup_time_; IterationCount iterations_; int threads_; // Number of concurrent threads to us + bool manual_threading_; + bool explicit_threading_; // true: Number of threads come from a Threads() + // call typedef void (*callback_function)(const benchmark::State&); callback_function setup_ = nullptr; @@ -78,6 +84,10 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +// only call while holding benchmark_mutex_: +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; + BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc index e447c9a2d3..7c091c4ff9 100644 --- a/src/benchmark_register.cc +++ b/src/benchmark_register.cc @@ -217,6 +217,7 @@ Benchmark::Benchmark(const std::string& name) measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), + manual_threading_(false), complexity_(oNone), complexity_lambda_(nullptr), setup_(nullptr), diff --git a/src/benchmark_runner.cc 
b/src/benchmark_runner.cc index f5cd3e644b..0a2d389c2b 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -86,7 +86,7 @@ BenchmarkReporter::Run CreateRunReport( // This is the total iterations across all threads. report.iterations = results.iterations; report.time_unit = b.time_unit(); - report.threads = b.threads(); + report.threads = results.thread_count; report.repetition_index = repetition_index; report.repetitions = repeats; @@ -130,17 +130,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters, State st = b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); - BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; + + assert(b->explicit_threading() || b->threads() == 1); + + if (st.GetNumThreadStates() > 0) { + BM_CHECK((!b->explicit_threading()) || b->manual_threading()) + << "Benchmark " << b->name().str() + << " run with managed threading. It must not create ThreadStates!"; + BM_CHECK((!b->explicit_threading()) || + st.GetNumThreadStates() == b->threads()) + << "The number of ThreadStates created by Benchmark " << b->name().str() + << " doesn't match the number of threads!"; + } else { + BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + } + { MutexLock l(manager->GetBenchmarkMutex()); internal::ThreadManager::Result& results = manager->results; - results.iterations += st.iterations(); - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); + if (st.GetNumThreadStates() > 0) { + // State values as well as thread state values are summed up for + // complexity_n and user counters: + results.complexity_n += st.complexity_length_n(); + 
internal::Increment(&results.counters, st.counters); + results.thread_count = + b->explicit_threading() ? b->threads() : st.GetNumThreadStates(); + } else { + internal::MergeResults(st, &timer, manager); + results.thread_count = b->threads(); + } } manager->NotifyThreadComplete(); } @@ -234,7 +253,8 @@ BenchmarkRunner::BenchmarkRunner( has_explicit_iteration_count(b.iterations() != 0 || parsed_benchtime_flag.tag == BenchTimeType::ITERS), - pool(b.threads() - 1), + num_managed_threads(b.manual_threading() ? 1 : b.threads()), + pool(num_managed_threads - 1), iters(has_explicit_iteration_count ? ComputeIters(b_, parsed_benchtime_flag) : 1), @@ -260,7 +280,7 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n"; std::unique_ptr manager; - manager.reset(new internal::ThreadManager(b.threads())); + manager.reset(new internal::ThreadManager(num_managed_threads)); // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { @@ -287,17 +307,18 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { manager.reset(); // Adjust real/manual time stats since they were reported per thread. - i.results.real_time_used /= b.threads(); - i.results.manual_time_used /= b.threads(); + i.results.real_time_used /= i.results.thread_count; + i.results.manual_time_used /= i.results.thread_count; // If we were measuring whole-process CPU usage, adjust the CPU time too. - if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads(); + if (b.measure_process_cpu_time()) + i.results.cpu_time_used /= i.results.thread_count; BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" << i.results.real_time_used << "\n"; // By using KeepRunningBatch a benchmark can iterate more times than // requested, so take the iteration count from i.results. 
- i.iters = i.results.iterations / b.threads(); + i.iters = i.results.iterations / i.results.thread_count; // Base decisions off of real time if requested by this benchmark. i.seconds = i.results.cpu_time_used; diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h index db2fa04396..32e91b76cc 100644 --- a/src/benchmark_runner.h +++ b/src/benchmark_runner.h @@ -93,6 +93,7 @@ class BenchmarkRunner { bool warmup_done; const int repeats; const bool has_explicit_iteration_count; + const int num_managed_threads; // must be before pool int num_repetitions_done = 0; diff --git a/src/thread_manager.h b/src/thread_manager.h index 819b3c44db..612e61e081 100644 --- a/src/thread_manager.h +++ b/src/thread_manager.h @@ -45,6 +45,7 @@ class ThreadManager { std::string report_label_; std::string skip_message_; internal::Skipped skipped_ = internal::NotSkipped; + int thread_count = 0; UserCounters counters; }; GUARDED_BY(GetBenchmarkMutex()) Result results; diff --git a/src/thread_timer.h b/src/thread_timer.h index eb23f59561..5b226f22cd 100644 --- a/src/thread_timer.h +++ b/src/thread_timer.h @@ -18,6 +18,9 @@ class ThreadTimer { static ThreadTimer CreateProcessCpuTime() { return ThreadTimer(/*measure_process_cpu_time_=*/true); } + static ThreadTimer CreateFromTimer(const ThreadTimer& timer) { + return ThreadTimer(timer.measure_process_cpu_time); + } // Called by each thread void StartTimer() { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ac1a00f582..833bb8bdb4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -167,6 +167,9 @@ add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time compile_output_test(internal_threading_test) add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s) +compile_output_test(manual_threading_test) +add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s) + compile_output_test(report_aggregates_only_test) add_test(NAME 
report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s) diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc new file mode 100644 index 0000000000..b8c249b9eb --- /dev/null +++ b/test/manual_threading_test.cc @@ -0,0 +1,224 @@ + +#undef NDEBUG + +#include +#include +#include + +#include "../src/timers.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +namespace { + +static const std::chrono::duration time_frame(50); +static const double time_frame_in_sec( + std::chrono::duration_cast>>( + time_frame) + .count()); + +void MyBusySpinwait() { + const auto start = benchmark::ChronoClockNow(); + + while (true) { + const auto now = benchmark::ChronoClockNow(); + const auto elapsed = now - start; + + if (std::chrono::duration(elapsed) >= + time_frame) + return; + } +} + +} // namespace + +// ========================================================================= // +// --------------------------- TEST CASES BEGIN ---------------------------- // +// ========================================================================= // + +// ========================================================================= // +// BM_ManualThreadingInLoop +// Measurements include the creation and joining of threads. 
+
+void BM_ManualThreadingInLoop(benchmark::State& state) {
+  int numWorkerThreads = state.threads() - 1;  // caller spins as the last one
+  std::vector<std::thread> pool(numWorkerThreads);
+
+  for (auto _ : state) {
+    for (int i = 0; i < numWorkerThreads; ++i) {
+      pool[i] = std::thread(MyBusySpinwait);
+    }
+    MyBusySpinwait();
+    for (int i = 0; i < numWorkerThreads; ++i) {
+      pool[i].join();
+    }
+    state.SetIterationTime(time_frame_in_sec);  // read by UseManualTime runs
+  }
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1);
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->UseManualTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2);
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->UseManualTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingInLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// BM_ManualThreadingBeforeLoop
+// Creation of threads is done before the start of the measurement,
+// joining after the finish of the measurement.
+
+void BM_ManualThreadingBeforeLoop(benchmark::State& state) {
+  std::promise<void> thread_starter;
+  auto starter_future = thread_starter.get_future();
+
+  auto threadedLoop = [&]() {
+    starter_future.wait();  // block until set_value() below releases workers
+    benchmark::ThreadState ts(state);  // one ThreadState per worker thread
+    for (auto _ : ts) {
+      MyBusySpinwait();
+      ts.SetIterationTime(time_frame_in_sec);
+    }
+  };
+
+  std::vector<std::thread> pool(state.threads());
+  for (int i = 0; i < state.threads(); ++i) {
+    pool[i] = std::thread(threadedLoop);
+  }
+  thread_starter.set_value();  // all workers exist now; let them start
+  for (int i = 0; i < state.threads(); ++i) {
+    pool[i].join();
+  }
+
+  state.counters["invtime"] =
+      benchmark::Counter{1, benchmark::Counter::kIsRate};
+}
+
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1);
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->UseManualTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(1)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2);
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->UseManualTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->MeasureProcessCPUTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseRealTime();
+BENCHMARK(BM_ManualThreadingBeforeLoop)
+    ->Iterations(1)
+    ->ManualThreading()
+    ->Threads(2)
+    ->MeasureProcessCPUTime()
+    ->UseManualTime();
+
+// ========================================================================= //
+// ---------------------------- TEST CASES END ----------------------------- //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }