From 4d42dda801470df6cd39b239a4b526a850773540 Mon Sep 17 00:00:00 2001 From: Olaf Krzikalla Date: Mon, 24 Aug 2020 14:07:15 +0200 Subject: [PATCH] Add support for other multi-threading APIs --- README.md | 54 +++++++++++++++++++++++++++++++++++ include/benchmark/benchmark.h | 20 +++++++++++++ src/benchmark.cc | 40 +++++++++++++++++++++++++- src/benchmark_api_internal.h | 4 +++ src/benchmark_register.cc | 3 ++ src/benchmark_runner.cc | 47 +++++++++++++++++++++--------- src/thread_manager.h | 1 + 7 files changed, 155 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 02a3bfad80..a32becb752 100644 --- a/README.md +++ b/README.md @@ -905,6 +905,60 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); Without `UseRealTime`, CPU time is used by default. +Google/benchmark uses `std::thread` as its multithreading environment by default. +If you want to use another multithreading environment (e.g. OpenMP), you can +turn off the automatic threading using the `ManualThreading` function. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code here. + for (auto _ : state) { +#pragma omp parallel num_threads(state.threads) + // Run the multithreaded test. + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +The above example creates a parallel region in each iteration. +This includes the setup and teardown of the parallel region in the time measurement and it +adds an implicit barrier at the end of each iteration. +You can avoid these effects if you run the whole loop in parallel. +Then you must not use the `state` object directly, but create a `ThreadState` object in each thread. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel num_threads(state.threads) + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. 
+ } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +If you use the `ThreadState` object and explicitly specify the number of threads, then you must +use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads. +However, if you use `ThreadState` without explicitly specifying the number of threads, +then the number of threads is determined by the number of created `ThreadState` objects. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code here. +#pragma omp parallel + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded); // measures omp_get_max_threads number of threads. +``` +Specifying `ManualThreading` is optional in this case. + + ### CPU Timers diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index da638f952d..d35dc509b7 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -641,6 +641,9 @@ class State { return max_iterations - total_iterations_ + batch_leftover_; } + BENCHMARK_ALWAYS_INLINE + int GetNumThreadStates() const { return num_thread_states_; } + private : // items we expect on the first cache line (ie 64 bytes of the struct) // When total_iterations_ is 0, KeepRunning() and friends will return false. 
@@ -685,8 +688,21 @@ class State { void FinishKeepRunning(); internal::ThreadTimer* timer_; internal::ThreadManager* manager_; + int num_thread_states_; friend struct internal::BenchmarkInstance; + friend class ThreadState; +}; + +class ThreadState : public State +{ + public: + explicit ThreadState(State& s); + ~ThreadState(); + private: + State* parent_; + + ThreadState(const ThreadState&); }; inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { @@ -945,6 +961,9 @@ class Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); + // Don't create threads. Let the user evaluate state.threads and/or use ThreadState. + Benchmark* ManualThreading() { manual_threading_ = true; return this; } + virtual void Run(State& state) = 0; protected: @@ -969,6 +988,7 @@ class Benchmark { bool measure_process_cpu_time_; bool use_real_time_; bool use_manual_time_; + bool manual_threading_; BigO complexity_; BigOFunc* complexity_lambda_; std::vector statistics_; diff --git a/src/benchmark.cc b/src/benchmark.cc index 1c049f2884..411040a8f6 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -130,7 +130,8 @@ State::State(IterationCount max_iters, const std::vector& ranges, thread_index(thread_i), threads(n_threads), timer_(timer), - manager_(manager) { + manager_(manager), + num_thread_states_(0) { CHECK(max_iterations != 0) << "At least one iteration must be run"; CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; @@ -212,6 +213,33 @@ void State::FinishKeepRunning() { manager_->StartStopBarrier(); } +ThreadState::ThreadState(State& s) : + State(s), + parent_(&s) +{ + CHECK(!s.started_) << "Don't create a ThreadState object after measurement has started"; + timer_ = new internal::ThreadTimer(*timer_); +} + +ThreadState::~ThreadState() +{ + CHECK(error_occurred() || iterations() >= max_iterations) + << "Benchmark returned before ThreadState::KeepRunning() returned false!"; + { + MutexLock 
l(manager_->GetBenchmarkMutex()); + internal::MergeResults(*this, timer_, manager_); + assert(parent_->total_iterations_ == 0 || parent_->total_iterations_ == total_iterations_); + assert(parent_->batch_leftover_ == 0 || parent_->batch_leftover_ == batch_leftover_); + parent_->total_iterations_ = total_iterations_; + parent_->batch_leftover_ = batch_leftover_; + parent_->started_ = parent_->started_ || started_; + parent_->finished_ = parent_->finished_ || finished_; + parent_->error_occurred_ = parent_->error_occurred_ || error_occurred_; + parent_->num_thread_states_++; + } + delete timer_; +} + namespace internal { namespace { @@ -315,6 +343,16 @@ bool IsZero(double n) { return std::abs(n) < std::numeric_limits::epsilon(); } +void MergeResults(State& st, ThreadTimer* timer, ThreadManager* manager) +{ + ThreadManager::Result& results = manager->results; + results.cpu_time_used += timer->cpu_time_used(); + results.real_time_used = std::max(results.real_time_used, timer->real_time_used()); + results.manual_time_used += timer->manual_time_used(); + results.complexity_n += st.complexity_length_n(); + Increment(&results.counters, st.counters); +} + ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { int output_opts = ConsoleReporter::OO_Defaults; auto is_benchmark_color = [force_no_color]() -> bool { diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 264eff95c5..57e32a5ecd 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -34,6 +34,8 @@ struct BenchmarkInstance { double min_time; IterationCount iterations; int threads; // Number of concurrent threads to us + bool explicit_threading; // true: Number of threads come from a Threads() call + bool manual_threading; State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, internal::ThreadManager* manager) const; @@ -45,6 +47,8 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +void MergeResults(State& 
st, ThreadTimer* timer, ThreadManager* manager); + ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); } // end namespace internal diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc index cca39b2215..906c54f2ac 100644 --- a/src/benchmark_register.cc +++ b/src/benchmark_register.cc @@ -174,6 +174,8 @@ bool BenchmarkFamilies::FindBenchmarks( instance.complexity_lambda = family->complexity_lambda_; instance.statistics = &family->statistics_; instance.threads = num_threads; + instance.manual_threading = family->manual_threading_; + instance.explicit_threading = !family->thread_counts_.empty(); // Add arguments to instance name size_t arg_i = 0; @@ -268,6 +270,7 @@ Benchmark::Benchmark(const char* name) measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), + manual_threading_(false), complexity_(oNone), complexity_lambda_(nullptr) { ComputeStatistics("mean", StatisticsMean); diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc index 7bc6b6329e..379dae01b8 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -77,7 +77,7 @@ BenchmarkReporter::Run CreateRunReport( // This is the total iterations across all threads. report.iterations = results.iterations; report.time_unit = b.time_unit; - report.threads = b.threads; + report.threads = results.thread_count; report.repetition_index = repetition_index; report.repetitions = b.repetitions; @@ -117,17 +117,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters, ? 
internal::ThreadTimer::CreateProcessCpuTime() : internal::ThreadTimer::Create()); State st = b->Run(iters, thread_id, &timer, manager); - CHECK(st.error_occurred() || st.iterations() >= st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; + assert(b->explicit_threading || b->threads == 1); + if (st.GetNumThreadStates() > 0) + { + CHECK((!b->explicit_threading) || b->manual_threading) + << "Benchmark " << b->name.str() << " run with managed threading. It must not create ThreadStates!"; + CHECK((!b->explicit_threading) || st.GetNumThreadStates() == b->threads) + << "The number of ThreadStates created by Benchmark " << b->name.str() + << " doesn't match the number of threads!"; + } + else + { + CHECK(st.error_occurred() || st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + } { MutexLock l(manager->GetBenchmarkMutex()); internal::ThreadManager::Result& results = manager->results; results.iterations += st.iterations(); - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); + if (st.GetNumThreadStates() > 0) + { + // State values as well as thread state values are summed up for complexity_n and user counters: + results.complexity_n += st.complexity_length_n(); + internal::Increment(&results.counters, st.counters); + results.thread_count = b->explicit_threading ? b->threads : st.GetNumThreadStates(); + } + else + { + internal::MergeResults(st, &timer, manager); + results.thread_count = b->threads; + } } manager->NotifyThreadComplete(); } @@ -142,7 +161,8 @@ class BenchmarkRunner { repeats(b.repetitions != 0 ? b.repetitions : FLAGS_benchmark_repetitions), has_explicit_iteration_count(b.iterations != 0), - pool(b.threads - 1), + num_managed_threads(b.manual_threading ? 
1 : b.threads), + pool(num_managed_threads - 1), iters(has_explicit_iteration_count ? b.iterations : 1) { run_results.display_report_aggregates_only = (FLAGS_benchmark_report_aggregates_only || @@ -186,6 +206,7 @@ class BenchmarkRunner { const int repeats; const bool has_explicit_iteration_count; + const int num_managed_threads; // number of managed threads, must be before pool std::vector pool; IterationCount iters; // preserved between repetitions! @@ -201,7 +222,7 @@ class BenchmarkRunner { VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n"; std::unique_ptr manager; - manager.reset(new internal::ThreadManager(b.threads)); + manager.reset(new internal::ThreadManager(num_managed_threads)); // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { @@ -228,10 +249,10 @@ class BenchmarkRunner { manager.reset(); // Adjust real/manual time stats since they were reported per thread. - i.results.real_time_used /= b.threads; - i.results.manual_time_used /= b.threads; + i.results.real_time_used /= i.results.thread_count; + i.results.manual_time_used /= i.results.thread_count; // If we were measuring whole-process CPU usage, adjust the CPU time too. - if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads; + if (b.measure_process_cpu_time) i.results.cpu_time_used /= i.results.thread_count; VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" << i.results.real_time_used << "\n"; diff --git a/src/thread_manager.h b/src/thread_manager.h index 28e2dd53af..7b0aa4ae86 100644 --- a/src/thread_manager.h +++ b/src/thread_manager.h @@ -46,6 +46,7 @@ class ThreadManager { std::string report_label_; std::string error_message_; bool has_error_ = false; + int thread_count = 0; UserCounters counters; }; GUARDED_BY(GetBenchmarkMutex()) Result results;