diff --git a/docs/user_guide.md b/docs/user_guide.md index 2ceb13eb59..b2dafe1092 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -830,6 +830,64 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); Without `UseRealTime`, CPU time is used by default. +### Manual Multithreaded Benchmarks + +Google/benchmark uses `std::thread` as multithreading environment per default. +If you want to use another multithreading environment (e.g. OpenMP), you can +turn off the automatic creation of threads using the `ManualThreading` function. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code here. + for (auto _ : state) { +#pragma omp parallel num_threads(state.threads) + // Run the multithreaded test. + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +The above example creates a parallel region in each iteration. +This includes the setup and teardown of the parallel region in the time measurement, and it +adds an implicit barrier at the end of each iteration. +You can avoid these effects, if you run the whole loop in parallel. +Then you must not use the `state` object directly, but create a `ThreadState` object in each thread. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel num_threads(state.threads) + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4); +``` +If you use the `ThreadState` object and explicitly specify the number of threads, then you must +use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads. +However, if you use `ThreadState` without explicitly specifying the number of threads, +then the number of threads is determined by the number of created `ThreadState` objects. +Specifying `ManualThreading` is optional in this case. +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + // Setup code (shared objects) here. +#pragma omp parallel + { + // Thread-local setup code here. + for (auto _ : benchmark::ThreadState(state)) { + // Run the multithreaded test. + } + } + // Teardown code here. +} + +BENCHMARK(BM_MultiThreaded); // measures omp_get_max_threads number of threads. +``` + ## CPU Timers diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 23103571bb..2057fd6475 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -930,6 +930,9 @@ class BENCHMARK_EXPORT State { return max_iterations - total_iterations_ + batch_leftover_; } + BENCHMARK_ALWAYS_INLINE + int GetNumThreadStates() const { return num_thread_states_; } + BENCHMARK_ALWAYS_INLINE std::string name() const { return name_; } @@ -976,12 +979,31 @@ class BENCHMARK_EXPORT State { const std::string name_; const int thread_index_; const int threads_; + int num_thread_states_; - internal::ThreadTimer* const timer_; internal::ThreadManager* const manager_; - internal::PerfCountersMeasurement* const perf_counters_measurement_; friend class internal::BenchmarkInstance; + friend class ThreadState; + + protected: + void MergeThreadStateToParent(State& parent) const; + bool started() const { return started_; } + + internal::ThreadTimer* timer_; + internal::PerfCountersMeasurement* perf_counters_measurement_; +}; + +// ThreadState can be used in a manually multithreaded benchmark loop. +class BENCHMARK_EXPORT ThreadState : public State { + public: + explicit ThreadState(State& s); + ~ThreadState(); + + private: + State* parent_; + + ThreadState(const ThreadState&); }; inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { @@ -1274,6 +1296,13 @@ class BENCHMARK_EXPORT Benchmark { // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) Benchmark* ThreadPerCpu(); + // Don't create threads. Let the user evaluate state.threads and/or use + // ThreadState. + Benchmark* ManualThreading() { + manual_threading_ = true; + return this; + } + virtual void Run(State& state) = 0; TimeUnit GetTimeUnit() const; @@ -1286,6 +1315,7 @@ class BENCHMARK_EXPORT Benchmark { const char* GetName() const; int ArgsCnt() const; const char* GetArgName(int arg) const; + bool GetExplicitThreading() const { return !thread_counts_.empty(); } private: friend class BenchmarkFamilies; @@ -1307,6 +1337,7 @@ class BENCHMARK_EXPORT Benchmark { bool measure_process_cpu_time_; bool use_real_time_; bool use_manual_time_; + bool manual_threading_; BigO complexity_; BigOFunc* complexity_lambda_; std::vector statistics_; diff --git a/src/benchmark.cc b/src/benchmark.cc index 6139e59d05..aefde5b410 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -172,8 +172,9 @@ State::State(std::string name, IterationCount max_iters, name_(std::move(name)), thread_index_(thread_i), threads_(n_threads), - timer_(timer), + num_thread_states_(0), manager_(manager), + timer_(timer), perf_counters_measurement_(perf_counters_measurement) { BM_CHECK(max_iterations != 0) << "At least one iteration must be run"; BM_CHECK_LT(thread_index_, threads_) @@ -309,6 +310,44 @@ void State::FinishKeepRunning() { manager_->StartStopBarrier(); } +void State::MergeThreadStateToParent(State& parent) const { + MutexLock l(manager_->GetBenchmarkMutex()); + internal::MergeResults(*this, timer_, manager_); + assert(parent.total_iterations_ == 0 || + parent.total_iterations_ == total_iterations_); + assert(parent.batch_leftover_ == 0 || + parent.batch_leftover_ == batch_leftover_); + parent.total_iterations_ = total_iterations_; + parent.batch_leftover_ = batch_leftover_; + parent.started_ = parent.started_ || started_; + parent.finished_ = parent.finished_ || finished_; + parent.skipped_ = + (parent.error_occurred() || error_occurred()) + ? internal::SkippedWithError + : (parent.skipped() || skipped() ? internal::SkippedWithMessage + : internal::NotSkipped); + parent.num_thread_states_++; +} + +ThreadState::ThreadState(State& s) + : State(s.name(), s.max_iterations, s.range_, s.thread_index(), s.threads(), + new internal::ThreadTimer( + internal::ThreadTimer::CreateFromTimer(*s.timer_)), + s.manager_, + s.perf_counters_measurement_ + ? new internal::PerfCountersMeasurement( + s.perf_counters_measurement_->names()) + : 0), + parent_(&s) {} + +ThreadState::~ThreadState() { + BM_CHECK(error_occurred() || iterations() >= max_iterations) + << "Benchmark returned before ThreadState::KeepRunning() returned false!"; + MergeThreadStateToParent(*parent_); + delete timer_; + delete perf_counters_measurement_; +} + namespace internal { namespace { diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 286f986530..801d2a8aa6 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -2,7 +2,10 @@ #include +#include "counter.h" #include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" namespace benchmark { namespace internal { @@ -27,7 +30,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx, min_time_(benchmark_.min_time_), min_warmup_time_(benchmark_.min_warmup_time_), iterations_(benchmark_.iterations_), - threads_(thread_count) { + threads_(thread_count), + manual_threading_(benchmark_.manual_threading_), + explicit_threading_(benchmark_.GetExplicitThreading()) { name_.function_name = benchmark_.name_; size_t arg_i = 0; @@ -114,5 +119,16 @@ void BenchmarkInstance::Teardown() const { teardown_(st); } } + +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS { + ThreadManager::Result& results = manager->results; + results.iterations += st.iterations(); + results.cpu_time_used += timer->cpu_time_used(); + results.real_time_used += timer->real_time_used(); + results.manual_time_used += timer->manual_time_used(); + results.complexity_n += st.complexity_length_n(); + Increment(&results.counters, st.counters); +} } // namespace internal } // namespace benchmark diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 94f516531b..8aa0a2bccb 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -10,6 +10,7 @@ #include "benchmark/benchmark.h" #include "commandlineflags.h" +#include "mutex.h" namespace benchmark { namespace internal { @@ -41,6 +42,8 @@ class BenchmarkInstance { int threads() const { return threads_; } void Setup() const; void Teardown() const; + bool explicit_threading() const { return explicit_threading_; } + bool manual_threading() const { return manual_threading_; } State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, internal::ThreadManager* manager, @@ -66,6 +69,9 @@ class BenchmarkInstance { double min_warmup_time_; IterationCount iterations_; int threads_; // Number of concurrent threads to us + bool manual_threading_; + bool explicit_threading_; // true: Number of threads come from a Threads() + // call typedef void (*callback_function)(const benchmark::State&); callback_function setup_ = nullptr; @@ -78,6 +84,10 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +// only call while holding benchmark_mutex_: +void MergeResults(const State& st, const ThreadTimer* timer, + ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS; + BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc index e447c9a2d3..7c091c4ff9 100644 --- a/src/benchmark_register.cc +++ b/src/benchmark_register.cc @@ -217,6 +217,7 @@ Benchmark::Benchmark(const std::string& name) measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), + manual_threading_(false), complexity_(oNone), complexity_lambda_(nullptr), setup_(nullptr), diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc index f5cd3e644b..0a2d389c2b 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -86,7 +86,7 @@ BenchmarkReporter::Run CreateRunReport( // This is the total iterations across all threads. report.iterations = results.iterations; report.time_unit = b.time_unit(); - report.threads = b.threads(); + report.threads = results.thread_count; report.repetition_index = repetition_index; report.repetitions = repeats; @@ -130,17 +130,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters, State st = b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); - BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) - << "Benchmark returned before State::KeepRunning() returned false!"; + + assert(b->explicit_threading() || b->threads() == 1); + + if (st.GetNumThreadStates() > 0) { + BM_CHECK((!b->explicit_threading()) || b->manual_threading()) + << "Benchmark " << b->name().str() + << " run with managed threading. It must not create ThreadStates!"; + BM_CHECK((!b->explicit_threading()) || + st.GetNumThreadStates() == b->threads()) + << "The number of ThreadStates created by Benchmark " << b->name().str() + << " doesn't match the number of threads!"; + } else { + BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + } + { MutexLock l(manager->GetBenchmarkMutex()); internal::ThreadManager::Result& results = manager->results; - results.iterations += st.iterations(); - results.cpu_time_used += timer.cpu_time_used(); - results.real_time_used += timer.real_time_used(); - results.manual_time_used += timer.manual_time_used(); - results.complexity_n += st.complexity_length_n(); - internal::Increment(&results.counters, st.counters); + if (st.GetNumThreadStates() > 0) { + // State values as well as thread state values are summed up for + // complexity_n and user counters: + results.complexity_n += st.complexity_length_n(); + internal::Increment(&results.counters, st.counters); + results.thread_count = + b->explicit_threading() ? b->threads() : st.GetNumThreadStates(); + } else { + internal::MergeResults(st, &timer, manager); + results.thread_count = b->threads(); + } } manager->NotifyThreadComplete(); } @@ -234,7 +253,8 @@ BenchmarkRunner::BenchmarkRunner( has_explicit_iteration_count(b.iterations() != 0 || parsed_benchtime_flag.tag == BenchTimeType::ITERS), - pool(b.threads() - 1), + num_managed_threads(b.manual_threading() ? 1 : b.threads()), + pool(num_managed_threads - 1), iters(has_explicit_iteration_count ? ComputeIters(b_, parsed_benchtime_flag) : 1), @@ -260,7 +280,7 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n"; std::unique_ptr manager; - manager.reset(new internal::ThreadManager(b.threads())); + manager.reset(new internal::ThreadManager(num_managed_threads)); // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { @@ -287,17 +307,18 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { manager.reset(); // Adjust real/manual time stats since they were reported per thread. - i.results.real_time_used /= b.threads(); - i.results.manual_time_used /= b.threads(); + i.results.real_time_used /= i.results.thread_count; + i.results.manual_time_used /= i.results.thread_count; // If we were measuring whole-process CPU usage, adjust the CPU time too. - if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads(); + if (b.measure_process_cpu_time()) + i.results.cpu_time_used /= i.results.thread_count; BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" << i.results.real_time_used << "\n"; // By using KeepRunningBatch a benchmark can iterate more times than // requested, so take the iteration count from i.results. - i.iters = i.results.iterations / b.threads(); + i.iters = i.results.iterations / i.results.thread_count; // Base decisions off of real time if requested by this benchmark. i.seconds = i.results.cpu_time_used; diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h index db2fa04396..32e91b76cc 100644 --- a/src/benchmark_runner.h +++ b/src/benchmark_runner.h @@ -93,6 +93,7 @@ class BenchmarkRunner { bool warmup_done; const int repeats; const bool has_explicit_iteration_count; + const int num_managed_threads; // must be before pool int num_repetitions_done = 0; diff --git a/src/thread_manager.h b/src/thread_manager.h index 819b3c44db..612e61e081 100644 --- a/src/thread_manager.h +++ b/src/thread_manager.h @@ -45,6 +45,7 @@ class ThreadManager { std::string report_label_; std::string skip_message_; internal::Skipped skipped_ = internal::NotSkipped; + int thread_count = 0; UserCounters counters; }; GUARDED_BY(GetBenchmarkMutex()) Result results; diff --git a/src/thread_timer.h b/src/thread_timer.h index eb23f59561..5b226f22cd 100644 --- a/src/thread_timer.h +++ b/src/thread_timer.h @@ -18,6 +18,9 @@ class ThreadTimer { static ThreadTimer CreateProcessCpuTime() { return ThreadTimer(/*measure_process_cpu_time_=*/true); } + static ThreadTimer CreateFromTimer(const ThreadTimer& timer) { + return ThreadTimer(timer.measure_process_cpu_time); + } // Called by each thread void StartTimer() { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ac1a00f582..833bb8bdb4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -167,6 +167,9 @@ add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time compile_output_test(internal_threading_test) add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s) +compile_output_test(manual_threading_test) +add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s) + compile_output_test(report_aggregates_only_test) add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s) diff --git a/test/manual_threading_test.cc b/test/manual_threading_test.cc new file mode 100644 index 0000000000..b8c249b9eb --- /dev/null +++ b/test/manual_threading_test.cc @@ -0,0 +1,224 @@ + +#undef NDEBUG + +#include +#include +#include + +#include "../src/timers.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +namespace { + +static const std::chrono::duration time_frame(50); +static const double time_frame_in_sec( + std::chrono::duration_cast>>( + time_frame) + .count()); + +void MyBusySpinwait() { + const auto start = benchmark::ChronoClockNow(); + + while (true) { + const auto now = benchmark::ChronoClockNow(); + const auto elapsed = now - start; + + if (std::chrono::duration(elapsed) >= + time_frame) + return; + } +} + +} // namespace + +// ========================================================================= // +// --------------------------- TEST CASES BEGIN ---------------------------- // +// ========================================================================= // + +// ========================================================================= // +// BM_ManualThreadingInLoop +// Measurements include the creation and joining of threads. + +void BM_ManualThreadingInLoop(benchmark::State& state) { + int numWorkerThreads = state.threads() - 1; + std::vector pool(numWorkerThreads); + + for (auto _ : state) { + for (int i = 0; i < numWorkerThreads; ++i) { + pool[i] = std::thread(MyBusySpinwait); + } + MyBusySpinwait(); + for (int i = 0; i < numWorkerThreads; ++i) { + pool[i].join(); + } + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingInLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_ManualThreadingBeforeLoop +// Creation of threads is done before the start of the measurement, +// joining after the finish of the measurement. + +void BM_ManualThreadingBeforeLoop(benchmark::State& state) { + std::promise thread_starter; + auto starter_future = thread_starter.get_future(); + + auto threadedLoop = [&]() { + starter_future.wait(); + benchmark::ThreadState ts(state); + for (auto _ : ts) { + MyBusySpinwait(); + ts.SetIterationTime(time_frame_in_sec); + } + }; + + std::vector pool(state.threads()); + for (int i = 0; i < state.threads(); ++i) { + pool[i] = std::thread(threadedLoop); + } + thread_starter.set_value(); + for (int i = 0; i < state.threads(); ++i) { + pool[i].join(); + } + + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_ManualThreadingBeforeLoop) + ->Iterations(1) + ->ManualThreading() + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// ---------------------------- TEST CASES END ----------------------------- // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }