diff --git a/README.md b/README.md
index 02a3bfad80..a32becb752 100644
--- a/README.md
+++ b/README.md
@@ -905,6 +905,60 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
Without `UseRealTime`, CPU time is used by default.
+By default, Google/benchmark uses `std::thread` as its multithreading environment.
+If you want to use another multithreading environment (e.g. OpenMP), you can
+turn off the automatic threading using the `ManualThreading` function.
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+  // Setup code here.
+  for (auto _ : state) {
+    // The parallel directive needs a statement to apply to, so the
+    // multithreaded work goes into its own braced block.
+#pragma omp parallel num_threads(state.threads)
+    {
+      // Run the multithreaded test.
+    }
+  }
+  // Teardown code here.
+}
+
+BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4);
+```
+The above example creates a parallel region in each iteration.
+This includes the setup and teardown of the parallel region in the time measurement, and it
+adds an implicit barrier at the end of each iteration.
+You can avoid these effects if you run the whole loop in parallel.
+In that case, you must not use the `state` object directly; instead, create a `ThreadState` object in each thread.
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+ // Setup code (shared objects) here.
+#pragma omp parallel num_threads(state.threads)
+ {
+ // Thread-local setup code here.
+ for (auto _ : benchmark::ThreadState(state)) {
+ // Run the multithreaded test.
+ }
+ }
+ // Teardown code here.
+}
+
+BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4);
+```
+If you use the `ThreadState` object and explicitly specify the number of threads, then you must
+use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads.
+However, if you use `ThreadState` without explicitly specifying the number of threads,
+then the number of threads is determined by the number of created `ThreadState` objects.
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+ // Setup code here.
+#pragma omp parallel
+ for (auto _ : benchmark::ThreadState(state)) {
+ // Run the multithreaded test.
+ }
+ // Teardown code here.
+}
+
+BENCHMARK(BM_MultiThreaded); // measures omp_get_max_threads number of threads.
+```
+Specifying `ManualThreading` is optional in this case.
+
+
### CPU Timers
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index da638f952d..d35dc509b7 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -641,6 +641,9 @@ class State {
return max_iterations - total_iterations_ + batch_leftover_;
}
+  // Returns the current value of num_thread_states_, the counter that each
+  // ThreadState increments on its parent State when it is destroyed.
+  BENCHMARK_ALWAYS_INLINE
+  int GetNumThreadStates() const {
+    return num_thread_states_;
+  }
+
private
: // items we expect on the first cache line (ie 64 bytes of the struct)
// When total_iterations_ is 0, KeepRunning() and friends will return false.
@@ -685,8 +688,21 @@ class State {
void FinishKeepRunning();
internal::ThreadTimer* timer_;
internal::ThreadManager* manager_;
+ int num_thread_states_;
friend struct internal::BenchmarkInstance;
+ friend class ThreadState;
+};
+
+// A per-thread State for benchmarks that create their own threads (see
+// Benchmark::ManualThreading()). It is copy-constructed from the State that
+// was passed to the benchmark function; when it is destroyed it merges its
+// measurements into the shared results and updates that parent State.
+class ThreadState : public State {
+ public:
+  // `s` is the State handed to the benchmark function. Construction is
+  // rejected (CHECK) once measurement on `s` has already started.
+  explicit ThreadState(State& s);
+  ~ThreadState();
+
+ private:
+  State* parent_;  // The State this object was created from; not owned.
+
+  // Neither copyable nor assignable: the destructor deletes a timer owned by
+  // this object and writes back to the parent, so a second copy would repeat
+  // both. Declared private to forbid use.
+  ThreadState(const ThreadState&);
+  ThreadState& operator=(const ThreadState&);
+};
inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
@@ -945,6 +961,9 @@ class Benchmark {
// Equivalent to ThreadRange(NumCPUs(), NumCPUs())
Benchmark* ThreadPerCpu();
+  // Turns off the library-managed threads for this benchmark. The benchmark
+  // function is then responsible for its own threading: it can evaluate
+  // state.threads and/or create ThreadState objects. Returns this so that
+  // further configuration calls can be chained.
+  Benchmark* ManualThreading() {
+    manual_threading_ = true;
+    return this;
+  }
+
virtual void Run(State& state) = 0;
protected:
@@ -969,6 +988,7 @@ class Benchmark {
bool measure_process_cpu_time_;
bool use_real_time_;
bool use_manual_time_;
+ bool manual_threading_;
BigO complexity_;
BigOFunc* complexity_lambda_;
std::vector statistics_;
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 1c049f2884..411040a8f6 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -130,7 +130,8 @@ State::State(IterationCount max_iters, const std::vector& ranges,
thread_index(thread_i),
threads(n_threads),
timer_(timer),
- manager_(manager) {
+ manager_(manager),
+ num_thread_states_(0) {
CHECK(max_iterations != 0) << "At least one iteration must be run";
CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
@@ -212,6 +213,33 @@ void State::FinishKeepRunning() {
manager_->StartStopBarrier();
}
+// Builds a per-thread copy of `s` and remembers `s` as the parent so the
+// destructor can report back to it.
+ThreadState::ThreadState(State& s) : State(s), parent_(&s) {
+  // A ThreadState may only be created while the parent has not yet started
+  // measuring.
+  CHECK(!s.started_)
+      << "Don't create a ThreadState object after measurement has started";
+
+  // The base-class copy left timer_ pointing at the parent's timer. Replace it
+  // with a private copy owned by this object (deleted in the destructor).
+  internal::ThreadTimer* const own_timer = new internal::ThreadTimer(*timer_);
+  timer_ = own_timer;
+}
+
+// Checks that this thread's benchmark loop ran to completion (or reported an
+// error), merges this thread's measurements into the manager's results, and
+// propagates the loop state to the parent State.
+ThreadState::~ThreadState() {
+  CHECK(error_occurred() || iterations() >= max_iterations)
+      << "Benchmark returned before ThreadState::KeepRunning() returned false!";
+  {
+    // Everything in this scope touches the manager's results or the parent
+    // State, so it runs under the benchmark mutex.
+    MutexLock guard(manager_->GetBenchmarkMutex());
+    internal::MergeResults(*this, timer_, manager_);
+
+    State& parent = *parent_;
+
+    // Iteration bookkeeping is copied, not accumulated. The asserts expect the
+    // parent to be either still at zero or already equal to this thread.
+    assert(parent.total_iterations_ == 0 || parent.total_iterations_ == total_iterations_);
+    assert(parent.batch_leftover_ == 0 || parent.batch_leftover_ == batch_leftover_);
+    parent.total_iterations_ = total_iterations_;
+    parent.batch_leftover_ = batch_leftover_;
+
+    // Status flags are OR-ed into the parent: once set by any thread they
+    // stay set.
+    parent.started_ = parent.started_ || started_;
+    parent.finished_ = parent.finished_ || finished_;
+    parent.error_occurred_ = parent.error_occurred_ || error_occurred_;
+
+    // Count this ThreadState on the parent.
+    ++parent.num_thread_states_;
+  }
+  // Release the timer copy allocated in the constructor.
+  delete timer_;
+}
+
namespace internal {
namespace {
@@ -315,6 +343,16 @@ bool IsZero(double n) {
return std::abs(n) < std::numeric_limits::epsilon();
}
+// Adds one thread's measurements to the results shared through `manager`.
+//
+//   st      - state whose complexity_n and user counters are merged.
+//   timer   - timer holding this thread's cpu/real/manual times.
+//   manager - owner of the shared Result; `results` is guarded by
+//             manager->GetBenchmarkMutex(), which the caller must hold.
+void MergeResults(State& st, ThreadTimer* timer, ThreadManager* manager) {
+  ThreadManager::Result& results = manager->results;
+  results.cpu_time_used += timer->cpu_time_used();
+  // Real time is summed, not maximised: BenchmarkRunner divides the total by
+  // the thread count afterwards, so storing the maximum here would make the
+  // reported real time too small by that factor.
+  results.real_time_used += timer->real_time_used();
+  results.manual_time_used += timer->manual_time_used();
+  results.complexity_n += st.complexity_length_n();
+  Increment(&results.counters, st.counters);
+}
+
ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
int output_opts = ConsoleReporter::OO_Defaults;
auto is_benchmark_color = [force_no_color]() -> bool {
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 264eff95c5..57e32a5ecd 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -34,6 +34,8 @@ struct BenchmarkInstance {
double min_time;
IterationCount iterations;
int threads; // Number of concurrent threads to us
+ bool explicit_threading; // true: Number of threads come from a Threads() call
+ bool manual_threading;
State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager) const;
@@ -45,6 +47,8 @@ bool FindBenchmarksInternal(const std::string& re,
bool IsZero(double n);
+void MergeResults(State& st, ThreadTimer* timer, ThreadManager* manager);
+
ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
} // end namespace internal
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index cca39b2215..906c54f2ac 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -174,6 +174,8 @@ bool BenchmarkFamilies::FindBenchmarks(
instance.complexity_lambda = family->complexity_lambda_;
instance.statistics = &family->statistics_;
instance.threads = num_threads;
+ instance.manual_threading = family->manual_threading_;
+ instance.explicit_threading = !family->thread_counts_.empty();
// Add arguments to instance name
size_t arg_i = 0;
@@ -268,6 +270,7 @@ Benchmark::Benchmark(const char* name)
measure_process_cpu_time_(false),
use_real_time_(false),
use_manual_time_(false),
+ manual_threading_(false),
complexity_(oNone),
complexity_lambda_(nullptr) {
ComputeStatistics("mean", StatisticsMean);
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 7bc6b6329e..379dae01b8 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -77,7 +77,7 @@ BenchmarkReporter::Run CreateRunReport(
// This is the total iterations across all threads.
report.iterations = results.iterations;
report.time_unit = b.time_unit;
- report.threads = b.threads;
+ report.threads = results.thread_count;
report.repetition_index = repetition_index;
report.repetitions = b.repetitions;
@@ -117,17 +117,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
? internal::ThreadTimer::CreateProcessCpuTime()
: internal::ThreadTimer::Create());
State st = b->Run(iters, thread_id, &timer, manager);
- CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
- << "Benchmark returned before State::KeepRunning() returned false!";
+ assert(b->explicit_threading || b->threads == 1);
+ if (st.GetNumThreadStates() > 0)
+ {
+ CHECK((!b->explicit_threading) || b->manual_threading)
+ << "Benchmark " << b->name.str() << " run with managed threading. It must not create ThreadStates!";
+ CHECK((!b->explicit_threading) || st.GetNumThreadStates() == b->threads)
+ << "The number of ThreadStates created by Benchmark " << b->name.str()
+ << " doesn't match the number of threads!";
+ }
+ else
+ {
+ CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
+ << "Benchmark returned before State::KeepRunning() returned false!";
+ }
{
MutexLock l(manager->GetBenchmarkMutex());
internal::ThreadManager::Result& results = manager->results;
results.iterations += st.iterations();
- results.cpu_time_used += timer.cpu_time_used();
- results.real_time_used += timer.real_time_used();
- results.manual_time_used += timer.manual_time_used();
- results.complexity_n += st.complexity_length_n();
- internal::Increment(&results.counters, st.counters);
+ if (st.GetNumThreadStates() > 0)
+ {
+ // State values as well as thread state values are summed up for complexity_n and user counters:
+ results.complexity_n += st.complexity_length_n();
+ internal::Increment(&results.counters, st.counters);
+ results.thread_count = b->explicit_threading ? b->threads : st.GetNumThreadStates();
+ }
+ else
+ {
+ internal::MergeResults(st, &timer, manager);
+ results.thread_count = b->threads;
+ }
}
manager->NotifyThreadComplete();
}
@@ -142,7 +161,8 @@ class BenchmarkRunner {
repeats(b.repetitions != 0 ? b.repetitions
: FLAGS_benchmark_repetitions),
has_explicit_iteration_count(b.iterations != 0),
- pool(b.threads - 1),
+ num_managed_threads(b.manual_threading ? 1 : b.threads),
+ pool(num_managed_threads - 1),
iters(has_explicit_iteration_count ? b.iterations : 1) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
@@ -186,6 +206,7 @@ class BenchmarkRunner {
const int repeats;
const bool has_explicit_iteration_count;
+ const int num_managed_threads; // number of managed threads, must be before pool
std::vector pool;
IterationCount iters; // preserved between repetitions!
@@ -201,7 +222,7 @@ class BenchmarkRunner {
VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
std::unique_ptr manager;
- manager.reset(new internal::ThreadManager(b.threads));
+ manager.reset(new internal::ThreadManager(num_managed_threads));
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
@@ -228,10 +249,10 @@ class BenchmarkRunner {
manager.reset();
// Adjust real/manual time stats since they were reported per thread.
- i.results.real_time_used /= b.threads;
- i.results.manual_time_used /= b.threads;
+ i.results.real_time_used /= i.results.thread_count;
+ i.results.manual_time_used /= i.results.thread_count;
// If we were measuring whole-process CPU usage, adjust the CPU time too.
- if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
+ if (b.measure_process_cpu_time) i.results.cpu_time_used /= i.results.thread_count;
VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";
diff --git a/src/thread_manager.h b/src/thread_manager.h
index 28e2dd53af..7b0aa4ae86 100644
--- a/src/thread_manager.h
+++ b/src/thread_manager.h
@@ -46,6 +46,7 @@ class ThreadManager {
std::string report_label_;
std::string error_message_;
bool has_error_ = false;
+ int thread_count = 0;
UserCounters counters;
};
GUARDED_BY(GetBenchmarkMutex()) Result results;