Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
12b8cbe
Merge pull request #1 from google/master
krzikalla Aug 25, 2020
a75f6cf
Merge branch 'google:main' into master
krzikalla Sep 15, 2023
16d5f2a
Add support for other multi-threading APIs
krzikalla Sep 20, 2023
dd29368
Fixed linking and threading analyzer issues.
krzikalla Oct 5, 2023
63e64f2
Fixed memory leak and formatting issues.
krzikalla Oct 5, 2023
b8f41ce
More formatting issues.
krzikalla Oct 5, 2023
795cd5a
Construct the base class State of a ThreadState like
krzikalla Oct 6, 2023
2ae4a49
Merge branch 'main' into threadingapi_2
krzikalla Oct 6, 2023
b985090
Merge branch 'google:main' into master
krzikalla Oct 9, 2023
b027d02
Add support for other multi-threading APIs
krzikalla Oct 9, 2023
afa85ec
Fixed linking and threading analyzer issues.
krzikalla Oct 9, 2023
febafa2
Fixed memory leak and formatting issues.
krzikalla Oct 9, 2023
96569c6
More formatting issues.
krzikalla Oct 9, 2023
82ad95f
Construct the base class State of a ThreadState like
krzikalla Oct 9, 2023
f8543fd
Merge branch 'threadingapi_2' of https://github.com/krzikalla/benchma…
krzikalla Oct 9, 2023
d9a71d0
Improved thread sanitizer statements
krzikalla Oct 18, 2023
a8fc4af
Second thread sanitzer try
krzikalla Oct 18, 2023
c333d80
Revert "Second thread sanitzer try"
krzikalla Oct 19, 2023
c559b1f
Merge branch 'google:main' into master
krzikalla Oct 19, 2023
6514086
Add support for other multi-threading APIs
krzikalla Oct 19, 2023
e1d7ff9
Merge branch 'threadingapi_2' of https://github.com/krzikalla/benchma…
krzikalla Oct 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions docs/user_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -830,6 +830,64 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();

Without `UseRealTime`, CPU time is used by default.

### Manual Multithreaded Benchmarks

By default, Google Benchmark uses `std::thread` as its multithreading environment.
If you want to use another multithreading environment (e.g. OpenMP), you can
turn off the automatic creation of threads using the `ManualThreading` function.
```c++
static void BM_MultiThreaded(benchmark::State& state) {
  // Setup code here.
  for (auto _ : state) {
    // threads() is an accessor on State, so it has to be called.
#pragma omp parallel num_threads(state.threads())
    {
      // Run the multithreaded test.
    }
  }
  // Teardown code here.
}

BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4);
```
The above example creates a parallel region in each iteration.
This includes the setup and teardown of the parallel region in the time measurement, and it
adds an implicit barrier at the end of each iteration.
You can avoid these effects by running the whole loop in parallel.
In that case, you must not use the `state` object directly; instead, create a `ThreadState` object in each thread.
```c++
static void BM_MultiThreaded(benchmark::State& state) {
  // Setup code (shared objects) here.
  // threads() is an accessor on State, so it has to be called.
#pragma omp parallel num_threads(state.threads())
  {
    // Thread-local setup code here.
    // Each thread iterates over its own ThreadState instead of `state`.
    for (auto _ : benchmark::ThreadState(state)) {
      // Run the multithreaded test.
    }
  }
  // Teardown code here.
}

BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4);
```
If you use the `ThreadState` object and explicitly specify the number of threads, then you must
use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads.
However, if you use `ThreadState` without explicitly specifying the number of threads,
then the number of threads is determined by the number of created `ThreadState` objects.
Specifying `ManualThreading` is optional in this case.
```c++
static void BM_MultiThreaded(benchmark::State& state) {
// Setup code (shared objects) here.
// No num_threads clause: the thread count is not specified explicitly here.
#pragma omp parallel
{
// Thread-local setup code here.
// Every thread creates its own ThreadState; the number of ThreadState
// objects created determines the number of threads that is measured.
for (auto _ : benchmark::ThreadState(state)) {
// Run the multithreaded test.
}
}
// Teardown code here.
}

BENCHMARK(BM_MultiThreaded); // measures omp_get_max_threads number of threads.
```

<a name="cpu-timers" />

## CPU Timers
Expand Down
35 changes: 33 additions & 2 deletions include/benchmark/benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,9 @@ class BENCHMARK_EXPORT State {
return max_iterations - total_iterations_ + batch_leftover_;
}

BENCHMARK_ALWAYS_INLINE
int GetNumThreadStates() const { return num_thread_states_; }

BENCHMARK_ALWAYS_INLINE
std::string name() const { return name_; }

Expand Down Expand Up @@ -976,12 +979,31 @@ class BENCHMARK_EXPORT State {
const std::string name_;
const int thread_index_;
const int threads_;
int num_thread_states_;

internal::ThreadTimer* const timer_;
internal::ThreadManager* const manager_;
internal::PerfCountersMeasurement* const perf_counters_measurement_;

friend class internal::BenchmarkInstance;
friend class ThreadState;

protected:
void MergeThreadStateToParent(State& parent) const;
bool started() const { return started_; }

internal::ThreadTimer* timer_;
internal::PerfCountersMeasurement* perf_counters_measurement_;
};

// ThreadState can be used in a manually multithreaded benchmark loop.
// It is a State of its own that is constructed from a parent State and is
// meant to be created once per user-managed thread.
class BENCHMARK_EXPORT ThreadState : public State {
public:
// Creates a per-thread state derived from the parent state `s`.
explicit ThreadState(State& s);
// Destroys the per-thread state.
// NOTE(review): the out-of-line definition is expected to merge this
// thread's results back into the parent state - confirm against the .cc file.
~ThreadState();

private:
// The State this ThreadState was created from.
State* parent_;

// Private and without a visible definition - presumably to make ThreadState
// non-copyable.
ThreadState(const ThreadState&);
};

inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
Expand Down Expand Up @@ -1274,6 +1296,13 @@ class BENCHMARK_EXPORT Benchmark {
// Equivalent to ThreadRange(NumCPUs(), NumCPUs())
Benchmark* ThreadPerCpu();
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These all should probably assert !manual_threading_.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As said above, it is fine to combine these functions with manual threading.


// Don't create threads. Let the user evaluate state.threads() and/or use
// ThreadState.
Benchmark* ManualThreading() {
Comment on lines +1299 to +1301
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This presumably should assert that thread_counts_ is empty.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is actually fine to have a non-empty thread_counts_. In that case the benchmark is executed with each thread count and the user is responsible for creating the appropriate number of threads. The first two examples in the user guide use this combination. Later src/benchmark_runner.cc:140 checks, if the thread count and the number of ThreadState created match.

manual_threading_ = true;
return this;
}

virtual void Run(State& state) = 0;

TimeUnit GetTimeUnit() const;
Expand All @@ -1286,6 +1315,7 @@ class BENCHMARK_EXPORT Benchmark {
const char* GetName() const;
int ArgsCnt() const;
const char* GetArgName(int arg) const;
bool GetExplicitThreading() const { return !thread_counts_.empty(); }

private:
friend class BenchmarkFamilies;
Expand All @@ -1307,6 +1337,7 @@ class BENCHMARK_EXPORT Benchmark {
bool measure_process_cpu_time_;
bool use_real_time_;
bool use_manual_time_;
bool manual_threading_;
BigO complexity_;
BigOFunc* complexity_lambda_;
std::vector<Statistics> statistics_;
Expand Down
41 changes: 40 additions & 1 deletion src/benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,9 @@ State::State(std::string name, IterationCount max_iters,
name_(std::move(name)),
thread_index_(thread_i),
threads_(n_threads),
timer_(timer),
num_thread_states_(0),
manager_(manager),
timer_(timer),
perf_counters_measurement_(perf_counters_measurement) {
BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
BM_CHECK_LT(thread_index_, threads_)
Expand Down Expand Up @@ -309,6 +310,44 @@ void State::FinishKeepRunning() {
manager_->StartStopBarrier();
}

// Folds the results of this state into `parent`. The whole merge runs while
// holding the manager's benchmark mutex.
void State::MergeThreadStateToParent(State& parent) const {
  MutexLock lock(manager_->GetBenchmarkMutex());

  // Accumulate this state's measurements into the manager's shared results.
  internal::MergeResults(*this, timer_, manager_);

  // The parent either has no iteration bookkeeping yet, or it must already
  // agree with the values of this state.
  assert(parent.total_iterations_ == 0 ||
         parent.total_iterations_ == total_iterations_);
  assert(parent.batch_leftover_ == 0 ||
         parent.batch_leftover_ == batch_leftover_);
  parent.total_iterations_ = total_iterations_;
  parent.batch_leftover_ = batch_leftover_;

  // started/finished are sticky: once set on the parent they stay set.
  if (started_) parent.started_ = true;
  if (finished_) parent.finished_ = true;

  // An error on either side takes precedence over a plain skip.
  if (parent.error_occurred() || error_occurred()) {
    parent.skipped_ = internal::SkippedWithError;
  } else if (parent.skipped() || skipped()) {
    parent.skipped_ = internal::SkippedWithMessage;
  } else {
    parent.skipped_ = internal::NotSkipped;
  }

  // Record that one more thread state has been merged into the parent.
  ++parent.num_thread_states_;
}

// Builds a per-thread State from the parent state `s`.
//
// The new state takes over the parent's name, iteration limit, ranges, thread
// index, thread count and manager. It gets its own heap-allocated timer
// (created from the parent's timer) and, if the parent has a perf counters
// measurement, its own heap-allocated PerfCountersMeasurement for the same
// counter names; otherwise the perf counters pointer is null.
ThreadState::ThreadState(State& s)
    : State(s.name(), s.max_iterations, s.range_, s.thread_index(), s.threads(),
            new internal::ThreadTimer(
                internal::ThreadTimer::CreateFromTimer(*s.timer_)),
            s.manager_,
            s.perf_counters_measurement_
                ? new internal::PerfCountersMeasurement(
                      s.perf_counters_measurement_->names())
                : nullptr),
      parent_(&s) {}

// Verifies that the benchmark loop ran to completion (or was skipped), merges
// this thread's results into the parent state, and frees the timer and perf
// counters objects owned by this ThreadState.
ThreadState::~ThreadState() {
  // Use skipped() rather than error_occurred(): a ThreadState that was skipped
  // with a plain message also legitimately stops before reaching
  // max_iterations. This mirrors the check applied to a regular State in
  // RunInThread.
  BM_CHECK(skipped() || iterations() >= max_iterations)
      << "Benchmark returned before ThreadState::KeepRunning() returned false!";
  MergeThreadStateToParent(*parent_);
  delete timer_;
  delete perf_counters_measurement_;
}

namespace internal {
namespace {

Expand Down
18 changes: 17 additions & 1 deletion src/benchmark_api_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

#include <cinttypes>

#include "counter.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"

namespace benchmark {
namespace internal {
Expand All @@ -27,7 +30,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
min_time_(benchmark_.min_time_),
min_warmup_time_(benchmark_.min_warmup_time_),
iterations_(benchmark_.iterations_),
threads_(thread_count) {
threads_(thread_count),
manual_threading_(benchmark_.manual_threading_),
explicit_threading_(benchmark_.GetExplicitThreading()) {
name_.function_name = benchmark_.name_;

size_t arg_i = 0;
Expand Down Expand Up @@ -114,5 +119,16 @@ void BenchmarkInstance::Teardown() const {
teardown_(st);
}
}

// Adds the measurements of `st` and `timer` to the results stored in
// `manager`: iteration count, CPU/real/manual time, complexity length and
// user counters.
void MergeResults(const State& st, const ThreadTimer* timer,
                  ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS {
  ThreadManager::Result& totals = manager->results;

  // Values taken from the state itself.
  totals.iterations += st.iterations();
  totals.complexity_n += st.complexity_length_n();

  // Values taken from the timer.
  totals.cpu_time_used += timer->cpu_time_used();
  totals.real_time_used += timer->real_time_used();
  totals.manual_time_used += timer->manual_time_used();

  // User counters are combined by the counter helper.
  Increment(&totals.counters, st.counters);
}
} // namespace internal
} // namespace benchmark
10 changes: 10 additions & 0 deletions src/benchmark_api_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include "benchmark/benchmark.h"
#include "commandlineflags.h"
#include "mutex.h"

namespace benchmark {
namespace internal {
Expand Down Expand Up @@ -41,6 +42,8 @@ class BenchmarkInstance {
int threads() const { return threads_; }
void Setup() const;
void Teardown() const;
bool explicit_threading() const { return explicit_threading_; }
bool manual_threading() const { return manual_threading_; }

State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager,
Expand All @@ -66,6 +69,9 @@ class BenchmarkInstance {
double min_warmup_time_;
IterationCount iterations_;
int threads_; // Number of concurrent threads to use
bool manual_threading_;
bool explicit_threading_; // true: the number of threads comes from a
// Threads() call
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't fully looked at things, but preliminary, i don't really like the names chosen.


typedef void (*callback_function)(const benchmark::State&);
callback_function setup_ = nullptr;
Expand All @@ -78,6 +84,10 @@ bool FindBenchmarksInternal(const std::string& re,

bool IsZero(double n);

// only call while holding benchmark_mutex_:
void MergeResults(const State& st, const ThreadTimer* timer,
ThreadManager* manager) NO_THREAD_SAFETY_ANALYSIS;

BENCHMARK_EXPORT
ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);

Expand Down
1 change: 1 addition & 0 deletions src/benchmark_register.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ Benchmark::Benchmark(const std::string& name)
measure_process_cpu_time_(false),
use_real_time_(false),
use_manual_time_(false),
manual_threading_(false),
complexity_(oNone),
complexity_lambda_(nullptr),
setup_(nullptr),
Expand Down
51 changes: 36 additions & 15 deletions src/benchmark_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ BenchmarkReporter::Run CreateRunReport(
// This is the total iterations across all threads.
report.iterations = results.iterations;
report.time_unit = b.time_unit();
report.threads = b.threads();
report.threads = results.thread_count;
report.repetition_index = repetition_index;
report.repetitions = repeats;

Expand Down Expand Up @@ -130,17 +130,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,

State st =
b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";

assert(b->explicit_threading() || b->threads() == 1);

if (st.GetNumThreadStates() > 0) {
BM_CHECK((!b->explicit_threading()) || b->manual_threading())
<< "Benchmark " << b->name().str()
<< " run with managed threading. It must not create ThreadStates!";
BM_CHECK((!b->explicit_threading()) ||
st.GetNumThreadStates() == b->threads())
<< "The number of ThreadStates created by Benchmark " << b->name().str()
<< " doesn't match the number of threads!";
} else {
BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
}

{
MutexLock l(manager->GetBenchmarkMutex());
internal::ThreadManager::Result& results = manager->results;
results.iterations += st.iterations();
results.cpu_time_used += timer.cpu_time_used();
results.real_time_used += timer.real_time_used();
results.manual_time_used += timer.manual_time_used();
results.complexity_n += st.complexity_length_n();
internal::Increment(&results.counters, st.counters);
if (st.GetNumThreadStates() > 0) {
// State values as well as thread state values are summed up for
// complexity_n and user counters:
results.complexity_n += st.complexity_length_n();
internal::Increment(&results.counters, st.counters);
results.thread_count =
b->explicit_threading() ? b->threads() : st.GetNumThreadStates();
} else {
internal::MergeResults(st, &timer, manager);
results.thread_count = b->threads();
}
}
manager->NotifyThreadComplete();
}
Expand Down Expand Up @@ -234,7 +253,8 @@ BenchmarkRunner::BenchmarkRunner(
has_explicit_iteration_count(b.iterations() != 0 ||
parsed_benchtime_flag.tag ==
BenchTimeType::ITERS),
pool(b.threads() - 1),
num_managed_threads(b.manual_threading() ? 1 : b.threads()),
pool(num_managed_threads - 1),
iters(has_explicit_iteration_count
? ComputeIters(b_, parsed_benchtime_flag)
: 1),
Expand All @@ -260,7 +280,7 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";

std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(b.threads()));
manager.reset(new internal::ThreadManager(num_managed_threads));

// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
Expand All @@ -287,17 +307,18 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
manager.reset();

// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads();
i.results.manual_time_used /= b.threads();
i.results.real_time_used /= i.results.thread_count;
i.results.manual_time_used /= i.results.thread_count;
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
if (b.measure_process_cpu_time())
i.results.cpu_time_used /= i.results.thread_count;

BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";

// By using KeepRunningBatch a benchmark can iterate more times than
// requested, so take the iteration count from i.results.
i.iters = i.results.iterations / b.threads();
i.iters = i.results.iterations / i.results.thread_count;

// Base decisions off of real time if requested by this benchmark.
i.seconds = i.results.cpu_time_used;
Expand Down
1 change: 1 addition & 0 deletions src/benchmark_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class BenchmarkRunner {
bool warmup_done;
const int repeats;
const bool has_explicit_iteration_count;
const int num_managed_threads; // must be before pool

int num_repetitions_done = 0;

Expand Down
1 change: 1 addition & 0 deletions src/thread_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class ThreadManager {
std::string report_label_;
std::string skip_message_;
internal::Skipped skipped_ = internal::NotSkipped;
int thread_count = 0;
UserCounters counters;
};
GUARDED_BY(GetBenchmarkMutex()) Result results;
Expand Down
Loading