diff --git a/bench/latency.cc b/bench/latency.cc index cae6c77..da575c8 100644 --- a/bench/latency.cc +++ b/bench/latency.cc @@ -47,6 +47,23 @@ static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { } pthreadpool_destroy(threadpool); } +BENCHMARK(pthreadpool_parallelize_1d_tile_1d)->UseRealTime()->Apply(SetNumberOfThreads); + +static void compute_1d_dynamic(void*, size_t, size_t) {} + +static void pthreadpool_parallelize_1d_dynamic(benchmark::State& state) { + const uint32_t threads = static_cast<uint32_t>(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d_dynamic(threadpool, compute_1d_dynamic, + nullptr /* context */, threads, 1, + 0 /* flags */); + } + pthreadpool_destroy(threadpool); +} +BENCHMARK(pthreadpool_parallelize_1d_dynamic) + ->UseRealTime() + ->Apply(SetNumberOfThreads); BENCHMARK(pthreadpool_parallelize_1d_tile_1d) ->UseRealTime() ->Apply(SetNumberOfThreads); @@ -79,7 +96,19 @@ static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_parallelize_2d_tile_2d) +static void compute_2d_dynamic(void*, size_t, size_t, size_t, size_t) {} + +static void pthreadpool_parallelize_2d_dynamic(benchmark::State& state) { + const uint32_t threads = static_cast<uint32_t>(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_dynamic(threadpool, compute_2d_dynamic, + nullptr /* context */, 1, threads, 1, 1, + 0 /* flags */); + } + pthreadpool_destroy(threadpool); +} +BENCHMARK(pthreadpool_parallelize_2d_dynamic) ->UseRealTime() ->Apply(SetNumberOfThreads); diff --git a/bench/throughput.cc b/bench/throughput.cc index 7c58f1a..85b5d79 100644 --- a/bench/throughput.cc +++ b/bench/throughput.cc @@ -54,7 +54,29 @@ BENCHMARK(pthreadpool_parallelize_1d_tile_1d) ->RangeMultiplier(10) ->Range(10, 1000000); -static void compute_2d(void*, size_t, size_t) {} +static void 
compute_1d_dynamic(void*, size_t, size_t) {} + +static void pthreadpool_parallelize_1d_dynamic(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast<size_t>(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d_dynamic(threadpool, compute_1d_dynamic, + nullptr /* context */, items * threads, + 1, 0 /* flags */); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(int64_t(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_1d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_2d(void*, size_t, size_t) { +} static void pthreadpool_parallelize_2d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(2); @@ -95,7 +117,29 @@ BENCHMARK(pthreadpool_parallelize_2d_tile_1d) ->RangeMultiplier(10) ->Range(10, 1000000); -static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) {} +static void compute_2d_tile_1d_dynamic(void*, size_t, size_t, size_t) {} + +static void pthreadpool_parallelize_2d_tile_1d_dynamic(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast<size_t>(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_1d_dynamic(threadpool, compute_2d_tile_1d_dynamic, + nullptr /* context */, threads, items, + 1, 0 /* flags */); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(int64_t(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_2d_tile_1d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) { +} static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { 
pthreadpool_t threadpool = pthreadpool_create(2); @@ -116,7 +160,29 @@ BENCHMARK(pthreadpool_parallelize_2d_tile_2d) ->RangeMultiplier(10) ->Range(10, 1000000); -static void compute_3d(void*, size_t, size_t, size_t) {} +static void compute_2d_dynamic(void*, size_t, size_t, size_t, size_t) {} + +static void pthreadpool_parallelize_2d_dynamic(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast<size_t>(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_dynamic(threadpool, compute_2d_dynamic, + nullptr /* context */, threads, items, 1, + 1, 0 /* flags */); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(int64_t(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_2d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_3d(void*, size_t, size_t, size_t) { +} static void pthreadpool_parallelize_3d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(2); @@ -178,7 +244,30 @@ BENCHMARK(pthreadpool_parallelize_3d_tile_2d) ->RangeMultiplier(10) ->Range(10, 1000000); -static void compute_4d(void*, size_t, size_t, size_t, size_t) {} +static void compute_3d_tile_2d_dynamic(void*, size_t, size_t, size_t, size_t, + size_t) {} + +static void pthreadpool_parallelize_3d_tile_2d_dynamic(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast<size_t>(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_3d_tile_2d_dynamic(threadpool, compute_3d_tile_2d_dynamic, + nullptr /* context */, 1, threads, + items, 1, 1, 0 /* flags */); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(int64_t(state.iterations()) * items); +} 
+BENCHMARK(pthreadpool_parallelize_3d_tile_2d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_4d(void*, size_t, size_t, size_t, size_t) { +} static void pthreadpool_parallelize_4d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(2); diff --git a/include/pthreadpool.h b/include/pthreadpool.h index e192911..7d105fc 100644 --- a/include/pthreadpool.h +++ b/include/pthreadpool.h @@ -18,12 +18,17 @@ typedef struct pthreadpool* pthreadpool_t; typedef void (*pthreadpool_task_1d_t)(void*, size_t); typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); +typedef void (*pthreadpool_task_1d_dynamic_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t); typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_2d_dynamic_t)(void*, size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_1d_dynamic_t)(void*, size_t, size_t, + size_t); typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); @@ -32,6 +37,8 @@ typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t); typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_2d_dynamic_t)(void*, size_t, size_t, + size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t); @@ -250,6 +257,41 @@ void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool, 
void* context, size_t range, size_t tile, uint32_t flags); +/** + * Process items on a 1D grid with specified prefered tile size. + * + * The function repeatedly calls + * + * function(context, i, count) + * + * in parallel where `i` is in the range `[0, range)` and a multiple of the + * provided @a tile and `count` is an integer multiple of @a tile unless `i + * + count == range`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range the number of items on the 1D grid to process. + * @param tile the preferred multiple number of items on the 1D grid to + * process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_1d_dynamic(pthreadpool_t threadpool, + pthreadpool_task_1d_dynamic_t function, + void* context, size_t range, + size_t tile, uint32_t flags); + /** * Process items on a 2D grid. * @@ -345,6 +387,45 @@ void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t threadpool, size_t range_j, size_t tile_j, uint32_t flags); +/** + * Process items on a 2D grid with specified prefered tile size along the + * last grid dimension. 
+ * + * The function repeatedly calls + * + * function(context, i, j, count_j) + * + * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range + * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an + * integer multiple of @a tile_j unless `j + count_j == range_j`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags); + /** * Process items on a 2D grid with the specified maximum tile size along the * last grid dimension using a microarchitecture-aware task function. 
@@ -476,6 +557,49 @@ void pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool, size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags); +/** + * Process items on a 2D grid with specified prefered tile size along each grid + * dimension. + * + * The function repeatedly calls + * + * function(context, i, j, count_i, count_j) + * + * in parallel where `i` is in the range `[0, range_i)` and a multiple of the + * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the + * provided @a tile_j, and `count_i` and `count_j` are integer multiples of @a + * tile__i and @a tile_j, unless `i + count_i == range_i` or `j + count_j == + * range_j`, respectivly. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_i the preferred multiple number of items on the first + * dimension of the 2D grid to process in each function call. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags); + /** * Process items on a 2D grid with the specified maximum tile size along each * grid dimension using a microarchitecture-aware task function. @@ -788,6 +912,54 @@ void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool, size_t tile_j, size_t tile_k, uint32_t flags); +/** + * Process items on a 3D grid with specified prefered tile size along the last + * two grid dimensions. + * + * The function repeatedly calls + * + * function(context, i, j, k, count_j, count_k) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)` and a multiple of the provided @a + * tile_j, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `count_j` and `count_k` are integer multiples of @a tile__j and @a tile_k, + * unless `j + count_j == range_j` or `k + count_k == range_k`, respectivly. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. 
+ * @param range_i the number of items on the first dimension of the 3D + * grid to process. + * @param range_j the number of items on the second dimension of the 3D + * grid to process. + * @param range_k the number of items on the third dimension of the 3D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 3D grid to process in each function call. + * @param tile_k the preferred multiple number of items on the third + * dimension of the 3D grid to process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_3d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, uint32_t flags); + /** * Process items on a 3D grid with the specified maximum tile size along the * last two grid dimensions using a microarchitecture-aware task function. @@ -1381,7 +1553,7 @@ void pthreadpool_compute_4d_tiled(pthreadpool_t threadpool, namespace libpthreadpool { namespace detail { -namespace { +namespace { // NOLINT: Naming this namespace would expose it. 
template <class T> void call_wrapper_1d(void* arg, size_t i) { @@ -1393,6 +1565,11 @@ void call_wrapper_1d_tile_1d(void* arg, size_t range_i, size_t tile_i) { (*static_cast<const T*>(arg))(range_i, tile_i); } +template <class T> +void call_wrapper_1d_dynamic(void* arg, size_t range_i, size_t tile_i) { + (*static_cast<const T*>(arg))(range_i, tile_i); +} + template <class T> void call_wrapper_2d(void* functor, size_t i, size_t j) { (*static_cast<const T*>(functor))(i, j); @@ -1404,12 +1581,25 @@ void call_wrapper_2d_tile_1d(void* functor, size_t i, size_t range_j, (*static_cast<const T*>(functor))(i, range_j, tile_j); } +template <class T> +void call_wrapper_2d_tile_1d_dynamic(void* functor, size_t i, size_t range_j, + size_t tile_j) { + (*static_cast<const T*>(functor))(i, range_j, tile_j); +} + template <class T> void call_wrapper_2d_tile_2d(void* functor, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j) { (*static_cast<const T*>(functor))(range_i, range_j, tile_i, tile_j); } +template <class T> +void call_wrapper_2d_tile_2d_dynamic(void* functor, size_t range_i, + size_t range_j, size_t tile_i, + size_t tile_j) { + (*static_cast<const T*>(functor))(range_i, range_j, tile_i, tile_j); +} + template <class T> void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) { (*static_cast<const T*>(functor))(i, j, k); @@ -1427,6 +1617,13 @@ void call_wrapper_3d_tile_2d(void* functor, size_t i, size_t range_j, (*static_cast<const T*>(functor))(i, range_j, range_k, tile_j, tile_k); } +template <class T> +void call_wrapper_3d_tile_2d_dynamic(void* functor, size_t i, size_t range_j, + size_t range_k, size_t tile_j, + size_t tile_k) { + (*static_cast<const T*>(functor))(i, range_j, range_k, tile_j, tile_k); +} + template <class T> void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) { (*static_cast<const T*>(functor))(i, j, k, l); @@ -1553,6 +1750,47 @@ inline void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool, flags); } +/** + * Process items on a 1D grid with specified preferred tile size. 
+ * + * The function repeatedly calls + * + * function(context, i, count) + * + * in parallel where `i` is in the range `[0, range)` and a multiple of the + * provided @a tile and `count` is an integer multiple of @a tile unless `i + * + count == range`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range the number of items on the 1D grid to process. + * @param tile the preferred multiple number of items on the 1D grid to + * process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template <class T> +inline void pthreadpool_parallelize_1d_dynamic(pthreadpool_t threadpool, + const T& functor, size_t range, + size_t tile, + uint32_t flags = 0) { + pthreadpool_parallelize_1d_dynamic( + threadpool, &libpthreadpool::detail::call_wrapper_1d_dynamic<const T>, + const_cast<void*>(static_cast<const void*>(&functor)), range, tile, + flags); +} + /** * Process items on a 2D grid. * @@ -1627,6 +1865,51 @@ inline void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t threadpool, tile_j, flags); } +/** + * Process items on a 2D grid with specified preferred tile size along the + * last grid dimension. 
+ * + * The function repeatedly calls + * + * function(context, i, j, count_j) + * + * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range + * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an + * integer multiple of @a tile_j unless `j + count_j == range_j`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template <class T> +inline void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t tile_j, uint32_t flags = 0) { + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_2d_tile_1d_dynamic<const T>, + const_cast<void*>(static_cast<const void*>(&functor)), range_i, range_j, + tile_j, flags); +} + /** * Process items on a 2D grid with the specified maximum tile size along each * grid dimension. 
@@ -1670,6 +1953,55 @@ inline void pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool, tile_i, tile_j, flags); } +/** + * Process items on a 2D grid with specified prefered tile size along each grid + * dimension. + * + * The function repeatedly calls + * + * function(context, i, j, count_i, count_j) + * + * in parallel where `i` is in the range `[0, range_i)` and a multiple of the + * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the + * provided @a tile_j, and `count_i` and `count_j` are integer multiples of @a + * tile__i and @a tile_j, unless `i + count_i == range_i` or `j + count_j == + * range_j`, respectivly. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_i the preferred multiple number of items on the first + * dimension of the 2D grid to process in each function call. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template <class T> +inline void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t tile_i, size_t tile_j, uint32_t flags = 0) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_2d_tile_2d_dynamic<const T>, + const_cast<void*>(static_cast<const void*>(&functor)), range_i, range_j, + tile_i, tile_j, flags); +} + /** * Process items on a 3D grid. * @@ -1798,6 +2130,60 @@ inline void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool, range_k, tile_j, tile_k, flags); } +/** + * Process items on a 3D grid with specified preferred tile size along the last + * two grid dimensions. + * + * The function repeatedly calls + * + * function(context, i, j, k, count_j, count_k) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)` and a multiple of the provided @a + * tile_j, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `count_j` and `count_k` are integer multiples of @a tile_j and @a tile_k, + * unless `j + count_j == range_j` or `k + count_k == range_k`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. 
+ * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 3D + * grid to process. + * @param range_j the number of items on the second dimension of the 3D + * grid to process. + * @param range_k the number of items on the third dimension of the 3D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 3D grid to process in each function call. + * @param tile_k the preferred multiple number of items on the third + * dimension of the 3D grid to process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template +inline void pthreadpool_parallelize_3d_tile_2d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t range_k, size_t tile_j, size_t tile_k, uint32_t flags = 0) { + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_3d_tile_2d_dynamic, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, tile_j, tile_k, flags); +} + /** * Process items on a 4D grid. * diff --git a/src/portable-api.c b/src/portable-api.c index 5aa3700..d862573 100644 --- a/src/portable-api.c +++ b/src/portable-api.c @@ -191,6 +191,51 @@ static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, pthreadpool_fence_release(); } +static void thread_parallelize_1d_dynamic(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. 
+ struct pthreadpool_1d_dynamic_params* params = + &threadpool->params.parallelize_1d_dynamic; + const size_t threads_count = threadpool->threads_count.value; + const size_t count = params->range; + const size_t tile = params->tile; + const pthreadpool_task_1d_dynamic_t task = + (pthreadpool_task_1d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + while (1) { + /* Get the current offset, quit if there is no more work left. */ + size_t offset = params->curr_offset; + if (offset >= count) { + break; + } + + /* Choose a chunk size based on the remaining amount of work and the current + * number of threads, rounded up to the highest integer multiple of `tile`. + */ + size_t chunk_size = max((count - offset) / (2 * threads_count), 1); + chunk_size = ((chunk_size + tile - 1) / tile) * tile; + + /* Grab a chunk of work, and adjust it if there is not enough work left. */ + offset = + pthreadpool_fetch_add_relaxed_size_t(¶ms->curr_offset, chunk_size); + if (offset >= count) { + break; + } + chunk_size = min(chunk_size, count - offset); + + /* Call the task function. */ + task(argument, offset, chunk_size); + } + + /* Make changes by this thread visible to other threads. */ + pthreadpool_fence_release(); +} + static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); @@ -474,6 +519,63 @@ static void thread_parallelize_2d_tile_1d_with_uarch_with_thread( pthreadpool_fence_release(); } +static void thread_parallelize_2d_tile_1d_dynamic( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. 
+ struct pthreadpool_2d_tile_1d_dynamic_params* params = + &threadpool->params.parallelize_2d_tile_1d_dynamic; + const size_t threads_count = threadpool->threads_count.value; + const size_t range_i = params->range_i; + const size_t range_j = params->range_j; + const size_t tile_j = params->tile_j; + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t count = range_i * tile_range_j; + const pthreadpool_task_2d_tile_1d_dynamic_t task = + (pthreadpool_task_2d_tile_1d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + while (1) { + /* Get the current offset, quit if there is no more work left. */ + size_t offset = params->curr_offset; + if (offset >= count) { + break; + } + + /* Choose a chunk size based on the remaining amount of work and the current + * number of threads. */ + size_t chunk_size = max((count - offset) / (2 * threads_count), 1); + + /* Grab a chunk of work, maybe adjust the size if there is not enough work + * left. */ + offset = + pthreadpool_fetch_add_relaxed_size_t(¶ms->curr_offset, chunk_size); + if (offset >= count) { + break; + } + chunk_size = min(chunk_size, count - offset); + + // Call the task function. + while (chunk_size > 0) { + // Extract the i and j indices from the offset. 
+ const size_t index_i = offset / tile_range_j; + const size_t tile_index_j = offset % tile_range_j; + const size_t index_j = tile_index_j * tile_j; + const size_t tile_step_j = min(chunk_size, tile_range_j - tile_index_j); + chunk_size -= tile_step_j; + offset += tile_step_j; + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + task(argument, index_i, index_j, step_j); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); @@ -604,6 +706,67 @@ static void thread_parallelize_2d_tile_2d_with_uarch( pthreadpool_fence_release(); } +static void thread_parallelize_2d_tile_2d_dynamic(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. + struct pthreadpool_2d_tile_2d_dynamic_params* params = + &threadpool->params.parallelize_2d_tile_2d_dynamic; + const size_t threads_count = threadpool->threads_count.value; + const size_t range_i = params->range_i; + const size_t range_j = params->range_j; + const size_t tile_i = params->tile_i; + const size_t tile_j = params->tile_j; + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t count = tile_range_i * tile_range_j; + const pthreadpool_task_2d_tile_2d_dynamic_t task = + (pthreadpool_task_2d_tile_2d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + while (1) { + /* Get the current offset, quit if there is no more work left. */ + size_t offset = params->curr_offset; + if (offset >= count) { + break; + } + + /* Choose a chunk size based on the remaining amount of work and the current + * number of threads. 
*/ + size_t chunk_size = max((count - offset) / (2 * threads_count), 1); + + /* Grab a chunk of work, maybe adjust the size if there is not enough work + * left. */ + offset = + pthreadpool_fetch_add_relaxed_size_t(¶ms->curr_offset, chunk_size); + if (offset >= count) { + break; + } + chunk_size = min(chunk_size, count - offset); + + // Call the task function. + while (chunk_size > 0) { + // Extract the i and j indices from the offset. + const size_t tile_index_i = offset / tile_range_j; + const size_t tile_index_j = offset % tile_range_j; + const size_t index_i = tile_index_i * tile_i; + const size_t index_j = tile_index_j * tile_j; + size_t step_i = min(tile_i, range_i - index_i); + size_t tile_step_j = min(chunk_size, tile_range_j - tile_index_j); + chunk_size -= tile_step_j; + offset += tile_step_j; + size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + task(argument, index_i, index_j, step_i, step_j); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); @@ -1096,6 +1259,79 @@ static void thread_parallelize_3d_tile_2d_with_uarch( pthreadpool_fence_release(); } +static void thread_parallelize_3d_tile_2d_dynamic( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. 
+ struct pthreadpool_3d_tile_2d_dynamic_params* params = + &threadpool->params.parallelize_3d_tile_2d_dynamic; + const size_t threads_count = threadpool->threads_count.value; + const size_t range_i = params->range_i; + const size_t range_j = params->range_j; + const size_t range_k = params->range_k; + const size_t tile_j = params->tile_j; + const size_t tile_k = params->tile_k; + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t count = range_i * tile_range_j * tile_range_k; + const pthreadpool_task_3d_tile_2d_dynamic_t task = + (pthreadpool_task_3d_tile_2d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + while (1) { + /* Get the current offset, quit if there is no more work left. */ + size_t offset = params->curr_offset; + if (offset >= count) { + break; + } + + /* Choose a chunk size based on the remaining amount of work and the current + * number of threads. */ + size_t chunk_size = max((count - offset) / (2 * threads_count), 1); + + /* Grab a chunk of work, maybe adjust the size if there is not enough work + * left. */ + offset = + pthreadpool_fetch_add_relaxed_size_t(¶ms->curr_offset, chunk_size); + if (offset >= count) { + break; + } + chunk_size = min(chunk_size, count - offset); + + // Call the task function. + while (chunk_size > 0) { + // Extract the i and j indices from the offset. 
+ const size_t index_i = offset / (tile_range_j * tile_range_k); + const size_t tile_index_j = (offset / tile_range_k) % tile_range_j; + const size_t tile_index_k = offset % tile_range_k; + const size_t index_j = tile_index_j * tile_j; + if (tile_index_k == 0 && tile_range_j - tile_index_j > 1 && + chunk_size >= 2 * tile_range_k) { + const size_t tile_step_j = + min(chunk_size / tile_range_k, tile_range_j - tile_index_j); + chunk_size -= tile_step_j * tile_range_k; + offset += tile_step_j * tile_range_k; + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + task(argument, index_i, index_j, /*index_k=*/0, step_j, range_k); + } else { + const size_t step_j = min(tile_j, range_j - index_j); + const size_t index_k = tile_index_k * tile_k; + const size_t tile_step_k = min(chunk_size, tile_range_k - tile_index_k); + chunk_size -= tile_step_k; + offset += tile_step_k; + const size_t step_k = min(tile_step_k * tile_k, range_k - index_k); + task(argument, index_i, index_j, index_k, step_j, step_k); + } + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); @@ -2101,6 +2337,36 @@ void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool, } } +void pthreadpool_parallelize_1d_dynamic(pthreadpool_t threadpool, + pthreadpool_task_1d_dynamic_t function, + void* context, size_t range, + size_t tile, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + function(context, 0, range); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + 
set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range = divide_round_up(range, tile); + const struct pthreadpool_1d_dynamic_params params = { + .range = range, + .curr_offset = 0, + .tile = tile, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_1d_dynamic, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + void pthreadpool_parallelize_2d(pthreadpool_t threadpool, pthreadpool_task_2d_t function, void* context, size_t range_i, size_t range_j, @@ -2282,6 +2548,39 @@ void pthreadpool_parallelize_2d_tile_1d_with_uarch( } } +void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags) { + if (threadpool == NULL || threadpool->threads_count.value <= 1 || + (range_i <= 1 && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t index_i = 0; index_i < range_i; index_i++) { + function(context, index_i, /*index_j=*/0, range_j); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = range_i * tile_range_j; + const struct pthreadpool_2d_tile_1d_dynamic_params params = { + .range_i = range_i, + .range_j = range_j, + .curr_offset = 0, + .tile_j = tile_j, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_2d_tile_1d_dynamic, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, void* context, @@ -2389,6 +2688,39 @@ void 
pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool, } } +void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + if (threadpool == NULL || threadpool->threads_count.value <= 1 || + (range_i <= tile_i && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + function(context, /*index_i=*/0, /*index_j=*/0, range_i, range_j); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = tile_range_i * tile_range_j; + const struct pthreadpool_2d_tile_2d_dynamic_params params = { + .range_i = range_i, + .range_j = range_j, + .curr_offset = 0, + .tile_i = tile_i, + .tile_j = tile_j, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_2d_tile_2d_dynamic, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + void pthreadpool_parallelize_2d_tile_2d_with_uarch( pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_with_id_t function, void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, @@ -2765,6 +3097,43 @@ void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool, } } +void pthreadpool_parallelize_3d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, uint32_t flags) { + if (threadpool == NULL || threadpool->threads_count.value <= 1 || + (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) { + /* No thread 
pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t index_i = 0; index_i < range_i; index_i++) { + function(context, index_i, /*index_j=*/0, /*index_k=*/0, range_j, + range_k); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t tile_range = range_i * tile_range_j * tile_range_k; + const struct pthreadpool_3d_tile_2d_dynamic_params params = { + .range_i = range_i, + .range_j = range_j, + .range_k = range_k, + .curr_offset = 0, + .tile_j = tile_j, + .tile_k = tile_k, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_3d_tile_2d_dynamic, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + void pthreadpool_parallelize_3d_tile_2d_with_uarch( pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_with_id_t function, void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, diff --git a/src/shim.c b/src/shim.c index 87527d2..5a325f2 100644 --- a/src/shim.c +++ b/src/shim.c @@ -67,6 +67,13 @@ void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool, } } +void pthreadpool_parallelize_1d_dynamic(pthreadpool_t threadpool, + pthreadpool_task_1d_dynamic_t function, + void* context, size_t range, + size_t tile, uint32_t flags) { + function(context, 0, range); +} + void pthreadpool_parallelize_2d(struct pthreadpool* threadpool, pthreadpool_task_2d_t function, void* context, size_t range_i, size_t range_j, @@ -100,6 +107,15 @@ void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t threadpool, } } +void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function, + void* context, size_t range_i, 
size_t range_j, size_t tile_j,
+    uint32_t flags) {
+  for (size_t i = 0; i < range_i; i++) {
+    function(context, i, 0, range_j);
+  }
+}
+
 void pthreadpool_parallelize_2d_tile_1d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
@@ -149,6 +165,13 @@ void pthreadpool_parallelize_2d_tile_2d_with_uarch(
   }
 }
 
+void pthreadpool_parallelize_2d_tile_2d_dynamic(
+    pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function,
+    void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j,
+    uint32_t flags) {
+  function(context, 0, 0, range_i, range_j);
+}
+
 void pthreadpool_parallelize_3d(pthreadpool_t threadpool,
                                 pthreadpool_task_3d_t function, void* context,
                                 size_t range_i, size_t range_j, size_t range_k,
@@ -251,6 +274,15 @@ void pthreadpool_parallelize_3d_tile_2d_with_uarch(
   }
 }
 
+void pthreadpool_parallelize_3d_tile_2d_dynamic(
+    pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function,
+    void* context, size_t range_i, size_t range_j, size_t range_k,
+    size_t tile_j, size_t tile_k, uint32_t flags) {
+  for (size_t i = 0; i < range_i; i++) {
+    function(context, i, 0, 0, range_j, range_k);
+  }
+}
+
 void pthreadpool_parallelize_4d(pthreadpool_t threadpool,
                                 pthreadpool_task_4d_t function, void* context,
                                 size_t range_i, size_t range_j, size_t range_k,
diff --git a/src/threadpool-atomics.h b/src/threadpool-atomics.h
index 6b8e71c..8f77fa6 100644
--- a/src/threadpool-atomics.h
+++ b/src/threadpool-atomics.h
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 
 /* Windows-specific headers */
 #ifdef _WIN32
@@ -127,6 +128,11 @@
   return false;
 }
 
+static inline size_t pthreadpool_fetch_add_relaxed_size_t(
+    pthreadpool_atomic_size_t* address, size_t value) {
+  return atomic_fetch_add_explicit(address, value, memory_order_relaxed);
+}
+
 static inline void pthreadpool_fence_acquire() {
atomic_thread_fence(memory_order_acquire); } diff --git a/src/threadpool-object.h b/src/threadpool-object.h index 19f98b6..a26d70b 100644 --- a/src/threadpool-object.h +++ b/src/threadpool-object.h @@ -125,6 +125,23 @@ struct pthreadpool_1d_tile_1d_params { size_t tile; }; +struct pthreadpool_1d_dynamic_params { + /** + * Copy of the range argument passed to the pthreadpool_parallelize_1d_dynamic + * function. + */ + size_t range; + /** + * Offset of the next element in @a range to process. + */ + pthreadpool_atomic_size_t curr_offset; + /** + * Copy of the tile argument passed to the pthreadpool_parallelize_1d_dynamic + * function. + */ + size_t tile; +}; + struct pthreadpool_2d_params { /** * FXdiv divisor for the range_j argument passed to the @@ -177,6 +194,28 @@ struct pthreadpool_2d_tile_1d_with_uarch_params { struct fxdiv_divisor_size_t tile_range_j; }; +struct pthreadpool_2d_tile_1d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_1d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d_dynamic function. + */ + size_t range_j; + /** + * Offset of the next element to process. + */ + pthreadpool_atomic_size_t curr_offset; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d_dynamic function. + */ + size_t tile_j; +}; + struct pthreadpool_2d_tile_2d_params { /** * Copy of the range_i argument passed to the @@ -241,6 +280,33 @@ struct pthreadpool_2d_tile_2d_with_uarch_params { struct fxdiv_divisor_size_t tile_range_j; }; +struct pthreadpool_2d_tile_2d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t range_j; + /** + * Offset of the next element to process. 
+ */ + pthreadpool_atomic_size_t curr_offset; + /** + * Copy of the tile_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t tile_i; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t tile_j; +}; + struct pthreadpool_3d_params { /** * FXdiv divisor for the range_j argument passed to the @@ -380,6 +446,38 @@ struct pthreadpool_3d_tile_2d_with_uarch_params { struct fxdiv_divisor_size_t tile_range_k; }; +struct pthreadpool_3d_tile_2d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_k; + /** + * Offset of the next element to process. + */ + pthreadpool_atomic_size_t curr_offset; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t tile_j; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. 
+ */ + size_t tile_k; +}; + struct pthreadpool_4d_params { /** * Copy of the range_k argument passed to the pthreadpool_parallelize_4d @@ -789,13 +887,16 @@ struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool { union { struct pthreadpool_1d_with_uarch_params parallelize_1d_with_uarch; struct pthreadpool_1d_tile_1d_params parallelize_1d_tile_1d; + struct pthreadpool_1d_dynamic_params parallelize_1d_dynamic; struct pthreadpool_2d_params parallelize_2d; struct pthreadpool_2d_tile_1d_params parallelize_2d_tile_1d; struct pthreadpool_2d_tile_1d_with_uarch_params parallelize_2d_tile_1d_with_uarch; + struct pthreadpool_2d_tile_1d_dynamic_params parallelize_2d_tile_1d_dynamic; struct pthreadpool_2d_tile_2d_params parallelize_2d_tile_2d; struct pthreadpool_2d_tile_2d_with_uarch_params parallelize_2d_tile_2d_with_uarch; + struct pthreadpool_2d_tile_2d_dynamic_params parallelize_2d_tile_2d_dynamic; struct pthreadpool_3d_params parallelize_3d; struct pthreadpool_3d_tile_1d_params parallelize_3d_tile_1d; struct pthreadpool_3d_tile_1d_with_uarch_params @@ -803,6 +904,7 @@ struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool { struct pthreadpool_3d_tile_2d_params parallelize_3d_tile_2d; struct pthreadpool_3d_tile_2d_with_uarch_params parallelize_3d_tile_2d_with_uarch; + struct pthreadpool_3d_tile_2d_dynamic_params parallelize_3d_tile_2d_dynamic; struct pthreadpool_4d_params parallelize_4d; struct pthreadpool_4d_tile_1d_params parallelize_4d_tile_1d; struct pthreadpool_4d_tile_2d_params parallelize_4d_tile_2d; diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h index 9ef08ef..2e33d90 100644 --- a/src/threadpool-utils.h +++ b/src/threadpool-utils.h @@ -139,7 +139,11 @@ static inline size_t divide_round_up(size_t dividend, size_t divisor) { #ifdef min #undef min #endif +#ifdef max +#undef max +#endif static inline size_t min(size_t a, size_t b) { return a < b ? a : b; } +static inline size_t max(size_t a, size_t b) { return a > b ? 
a : b; } #endif // __PTHREADPOOL_SRC_THREADPOOL_UTILS_H_ diff --git a/test/pthreadpool-cxx.cc b/test/pthreadpool-cxx.cc index c03f996..766b1b7 100644 --- a/test/pthreadpool-cxx.cc +++ b/test/pthreadpool-cxx.cc @@ -238,6 +238,75 @@ TEST(Parallelize1DTile1D, EachItemProcessedOnce) { } } +TEST(Parallelize1DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), [](size_t, size_t) {}, kParallelize1DTile1DRange, + kParallelize1DTile1DTile); +} + +TEST(Parallelize1DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + [](size_t start_i, size_t tile_i) { + EXPECT_GE(start_i, 0); + EXPECT_GT(tile_i, 0); + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); +} + +TEST(Parallelize1DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + [&indicators](size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + indicators[i].store(true, std::memory_order_relaxed); + } + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +TEST(Parallelize1DDynamic, EachItemProcessedOnce) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + [&counters](size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + counters[i].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } +} + TEST(Parallelize2D, ThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -409,6 +478,89 @@ TEST(Parallelize2DTile1D, EachItemProcessedOnce) { } } +TEST(Parallelize2DTile1DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), [](size_t, size_t, size_t) {}, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); +} + +TEST(Parallelize2DTile1DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), + [](size_t i, size_t start_j, size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); +} + +TEST(Parallelize2DTile1DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), + [&indicators](size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile1DDynamic, EachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), + [&counters](size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + TEST(Parallelize2DTile2D, ThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ 
-522,6 +674,98 @@ TEST(Parallelize2DTile2D, EachItemProcessedOnce) { } } +TEST(Parallelize2DTile2DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), [](size_t, size_t, size_t, size_t) {}, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); +} + +TEST(Parallelize2DTile2DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + [](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_GT(tile_j, 0); + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); +} + +TEST(Parallelize2DTile2DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + [&indicators](size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + 
kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ);
+
+  for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) {
+    for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) {
+      const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
+      EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
+          << "Element (" << i << ", " << j << ") not processed";
+    }
+  }
+}
+
+TEST(Parallelize2DTile2DDynamic, EachItemProcessedOnce) {
+  std::vector counters(kParallelize2DTile2DRangeI *
+                       kParallelize2DTile2DRangeJ);
+
+  auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
+  ASSERT_TRUE(threadpool.get());
+
+  pthreadpool_parallelize_2d_tile_2d_dynamic(
+      threadpool.get(),
+      [&counters](size_t start_i, size_t start_j, size_t tile_i,
+                  size_t tile_j) {
+        for (size_t i = start_i; i < start_i + tile_i; i++) {
+          for (size_t j = start_j; j < start_j + tile_j; j++) {
+            const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
+            counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
+          }
+        }
+      },
+      kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ,
+      kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ);
+
+  for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) {
+    for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) {
+      const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j;
+      EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
+          << "Element (" << i << ", " << j << ") was processed "
+          << counters[linear_idx].load(std::memory_order_relaxed)
+          << " times (expected: 1)";
+    }
+  }
+}
+
 TEST(Parallelize3D, ThreadPoolCompletes) {
   auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
   ASSERT_TRUE(threadpool.get());
@@ -852,6 +1096,118 @@ TEST(Parallelize3DTile2D, EachItemProcessedOnce) {
   }
 }
 
+TEST(Parallelize3DTile2DDynamic, ThreadPoolCompletes) {
+  auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy);
+  ASSERT_TRUE(threadpool.get());
+
+  pthreadpool_parallelize_3d_tile_2d_dynamic(
threadpool.get(), [](size_t, size_t, size_t, size_t, size_t) {},
+      kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ,
+      kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ,
+      kParallelize3DTile2DTileK);
+}
+
+TEST(Parallelize3DTile2DDynamic, AllItemsInBounds) {
+  auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy);
+  ASSERT_TRUE(threadpool.get());
+
+  pthreadpool_parallelize_3d_tile_2d_dynamic(
+      threadpool.get(),
+      [](size_t i, size_t start_j, size_t start_k, size_t tile_j,
+         size_t tile_k) {
+        EXPECT_GT(tile_j, 0);
+        EXPECT_GT(tile_k, 0);
+        EXPECT_LT(i, kParallelize3DTile2DRangeI);
+        EXPECT_LT(start_j, kParallelize3DTile2DRangeJ);
+        EXPECT_LT(start_k, kParallelize3DTile2DRangeK);
+        EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0);
+        EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0);
+        EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ);
+        EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK);
+      },
+      kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ,
+      kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ,
+      kParallelize3DTile2DTileK);
+}
+
+TEST(Parallelize3DTile2DDynamic, AllItemsProcessed) {
+  std::vector indicators(kParallelize3DTile2DRangeI *
+                         kParallelize3DTile2DRangeJ *
+                         kParallelize3DTile2DRangeK);
+
+  auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy);
+  ASSERT_TRUE(threadpool.get());
+
+  pthreadpool_parallelize_3d_tile_2d_dynamic(
+      threadpool.get(),
+      [&indicators](size_t i, size_t start_j, size_t start_k, size_t tile_j,
+                    size_t tile_k) {
+        for (size_t j = start_j; j < start_j + tile_j; j++) {
+          for (size_t k = start_k; k < start_k + tile_k; k++) {
+            const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) *
+                                          kParallelize3DTile2DRangeK +
+                                      k;
+            indicators[linear_idx].store(true, std::memory_order_relaxed);
+          }
+        }
+      },
+      kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ,
+      kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ,
+      kParallelize3DTile2DTileK);
+
+  for (size_t i = 0; i <
kParallelize3DTile2DRangeI; i++) {
+    for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
+      for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) {
+        const size_t linear_idx =
+            (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK +
+            k;
+        EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed))
+            << "Element (" << i << ", " << j << ", " << k << ") not processed";
+      }
+    }
+  }
+}
+
+TEST(Parallelize3DTile2DDynamic, EachItemProcessedOnce) {
+  std::vector counters(kParallelize3DTile2DRangeI *
+                       kParallelize3DTile2DRangeJ *
+                       kParallelize3DTile2DRangeK);
+
+  auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy);
+  ASSERT_TRUE(threadpool.get());
+
+  pthreadpool_parallelize_3d_tile_2d_dynamic(
+      threadpool.get(),
+      [&counters](size_t i, size_t start_j, size_t start_k, size_t tile_j,
+                  size_t tile_k) {
+        for (size_t j = start_j; j < start_j + tile_j; j++) {
+          for (size_t k = start_k; k < start_k + tile_k; k++) {
+            const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) *
+                                          kParallelize3DTile2DRangeK +
+                                      k;
+            counters[linear_idx].fetch_add(1, std::memory_order_relaxed);
+          }
+        }
+      },
+      kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ,
+      kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ,
+      kParallelize3DTile2DTileK);
+
+  for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) {
+    for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) {
+      for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) {
+        const size_t linear_idx =
+            (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK +
+            k;
+        EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1)
+            << "Element (" << i << ", " << j << ", " << k << ") was processed "
+            << counters[linear_idx].load(std::memory_order_relaxed)
+            << " times (expected: 1)";
+      }
+    }
+  }
+}
+
 TEST(Parallelize4D, ThreadPoolCompletes) {
   auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy);
   ASSERT_TRUE(threadpool.get());
diff --git a/test/pthreadpool.cc
b/test/pthreadpool.cc index d212e97..915248a 100644 --- a/test/pthreadpool.cc +++ b/test/pthreadpool.cc @@ -1186,6 +1186,288 @@ TEST(Parallelize1DTile1D, MultiThreadPoolWorkStealing) { kParallelize1DTile1DRange); } +static void ComputeNothing1DDynamic(void*, size_t, size_t) {} + +TEST(Parallelize1DDynamic, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic(threadpool.get(), ComputeNothing1DDynamic, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); +} + +TEST(Parallelize1DDynamic, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic(threadpool.get(), ComputeNothing1DDynamic, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); +} + +static void CheckBounds1DDynamic(void*, size_t start_i, size_t tile_i) { + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); +} + +TEST(Parallelize1DDynamic, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic(threadpool.get(), CheckBounds1DDynamic, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); +} + +TEST(Parallelize1DDynamic, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic(threadpool.get(), CheckBounds1DDynamic, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); +} + +static void
CheckTiling1DDynamic(void*, size_t start_i, size_t tile_i) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize1DTile1DRange); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); +} + +TEST(Parallelize1DDynamic, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic(threadpool.get(), CheckTiling1DDynamic, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); +} + +TEST(Parallelize1DDynamic, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic(threadpool.get(), CheckTiling1DDynamic, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); +} + +static void SetTrue1DDynamic(std::atomic_bool* processed_indicators, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_indicators[i].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue1DDynamic), + static_cast(indicators.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +TEST(Parallelize1DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); +
ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue1DDynamic), + static_cast(indicators.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +static void Increment1DDynamic(std::atomic_int* processed_counters, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } +} + +TEST(Parallelize1DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { +
EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } +} + +TEST(Parallelize1DDynamic, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +TEST(Parallelize1DDynamic, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +static void IncrementSame1DDynamic(std::atomic_int*
num_processed_items, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(IncrementSame1DDynamic), + static_cast(&num_processed_items), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DTile1DRange); +} + +static void WorkImbalance1DDynamic(std::atomic_int* num_processed_items, + size_t start_i, size_t tile_i) { + num_processed_items->fetch_add(tile_i, std::memory_order_relaxed); + if (start_i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize1DTile1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize1DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_dynamic( + threadpool.get(), + reinterpret_cast(WorkImbalance1DDynamic), + static_cast(&num_processed_items), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DTile1DRange); +} + static void ComputeNothing2D(void*, size_t, size_t) {} TEST(Parallelize2D, SingleThreadPoolCompletes) { @@ -2082,20 +2364,19 @@
TEST(Parallelize2DTile1D, MultiThreadPoolWorkStealing) { kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void ComputeNothing2DTile1DWithUArch(void*, uint32_t, size_t, size_t, - size_t) {} +static void ComputeNothing2DTile1DDynamic(void*, size_t, size_t, size_t) {} -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolCompletes) { +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), ComputeNothing2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), ComputeNothing2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolCompletes) { +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2103,61 +2384,30 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolCompletes) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), ComputeNothing2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); -} - -static void CheckUArch2DTile1DWithUArch(void*, uint32_t uarch_index, size_t, - size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } -} - -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_1d_with_uarch( - 
threadpool.get(), CheckUArch2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); -} - -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), CheckUArch2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), ComputeNothing2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckBounds2DTile1DWithUArch(void*, uint32_t, size_t i, - size_t start_j, size_t tile_j) { +static void CheckBounds2DTile1DDynamic(void*, size_t i, size_t start_j, + size_t tile_j) { EXPECT_LT(i, kParallelize2DTile1DRangeI); EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); } -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), CheckBounds2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckBounds2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArch, 
MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2165,32 +2415,30 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), CheckBounds2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckBounds2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckTiling2DTile1DWithUArch(void*, uint32_t, size_t i, - size_t start_j, size_t tile_j) { +static void CheckTiling2DTile1DDynamic(void*, size_t i, size_t start_j, + size_t tile_j) { EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_LE(tile_j, kParallelize2DTile1DRangeJ); EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, - kParallelize2DTile1DRangeJ - start_j)); } -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolUniformTiling) { +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), CheckTiling2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckTiling2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUniformTiling) { 
+TEST(Parallelize2DTile1DDynamic, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2198,35 +2446,32 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), CheckTiling2DTile1DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, - kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckTiling2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -static void SetTrue2DTile1DWithUArch(std::atomic_bool* processed_indicators, - uint32_t, size_t i, size_t start_j, - size_t tile_j) { +static void SetTrue2DTile1DDynamic(std::atomic_bool* processed_indicators, + size_t i, size_t start_j, size_t tile_j) { for (size_t j = start_j; j < start_j + tile_j; j++) { const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } } -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - SetTrue2DTile1DWithUArch), - static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast(SetTrue2DTile1DDynamic), + static_cast(indicators.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 
/* flags */); for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { @@ -2237,7 +2482,7 @@ TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { } } -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2248,13 +2493,11 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - SetTrue2DTile1DWithUArch), - static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast(SetTrue2DTile1DDynamic), + static_cast(indicators.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { @@ -2265,29 +2508,26 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { } } -static void Increment2DTile1DWithUArch(std::atomic_int* processed_counters, - uint32_t, size_t i, size_t start_j, - size_t tile_j) { +static void Increment2DTile1DDynamic(std::atomic_int* processed_counters, + size_t i, size_t start_j, size_t tile_j) { for (size_t j = start_j; j < start_j + tile_j; j++) { const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast(Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { @@ -2300,7 +2540,7 @@ TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2311,13 +2551,11 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast(Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { @@ -2330,7 +2568,7 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize2DTile1DWithUArch, +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolEachItemProcessedMultipleTimes) { 
std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2339,13 +2577,12 @@ TEST(Parallelize2DTile1DWithUArch, ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast( + Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { @@ -2360,7 +2597,7 @@ TEST(Parallelize2DTile1DWithUArch, } } -TEST(Parallelize2DTile1DWithUArch, +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2373,13 +2610,12 @@ TEST(Parallelize2DTile1DWithUArch, } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast( + Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { @@ -2394,15 +2630,15 @@ TEST(Parallelize2DTile1DWithUArch, } } -static void IncrementSame2DTile1DWithUArch(std::atomic_int* num_processed_items, - uint32_t, size_t i, size_t start_j, - size_t tile_j) { +static void 
IncrementSame2DTile1DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t tile_j) { for (size_t j = start_j; j < start_j + tile_j; j++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolHighContention) { +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -2412,20 +2648,19 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - IncrementSame2DTile1DWithUArch), - static_cast(&num_processed_items), kDefaultUArchIndex, - kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast( + IncrementSame2DTile1DDynamic), + static_cast(&num_processed_items), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void WorkImbalance2DTile1DWithUArch(std::atomic_int* num_processed_items, - uint32_t, size_t i, size_t start_j, - size_t tile_j) { +static void WorkImbalance2DTile1DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t tile_j) { num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); if (i == 0 && start_j == 0) { /* Spin-wait until all items are computed */ @@ -2436,7 +2671,7 @@ static void WorkImbalance2DTile1DWithUArch(std::atomic_int* num_processed_items, } } -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolWorkStealing) { +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -2446,31 +2681,30 @@ TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_parallelize_2d_tile_1d( threadpool.get(), - reinterpret_cast( - WorkImbalance2DTile1DWithUArch), - static_cast(&num_processed_items), kDefaultUArchIndex, - kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + reinterpret_cast( + WorkImbalance2DTile1DDynamic), + static_cast(&num_processed_items), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void ComputeNothing2DTile1DWithUArchWithThread(void*, uint32_t, size_t, - size_t, size_t, size_t) {} +static void ComputeNothing2DTile1DWithUArch(void*, uint32_t, size_t, size_t, + size_t) {} -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolCompletes) { +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), ComputeNothing2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2478,31 +2712,30 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { GTEST_SKIP(); } - 
pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), ComputeNothing2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckUArch2DTile1DWithUArchWithThread(void*, uint32_t uarch_index, - size_t, size_t, size_t, - size_t) { +static void CheckUArch2DTile1DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t) { if (uarch_index != kDefaultUArchIndex) { EXPECT_LE(uarch_index, kMaxUArchIndex); } } -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolUArchInBounds) { +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolUArchInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckUArch2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckUArch2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUArchInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2510,31 +2743,30 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckUArch2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckUArch2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, 
kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckBounds2DTile1DWithUArchWithThread(void*, uint32_t, size_t, - size_t i, size_t start_j, - size_t tile_j) { +static void CheckBounds2DTile1DWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t tile_j) { EXPECT_LT(i, kParallelize2DTile1DRangeI); EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); } -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckBounds2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckBounds2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2542,15 +2774,14 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckBounds2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckBounds2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckTiling2DTile1DWithUArchWithThread(void*, uint32_t, size_t, - size_t i, size_t start_j, - size_t tile_j) { +static void 
CheckTiling2DTile1DWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t tile_j) { EXPECT_GT(tile_j, 0); EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); @@ -2558,17 +2789,17 @@ static void CheckTiling2DTile1DWithUArchWithThread(void*, uint32_t, size_t, kParallelize2DTile1DRangeJ - start_j)); } -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolUniformTiling) { +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckTiling2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckTiling2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2576,33 +2807,32 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckTiling2DTile1DWithUArchWithThread, nullptr, + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckTiling2DTile1DWithUArch, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void SetTrue2DTile1DWithUArchWithThread( - std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, - size_t start_j, size_t tile_j) { +static void SetTrue2DTile1DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t i, size_t start_j, + 
size_t tile_j) { for (size_t j = start_j; j < start_j + tile_j; j++) { const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } } -TEST(Parallelize2DTile1DWithUArchWithThread, - SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - SetTrue2DTile1DWithUArchWithThread), + reinterpret_cast( + SetTrue2DTile1DWithUArch), static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2616,7 +2846,7 @@ TEST(Parallelize2DTile1DWithUArchWithThread, } } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2627,10 +2857,10 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - SetTrue2DTile1DWithUArchWithThread), + reinterpret_cast( + SetTrue2DTile1DWithUArch), static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2644,27 +2874,26 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { } } -static void Increment2DTile1DWithUArchWithThread( - std::atomic_int* processed_counters, uint32_t, 
size_t, size_t i, - size_t start_j, size_t tile_j) { +static void Increment2DTile1DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t i, size_t start_j, + size_t tile_j) { for (size_t j = start_j; j < start_j + tile_j; j++) { const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize2DTile1DWithUArchWithThread, - SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArchWithThread), + reinterpret_cast( + Increment2DTile1DWithUArch), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2680,8 +2909,7 @@ TEST(Parallelize2DTile1DWithUArchWithThread, } } -TEST(Parallelize2DTile1DWithUArchWithThread, - MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2692,10 +2920,10 @@ TEST(Parallelize2DTile1DWithUArchWithThread, GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArchWithThread), + reinterpret_cast( + Increment2DTile1DWithUArch), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2711,7 +2939,7 @@ 
TEST(Parallelize2DTile1DWithUArchWithThread, } } -TEST(Parallelize2DTile1DWithUArchWithThread, +TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2720,10 +2948,10 @@ TEST(Parallelize2DTile1DWithUArchWithThread, ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArchWithThread), + reinterpret_cast( + Increment2DTile1DWithUArch), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2741,7 +2969,7 @@ TEST(Parallelize2DTile1DWithUArchWithThread, } } -TEST(Parallelize2DTile1DWithUArchWithThread, +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); @@ -2754,10 +2982,10 @@ TEST(Parallelize2DTile1DWithUArchWithThread, } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment2DTile1DWithUArchWithThread), + reinterpret_cast( + Increment2DTile1DWithUArch), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2775,15 +3003,15 @@ TEST(Parallelize2DTile1DWithUArchWithThread, } } -static void IncrementSame2DTile1DWithUArchWithThread( - std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, - size_t start_j, size_t tile_j) { +static void IncrementSame2DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, 
size_t i, size_t start_j, + size_t tile_j) { for (size_t j = start_j; j < start_j + tile_j; j++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -2793,10 +3021,10 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - IncrementSame2DTile1DWithUArchWithThread), + reinterpret_cast( + IncrementSame2DTile1DWithUArch), static_cast(&num_processed_items), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2804,9 +3032,9 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void WorkImbalance2DTile1DWithUArchWithThread( - std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, - size_t start_j, size_t tile_j) { +static void WorkImbalance2DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t start_j, + size_t tile_j) { num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); if (i == 0 && start_j == 0) { /* Spin-wait until all items are computed */ @@ -2817,7 +3045,7 @@ static void WorkImbalance2DTile1DWithUArchWithThread( } } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { +TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -2827,10 +3055,10 @@ 
TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_2d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - WorkImbalance2DTile1DWithUArchWithThread), + reinterpret_cast( + WorkImbalance2DTile1DWithUArch), static_cast(&num_processed_items), kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); @@ -2838,42 +3066,52 @@ TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void SetThreadTrue2DTile1DWithUArchWithThread(const size_t* num_threads, - uint32_t, - size_t thread_index, - size_t i, size_t start_j, - size_t tile_j) { - EXPECT_LE(thread_index, *num_threads); +static void ComputeNothing2DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t, size_t, size_t) {} + +TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolThreadIndexValid) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast( - 
SetThreadTrue2DTile1DWithUArchWithThread), - static_cast(&num_threads), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, - kParallelize2DTile1DTileJ, 0 /* flags */); + threadpool.get(), ComputeNothing2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void ComputeNothing2DTile2D(void*, size_t, size_t, size_t, size_t) {} +static void CheckUArch2DTile1DWithUArchWithThread(void*, uint32_t uarch_index, + size_t, size_t, size_t, + size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } +} -TEST(Parallelize2DTile2D, SingleThreadPoolCompletes) { +TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolUArchInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), ComputeNothing2DTile2D, nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2881,31 +3119,31 @@ TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), ComputeNothing2DTile2D, nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + 
pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, - size_t tile_i, size_t tile_j) { - EXPECT_LT(start_i, kParallelize2DTile2DRangeI); - EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); - EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); - EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +static void CheckBounds2DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t start_j, + size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); } -TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), CheckBounds2DTile2D, nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2913,38 +3151,33 @@ TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( 
- threadpool.get(), CheckBounds2DTile2D, nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, - size_t tile_i, size_t tile_j) { - EXPECT_GT(tile_i, 0); - EXPECT_LE(tile_i, kParallelize2DTile2DTileI); - EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); - EXPECT_EQ(tile_i, std::min(kParallelize2DTile2DTileI, - kParallelize2DTile2DRangeI - start_i)); - +static void CheckTiling2DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t start_j, + size_t tile_j) { EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, - kParallelize2DTile2DRangeJ - start_j)); + EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, + kParallelize2DTile1DRangeJ - start_j)); } -TEST(Parallelize2DTile2D, SingleThreadPoolUniformTiling) { +TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), CheckTiling2DTile2D, nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + 
kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -2952,49 +3185,49 @@ TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), CheckTiling2DTile2D, nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, 0 /* flags */); } -static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, - size_t start_i, size_t start_j, size_t tile_i, - size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue2DTile1DWithUArchWithThread( + std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } } -TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * - kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, + SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(SetTrue2DTile2D), - static_cast(indicators.data()), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + SetTrue2DTile1DWithUArchWithThread), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) << "Element (" << i << ", " << j << ") not processed"; } } } -TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * - kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3003,50 +3236,51 @@ TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(SetTrue2DTile2D), - static_cast(indicators.data()), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); - - for (size_t i = 0; i 
< kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + reinterpret_cast( + SetTrue2DTile1DWithUArchWithThread), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) << "Element (" << i << ", " << j << ") not processed"; } } } -static void Increment2DTile2D(std::atomic_int* processed_counters, - size_t start_i, size_t start_j, size_t tile_i, - size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } +static void Increment2DTile1DWithUArchWithThread( + std::atomic_int* processed_counters, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * - kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - 
reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) << "Element (" << i << ", " << j << ") was processed " << counters[linear_idx].load(std::memory_order_relaxed) @@ -3055,9 +3289,10 @@ TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * - kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3066,16 +3301,17 @@ TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + 
static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) << "Element (" << i << ", " << j << ") was processed " << counters[linear_idx].load(std::memory_order_relaxed) @@ -3084,25 +3320,27 @@ TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile2DRangeI * - kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; 
j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) << "Element (" << i << ", " << j << ") was processed " @@ -3112,9 +3350,10 @@ TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { } } -TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile2DRangeI * - kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3124,17 +3363,18 @@ TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t 
linear_idx = i * kParallelize2DTile1DRangeJ + j; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) << "Element (" << i << ", " << j << ") was processed " @@ -3144,17 +3384,15 @@ TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { } } -static void IncrementSame2DTile2D(std::atomic_int* num_processed_items, - size_t start_i, size_t start_j, size_t tile_i, - size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } +static void IncrementSame2DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -3164,30 +3402,31 @@ TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(IncrementSame2DTile2D), - static_cast(&num_processed_items), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + IncrementSame2DTile1DWithUArchWithThread), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), - kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + kParallelize2DTile1DRangeI * 
kParallelize2DTile1DRangeJ); } -static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, - size_t start_i, size_t start_j, size_t tile_i, - size_t tile_j) { - num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); - if (start_i == 0 && start_j == 0) { +static void WorkImbalance2DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); + if (i == 0 && start_j == 0) { /* Spin-wait until all items are computed */ while (num_processed_items->load(std::memory_order_relaxed) != - kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { std::atomic_thread_fence(std::memory_order_acquire); } } } -TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -3197,64 +3436,53 @@ TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d( + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast(WorkImbalance2DTile2D), - static_cast(&num_processed_items), kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast( + WorkImbalance2DTile1DWithUArchWithThread), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), - kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void 
ComputeNothing2DTile2DWithUArch(void*, uint32_t, size_t, size_t, - size_t, size_t) {} - -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); +static void SetThreadTrue2DTile1DWithUArchWithThread(const size_t* num_threads, + uint32_t, + size_t thread_index, + size_t i, size_t start_j, + size_t tile_j) { + EXPECT_LE(thread_index, *num_threads); } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolCompletes) { +TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolThreadIndexValid) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetThreadTrue2DTile1DWithUArchWithThread), + static_cast(&num_threads), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, 0 /* flags */); } -static void CheckUArch2DTile2DWithUArch(void*, uint32_t uarch_index, size_t, - size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } -} +static void ComputeNothing2DTile2D(void*, size_t, 
size_t, size_t, size_t) {} -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUArchInBounds) { +TEST(Parallelize2DTile2D, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), ComputeNothing2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUArchInBounds) { +TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3262,34 +3490,31 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUArchInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), ComputeNothing2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -static void CheckBounds2DTile2DWithUArch(void*, uint32_t, size_t start_i, - size_t start_j, size_t tile_i, - size_t tile_j) { +static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { EXPECT_LT(start_i, kParallelize2DTile2DRangeI); EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); EXPECT_LE(start_j + 
tile_j, kParallelize2DTile2DRangeJ); } -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckBounds2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3297,16 +3522,14 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckBounds2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -static void CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, - size_t start_j, size_t tile_i, - size_t tile_j) { +static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { EXPECT_GT(tile_i, 0); EXPECT_LE(tile_i, kParallelize2DTile2DTileI); EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); @@ -3320,18 +3543,17 @@ 
static void CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, kParallelize2DTile2DRangeJ - start_j)); } -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUniformTiling) { +TEST(Parallelize2DTile2D, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckTiling2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUniformTiling) { +TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3339,16 +3561,15 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, - kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, - kParallelize2DTile2DTileJ, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckTiling2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, - uint32_t, size_t start_i, size_t start_j, - size_t tile_i, size_t tile_j) { +static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { for (size_t i = 
start_i; i < start_i + tile_i; i++) { for (size_t j = start_j; j < start_j + tile_j; j++) { const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; @@ -3357,20 +3578,19 @@ static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, } } -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - SetTrue2DTile2DWithUArch), - static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(SetTrue2DTile2D), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { @@ -3381,7 +3601,7 @@ TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) { } } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); @@ -3392,13 +3612,12 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - SetTrue2DTile2DWithUArch), - static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, 
kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(SetTrue2DTile2D), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { @@ -3409,9 +3628,9 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { } } -static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, - uint32_t, size_t start_i, size_t start_j, - size_t tile_i, size_t tile_j) { +static void Increment2DTile2D(std::atomic_int* processed_counters, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { for (size_t i = start_i; i < start_i + tile_i; i++) { for (size_t j = start_j; j < start_j + tile_j; j++) { const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; @@ -3420,20 +3639,19 @@ static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, } } -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment2DTile2DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { for (size_t j = 0; j < 
kParallelize2DTile2DRangeJ; j++) { @@ -3446,7 +3664,7 @@ TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); @@ -3457,13 +3675,12 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment2DTile2DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { @@ -3476,8 +3693,7 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize2DTile2DWithUArch, - SingleThreadPoolEachItemProcessedMultipleTimes) { +TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); @@ -3485,13 +3701,12 @@ TEST(Parallelize2DTile2DWithUArch, ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment2DTile2DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* 
flags */); + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); } for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { @@ -3506,8 +3721,7 @@ TEST(Parallelize2DTile2DWithUArch, } } -TEST(Parallelize2DTile2DWithUArch, - MultiThreadPoolEachItemProcessedMultipleTimes) { +TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); @@ -3519,13 +3733,12 @@ TEST(Parallelize2DTile2DWithUArch, } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment2DTile2DWithUArch), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); } for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { @@ -3540,10 +3753,9 @@ TEST(Parallelize2DTile2DWithUArch, } } -static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, - uint32_t, size_t start_i, - size_t start_j, size_t tile_i, - size_t tile_j) { +static void IncrementSame2DTile2D(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { for (size_t i = start_i; i < start_i + tile_i; i++) { for (size_t j = start_j; j < start_j + tile_j; j++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); @@ -3551,7 +3763,7 @@ static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, } } 
-TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolHighContention) { +TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -3561,21 +3773,19 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - IncrementSame2DTile2DWithUArch), - static_cast(&num_processed_items), kDefaultUArchIndex, - kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(IncrementSame2DTile2D), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); } -static void WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, - uint32_t, size_t start_i, - size_t start_j, size_t tile_i, - size_t tile_j) { +static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); if (start_i == 0 && start_j == 0) { /* Spin-wait until all items are computed */ @@ -3586,7 +3796,7 @@ static void WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, } } -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolWorkStealing) { +TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -3596,29 +3806,29 @@ TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - 
pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_parallelize_2d_tile_2d( threadpool.get(), - reinterpret_cast( - WorkImbalance2DTile2DWithUArch), - static_cast(&num_processed_items), kDefaultUArchIndex, - kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + reinterpret_cast(WorkImbalance2DTile2D), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); } -static void ComputeNothing3D(void*, size_t, size_t, size_t) {} +static void ComputeNothing2DDynamic(void*, size_t, size_t, size_t, size_t) {} -TEST(Parallelize3D, SingleThreadPoolCompletes) { +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d(threadpool.get(), ComputeNothing3D, nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, - kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), ComputeNothing2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -TEST(Parallelize3D, MultiThreadPoolCompletes) { +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3626,27 +3836,31 @@ TEST(Parallelize3D, MultiThreadPoolCompletes) { GTEST_SKIP(); } - pthreadpool_parallelize_3d(threadpool.get(), ComputeNothing3D, nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, - kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), 
ComputeNothing2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -static void CheckBounds3D(void*, size_t i, size_t j, size_t k) { - EXPECT_LT(i, kParallelize3DRangeI); - EXPECT_LT(j, kParallelize3DRangeJ); - EXPECT_LT(k, kParallelize3DRangeK); +static void CheckBounds2DDynamic(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); } -TEST(Parallelize3D, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d(threadpool.get(), CheckBounds3D, nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, - kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckBounds2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -TEST(Parallelize3D, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3654,46 +3868,34 @@ TEST(Parallelize3D, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d(threadpool.get(), CheckBounds3D, nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, - kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckBounds2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags 
*/); } -static void SetTrue3D(std::atomic_bool* processed_indicators, size_t i, - size_t j, size_t k) { - const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); -} +static void CheckTiling2DDynamic(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DRangeI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); -TEST(Parallelize3D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators( - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DRangeJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); +} +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d( - threadpool.get(), reinterpret_cast(SetTrue3D), - static_cast(indicators.data()), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckTiling2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); } -TEST(Parallelize3D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators( - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - +TEST(Parallelize2DTile2DDynamic, 
MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3701,19 +3903,768 @@ TEST(Parallelize3D, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_3d( - threadpool.get(), reinterpret_cast(SetTrue3D), - static_cast(indicators.data()), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckTiling2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); +} - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } +static void SetTrue2DDynamic(std::atomic_bool* processed_indicators, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue2DDynamic), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + 
+ for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue2DDynamic), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DDynamic(std::atomic_int* processed_counters, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + 
threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < 
kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void 
IncrementSame2DDynamic(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(IncrementSame2DDynamic), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void WorkImbalance2DDynamic(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(WorkImbalance2DDynamic), + 
static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void ComputeNothing2DTile2DWithUArch(void*, uint32_t, size_t, size_t, + size_t, size_t) {} + +TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +static void CheckUArch2DTile2DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } +} + +TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, 
kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +static void CheckBounds2DTile2DWithUArch(void*, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +} + +TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +static void 
CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DTileI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(tile_i, std::min(kParallelize2DTile2DTileI, + kParallelize2DTile2DRangeI - start_i)); + + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, + kParallelize2DTile2DRangeJ - start_j)); +} + +TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, 0 /* flags */); +} + +static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + 
+TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, + 
uint32_t, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, 
kParallelize2DTile2DTileJ, 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile2DWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration 
= 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame2DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void 
WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void ComputeNothing3D(void*, size_t, size_t, size_t) {} + +TEST(Parallelize3D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d(threadpool.get(), ComputeNothing3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, 0 /* flags */); +} + +TEST(Parallelize3D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d(threadpool.get(), ComputeNothing3D, nullptr, 
+ kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, 0 /* flags */); +} + +static void CheckBounds3D(void*, size_t i, size_t j, size_t k) { + EXPECT_LT(i, kParallelize3DRangeI); + EXPECT_LT(j, kParallelize3DRangeJ); + EXPECT_LT(k, kParallelize3DRangeK); +} + +TEST(Parallelize3D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d(threadpool.get(), CheckBounds3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, 0 /* flags */); +} + +TEST(Parallelize3D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d(threadpool.get(), CheckBounds3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, 0 /* flags */); +} + +static void SetTrue3D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +} + +TEST(Parallelize3D, SingleThreadPoolAllItemsProcessed) { + std::vector indicators( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(SetTrue3D), + static_cast(indicators.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * 
kParallelize3DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +TEST(Parallelize3D, MultiThreadPoolAllItemsProcessed) { + std::vector indicators( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(SetTrue3D), + static_cast(indicators.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } } } } @@ -3725,23 +4676,369 @@ static void Increment3D(std::atomic_int* processed_counters, size_t i, size_t j, processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } -TEST(Parallelize3D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters( - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); +TEST(Parallelize3D, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + 
for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + 
kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); +} + 
+TEST(Parallelize3D, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d( + threadpool.get(), + reinterpret_cast(IncrementSame3D), + static_cast(&num_processed_items), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); +} + +static void WorkImbalance3D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize3D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d( + threadpool.get(), + reinterpret_cast(WorkImbalance3D), + static_cast(&num_processed_items), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); +} + +static void ComputeNothing3DTile1D(void*, size_t, size_t, size_t, size_t) {} + +TEST(Parallelize3DTile1D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), ComputeNothing3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); +} + +TEST(Parallelize3DTile1D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), ComputeNothing3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); +} + +static void CheckBounds3DTile1D(void*, size_t i, size_t j, size_t start_k, + size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile1DRangeI); + EXPECT_LT(j, kParallelize3DTile1DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile1DRangeK); + EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); +} + +TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckBounds3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); +} + +TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckBounds3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); +} + +static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, + 
size_t tile_k) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile1DTileK); + EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, + kParallelize3DTile1DRangeK - start_k)); +} + +TEST(Parallelize3DTile1D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckTiling3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); +} + +TEST(Parallelize3DTile1D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckTiling3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); +} + +static void SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue3DTile1D), + static_cast(indicators.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + 
kParallelize3DTile1DTileK, 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue3DTile1D), + static_cast(indicators.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) { + 
std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d( - threadpool.get(), reinterpret_cast(Increment3D), - static_cast(counters.data()), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) << "Element (" << i << ", " << j << ", " << k << ") was processed " << counters[linear_idx].load(std::memory_order_relaxed) @@ -3751,9 +5048,10 @@ TEST(Parallelize3D, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters( - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); +TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3762,16 +5060,19 @@ TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); 
} - pthreadpool_parallelize_3d( - threadpool.get(), reinterpret_cast(Increment3D), - static_cast(counters.data()), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) << "Element (" << i << ", " << j << ", " << k << ") was processed " << counters[linear_idx].load(std::memory_order_relaxed) @@ -3781,25 +5082,29 @@ TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters( - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); +TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d( - threadpool.get(), reinterpret_cast(Increment3D), - static_cast(counters.data()), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + 
pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) << "Element (" << i << ", " << j << ", " << k << ") was processed " @@ -3810,9 +5115,10 @@ TEST(Parallelize3D, SingleThreadPoolEachItemProcessedMultipleTimes) { } } -TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters( - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); +TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3822,17 +5128,20 @@ TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) { } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d( - threadpool.get(), reinterpret_cast(Increment3D), - static_cast(counters.data()), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), 
kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) << "Element (" << i << ", " << j << ", " << k << ") was processed " @@ -3843,12 +5152,15 @@ TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) { } } -static void IncrementSame3D(std::atomic_int* num_processed_items, size_t i, - size_t j, size_t k) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); +static void IncrementSame3DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } -TEST(Parallelize3D, MultiThreadPoolHighContention) { +TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -3858,28 +5170,32 @@ TEST(Parallelize3D, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_3d( + pthreadpool_parallelize_3d_tile_1d( threadpool.get(), - reinterpret_cast(IncrementSame3D), - static_cast(&num_processed_items), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + reinterpret_cast(IncrementSame3DTile1D), + static_cast(&num_processed_items), 
kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); } -static void WorkImbalance3D(std::atomic_int* num_processed_items, size_t i, - size_t j, size_t k) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0) { +static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t tile_k) { + num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0) { /* Spin-wait until all items are computed */ while (num_processed_items->load(std::memory_order_relaxed) != - kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK) { + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK) { std::atomic_thread_fence(std::memory_order_acquire); } } } -TEST(Parallelize3D, MultiThreadPoolWorkStealing) { +TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -3889,28 +5205,31 @@ TEST(Parallelize3D, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_3d( + pthreadpool_parallelize_3d_tile_1d( threadpool.get(), - reinterpret_cast(WorkImbalance3D), - static_cast(&num_processed_items), kParallelize3DRangeI, - kParallelize3DRangeJ, kParallelize3DRangeK, 0 /* flags */); + reinterpret_cast(WorkImbalance3DTile1D), + static_cast(&num_processed_items), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), - 
kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); } -static void ComputeNothing3DTile1D(void*, size_t, size_t, size_t, size_t) {} +static void ComputeNothing3DTile1DWithThread(void*, size_t, size_t, size_t, + size_t, size_t) {} -TEST(Parallelize3DTile1D, SingleThreadPoolCompletes) { +TEST(Parallelize3DTile1DWithThread, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), ComputeNothing3DTile1D, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithThread, nullptr, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1D, MultiThreadPoolCompletes) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3918,31 +5237,31 @@ TEST(Parallelize3DTile1D, MultiThreadPoolCompletes) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), ComputeNothing3DTile1D, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithThread, nullptr, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckBounds3DTile1D(void*, size_t i, size_t j, size_t start_k, - size_t tile_k) { +static void CheckBounds3DTile1DWithThread(void*, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { EXPECT_LT(i, kParallelize3DTile1DRangeI); EXPECT_LT(j, kParallelize3DTile1DRangeJ); EXPECT_LT(start_k, kParallelize3DTile1DRangeK); EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); } -TEST(Parallelize3DTile1D, 
SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), CheckBounds3DTile1D, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckBounds3DTile1DWithThread, nullptr, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -3950,14 +5269,14 @@ TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), CheckBounds3DTile1D, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckBounds3DTile1DWithThread, nullptr, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, - size_t tile_k) { +static void CheckTiling3DTile1DWithThread(void*, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { EXPECT_GT(tile_k, 0); EXPECT_LE(tile_k, kParallelize3DTile1DTileK); EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); @@ -3965,32 +5284,33 @@ static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, kParallelize3DTile1DRangeK - start_k)); } -TEST(Parallelize3DTile1D, SingleThreadPoolUniformTiling) { +TEST(Parallelize3DTile1DWithThread, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), CheckTiling3DTile1D, 
nullptr, + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckTiling3DTile1DWithThread, nullptr, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1D, MultiThreadPoolUniformTiling) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { GTEST_SKIP(); } - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), CheckTiling3DTile1D, nullptr, + + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckTiling3DTile1DWithThread, nullptr, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, - size_t j, size_t start_k, size_t tile_k) { +static void SetTrue3DTile1DWithThread(std::atomic_bool* processed_indicators, + size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; @@ -3998,7 +5318,7 @@ static void SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, } } -TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4006,9 +5326,10 @@ TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(SetTrue3DTile1D), + 
reinterpret_cast( + SetTrue3DTile1DWithThread), static_cast(indicators.data()), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4026,7 +5347,7 @@ TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) { } } -TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4038,9 +5359,10 @@ TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(SetTrue3DTile1D), + reinterpret_cast( + SetTrue3DTile1DWithThread), static_cast(indicators.data()), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4058,8 +5380,9 @@ TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) { } } -static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, - size_t j, size_t start_k, size_t tile_k) { +static void Increment3DTile1DWithThread(std::atomic_int* processed_counters, + size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; @@ -4067,7 +5390,7 @@ static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, } } -TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile1DWithThread, SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4075,9 +5398,10 @@ TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) { auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(Increment3DTile1D), + reinterpret_cast( + Increment3DTile1DWithThread), static_cast(counters.data()), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4097,7 +5421,7 @@ TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4109,9 +5433,10 @@ TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(Increment3DTile1D), + reinterpret_cast( + Increment3DTile1DWithThread), static_cast(counters.data()), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4131,7 +5456,8 @@ TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { +TEST(Parallelize3DTile1DWithThread, + SingleThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4140,9 +5466,10 @@ TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(Increment3DTile1D), + reinterpret_cast( + Increment3DTile1DWithThread), 
static_cast(counters.data()), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4164,7 +5491,8 @@ TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { } } -TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { +TEST(Parallelize3DTile1DWithThread, + MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4177,9 +5505,10 @@ TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(Increment3DTile1D), + reinterpret_cast( + Increment3DTile1DWithThread), static_cast(counters.data()), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4201,15 +5530,15 @@ TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { } } -static void IncrementSame3DTile1D(std::atomic_int* num_processed_items, - size_t i, size_t j, size_t start_k, - size_t tile_k) { +static void IncrementSame3DTile1DWithThread( + std::atomic_int* num_processed_items, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -4219,9 +5548,10 @@ TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d( + 
pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(IncrementSame3DTile1D), + reinterpret_cast( + IncrementSame3DTile1DWithThread), static_cast(&num_processed_items), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4230,9 +5560,9 @@ TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) { kParallelize3DTile1DRangeK); } -static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, - size_t i, size_t j, size_t start_k, - size_t tile_k) { +static void WorkImbalance3DTile1DWithThread( + std::atomic_int* num_processed_items, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); if (i == 0 && j == 0 && start_k == 0) { /* Spin-wait until all items are computed */ @@ -4244,7 +5574,7 @@ static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, } } -TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) { +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -4254,9 +5584,10 @@ TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d( + pthreadpool_parallelize_3d_tile_1d_with_thread( threadpool.get(), - reinterpret_cast(WorkImbalance3DTile1D), + reinterpret_cast( + WorkImbalance3DTile1DWithThread), static_cast(&num_processed_items), kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4265,20 +5596,44 @@ TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) { kParallelize3DTile1DRangeK); } -static void ComputeNothing3DTile1DWithThread(void*, size_t, size_t, size_t, - size_t, size_t) {} +static void CheckThreadIndexValid3DTile1DWithThread(const size_t* num_threads, + size_t 
thread_index, + size_t i, size_t j, + size_t start_k, + size_t tile_k) { + EXPECT_LE(thread_index, *num_threads); +} -TEST(Parallelize3DTile1DWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); +TEST(Parallelize3DTile1DWithThread, MultiThreadPoolThreadIndexValid) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), ComputeNothing3DTile1DWithThread, nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + threadpool.get(), + reinterpret_cast( + CheckThreadIndexValid3DTile1DWithThread), + static_cast(&num_threads), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolCompletes) { +static void ComputeNothing3DTile1DWithUArch(void*, uint32_t, size_t, size_t, + size_t, size_t) {} + +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); +} + +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4286,31 +5641,66 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolCompletes) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), 
ComputeNothing3DTile1DWithThread, nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckBounds3DTile1DWithThread(void*, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void CheckUArch3DTile1DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } +} + +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckUArch3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); +} + +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckUArch3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); +} + +static void CheckBounds3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { EXPECT_LT(i, kParallelize3DTile1DRangeI); EXPECT_LT(j, kParallelize3DTile1DRangeJ); EXPECT_LT(start_k, 
kParallelize3DTile1DRangeK); EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); } -TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), CheckBounds3DTile1DWithThread, nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckBounds3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4318,14 +5708,15 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), CheckBounds3DTile1DWithThread, nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckBounds3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckTiling3DTile1DWithThread(void*, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void CheckTiling3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { EXPECT_GT(tile_k, 0); 
EXPECT_LE(tile_k, kParallelize3DTile1DTileK); EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); @@ -4333,17 +5724,18 @@ static void CheckTiling3DTile1DWithThread(void*, size_t, size_t i, size_t j, kParallelize3DTile1DRangeK - start_k)); } -TEST(Parallelize3DTile1DWithThread, SingleThreadPoolUniformTiling) { +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), CheckTiling3DTile1DWithThread, nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckTiling3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolUniformTiling) { +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4351,15 +5743,16 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), CheckTiling3DTile1DWithThread, nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckTiling3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 0 /* flags */); } -static void SetTrue3DTile1DWithThread(std::atomic_bool* processed_indicators, - size_t, size_t i, size_t j, - size_t 
start_k, size_t tile_k) { +static void SetTrue3DTile1DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; @@ -4367,7 +5760,7 @@ static void SetTrue3DTile1DWithThread(std::atomic_bool* processed_indicators, } } -TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4375,13 +5768,13 @@ TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsProcessed) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - SetTrue3DTile1DWithThread), - static_cast(indicators.data()), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + SetTrue3DTile1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { @@ -4396,7 +5789,7 @@ TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsProcessed) { } } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4408,13 +5801,13 @@ TEST(Parallelize3DTile1DWithThread, 
MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - SetTrue3DTile1DWithThread), - static_cast(indicators.data()), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + SetTrue3DTile1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { @@ -4429,9 +5822,9 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsProcessed) { } } -static void Increment3DTile1DWithThread(std::atomic_int* processed_counters, - size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void Increment3DTile1DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; @@ -4439,7 +5832,7 @@ static void Increment3DTile1DWithThread(std::atomic_int* processed_counters, } } -TEST(Parallelize3DTile1DWithThread, SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4447,13 +5840,13 @@ TEST(Parallelize3DTile1DWithThread, SingleThreadPoolEachItemProcessedOnce) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - 
Increment3DTile1DWithThread), - static_cast(counters.data()), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { @@ -4470,7 +5863,7 @@ TEST(Parallelize3DTile1DWithThread, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4482,13 +5875,13 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithThread), - static_cast(counters.data()), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { @@ -4505,7 +5898,7 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile1DWithThread, +TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { std::vector 
counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * @@ -4515,13 +5908,13 @@ TEST(Parallelize3DTile1DWithThread, ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithThread), - static_cast(counters.data()), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { @@ -4540,7 +5933,7 @@ TEST(Parallelize3DTile1DWithThread, } } -TEST(Parallelize3DTile1DWithThread, +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * @@ -4554,13 +5947,13 @@ TEST(Parallelize3DTile1DWithThread, } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithThread), - static_cast(counters.data()), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { @@ -4579,15 +5972,15 @@ TEST(Parallelize3DTile1DWithThread, } } -static void 
IncrementSame3DTile1DWithThread( - std::atomic_int* num_processed_items, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void IncrementSame3DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolHighContention) { +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -4597,21 +5990,21 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - IncrementSame3DTile1DWithThread), - static_cast(&num_processed_items), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + IncrementSame3DTile1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); } -static void WorkImbalance3DTile1DWithThread( - std::atomic_int* num_processed_items, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void WorkImbalance3DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); if (i == 0 && j == 0 && start_k == 0) { /* Spin-wait until all items are computed */ @@ -4623,7 
+6016,7 @@ static void WorkImbalance3DTile1DWithThread( } } -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolWorkStealing) { +TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -4633,56 +6026,34 @@ TEST(Parallelize3DTile1DWithThread, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_parallelize_3d_tile_1d_with_uarch( threadpool.get(), - reinterpret_cast( - WorkImbalance3DTile1DWithThread), - static_cast(&num_processed_items), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast( + WorkImbalance3DTile1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); } -static void CheckThreadIndexValid3DTile1DWithThread(const size_t* num_threads, - size_t thread_index, - size_t i, size_t j, - size_t start_k, - size_t tile_k) { - EXPECT_LE(thread_index, *num_threads); -} - -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast( - CheckThreadIndexValid3DTile1DWithThread), - static_cast(&num_threads), kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); -} - -static void 
ComputeNothing3DTile1DWithUArch(void*, uint32_t, size_t, size_t, - size_t, size_t) {} +static void ComputeNothing3DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t, size_t, size_t, + size_t) {} -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolCompletes) { +TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), ComputeNothing3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolCompletes) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4690,32 +6061,33 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolCompletes) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), ComputeNothing3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckUArch3DTile1DWithUArch(void*, uint32_t uarch_index, size_t, - size_t, size_t, size_t) { +static void CheckUArch3DTile1DWithUArchWithThread(void*, uint32_t uarch_index, + size_t, size_t, size_t, + size_t, size_t) { if (uarch_index != kDefaultUArchIndex) { EXPECT_LE(uarch_index, kMaxUArchIndex); } } -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolUArchInBounds) 
{ +TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolUArchInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), CheckUArch3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUArchInBounds) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4723,33 +6095,35 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUArchInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), CheckUArch3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckBounds3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void CheckBounds3DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t j, + size_t start_k, + size_t tile_k) { EXPECT_LT(i, kParallelize3DTile1DRangeI); EXPECT_LT(j, kParallelize3DTile1DRangeJ); EXPECT_LT(start_k, kParallelize3DTile1DRangeK); EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); } -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), CheckBounds3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4757,15 +6131,17 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), CheckBounds3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckTiling3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void CheckTiling3DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t j, + size_t start_k, + size_t tile_k) { EXPECT_GT(tile_k, 0); EXPECT_LE(tile_k, kParallelize3DTile1DTileK); EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); @@ -4773,18 +6149,18 @@ static void CheckTiling3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, kParallelize3DTile1DRangeK - start_k)); } -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolUniformTiling) { +TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - 
pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), CheckTiling3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUniformTiling) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -4792,16 +6168,16 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), CheckTiling3DTile1DWithUArch, nullptr, + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling3DTile1DWithUArchWithThread, nullptr, kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void SetTrue3DTile1DWithUArch(std::atomic_bool* processed_indicators, - uint32_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void SetTrue3DTile1DWithUArchWithThread( + std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, + size_t j, size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; @@ -4809,7 +6185,8 @@ static void SetTrue3DTile1DWithUArch(std::atomic_bool* processed_indicators, } } -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile1DWithUArchWithThread, + SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ 
-4817,10 +6194,10 @@ TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - SetTrue3DTile1DWithUArch), + reinterpret_cast( + SetTrue3DTile1DWithUArchWithThread), static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4838,7 +6215,7 @@ TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { } } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4850,10 +6227,10 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - SetTrue3DTile1DWithUArch), + reinterpret_cast( + SetTrue3DTile1DWithUArchWithThread), static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4871,9 +6248,9 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { } } -static void Increment3DTile1DWithUArch(std::atomic_int* processed_counters, - uint32_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void Increment3DTile1DWithUArchWithThread( + std::atomic_int* processed_counters, uint32_t, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { const 
size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; @@ -4881,7 +6258,8 @@ static void Increment3DTile1DWithUArch(std::atomic_int* processed_counters, } } -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4889,10 +6267,10 @@ TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArch), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4912,7 +6290,8 @@ TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); @@ -4924,10 +6303,10 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArch), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, 
kParallelize3DTile1DTileK, 0 /* flags */); @@ -4947,7 +6326,7 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile1DWithUArch, +TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * @@ -4957,10 +6336,10 @@ TEST(Parallelize3DTile1DWithUArch, ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArch), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -4982,7 +6361,7 @@ TEST(Parallelize3DTile1DWithUArch, } } -TEST(Parallelize3DTile1DWithUArch, +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * @@ -4996,10 +6375,10 @@ TEST(Parallelize3DTile1DWithUArch, } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArch), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -5021,15 +6400,15 @@ TEST(Parallelize3DTile1DWithUArch, } } -static void IncrementSame3DTile1DWithUArch(std::atomic_int* num_processed_items, - uint32_t, size_t i, size_t j, - size_t start_k, size_t 
tile_k) { +static void IncrementSame3DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { for (size_t k = start_k; k < start_k + tile_k; k++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); } } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolHighContention) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -5039,10 +6418,10 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - IncrementSame3DTile1DWithUArch), + reinterpret_cast( + IncrementSame3DTile1DWithUArchWithThread), static_cast(&num_processed_items), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -5051,9 +6430,9 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolHighContention) { kParallelize3DTile1DRangeK); } -static void WorkImbalance3DTile1DWithUArch(std::atomic_int* num_processed_items, - uint32_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { +static void WorkImbalance3DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); if (i == 0 && j == 0 && start_k == 0) { /* Spin-wait until all items are computed */ @@ -5065,7 +6444,7 @@ static void WorkImbalance3DTile1DWithUArch(std::atomic_int* num_processed_items, } } -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolWorkStealing) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { std::atomic_int 
num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -5075,10 +6454,10 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( threadpool.get(), - reinterpret_cast( - WorkImbalance3DTile1DWithUArch), + reinterpret_cast( + WorkImbalance3DTile1DWithUArchWithThread), static_cast(&num_processed_items), kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); @@ -5087,56 +6466,42 @@ TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolWorkStealing) { kParallelize3DTile1DRangeK); } -static void ComputeNothing3DTile1DWithUArchWithThread(void*, uint32_t, size_t, - size_t, size_t, size_t, - size_t) {} - -TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), ComputeNothing3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); +static void SetThreadTrue3DTile1DWithUArchWithThread( + const size_t* num_threads, uint32_t, size_t thread_index, size_t i, + size_t j, size_t start_k, size_t tile_k) { + EXPECT_LE(thread_index, *num_threads); } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { +TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolThreadIndexValid) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + size_t num_threads = 
pthreadpool_get_threads_count(threadpool.get()); pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), ComputeNothing3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + threadpool.get(), + reinterpret_cast( + SetThreadTrue3DTile1DWithUArchWithThread), + static_cast(&num_threads), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); } -static void CheckUArch3DTile1DWithUArchWithThread(void*, uint32_t uarch_index, - size_t, size_t, size_t, - size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } -} +static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, + size_t) {} -TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolUArchInBounds) { +TEST(Parallelize3DTile2D, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckUArch3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), ComputeNothing3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { +TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5144,35 +6509,34 @@ 
TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckUArch3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), ComputeNothing3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, 0 /* flags */); } -static void CheckBounds3DTile1DWithUArchWithThread(void*, uint32_t, size_t, - size_t i, size_t j, - size_t start_k, - size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile1DRangeI); - EXPECT_LT(j, kParallelize3DTile1DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile1DRangeK); - EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); +static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); } -TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckBounds3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckBounds3DTile2D, nullptr, + 
kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5180,36 +6544,40 @@ TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckBounds3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckBounds3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, 0 /* flags */); } -static void CheckTiling3DTile1DWithUArchWithThread(void*, uint32_t, size_t, - size_t i, size_t j, - size_t start_k, - size_t tile_k) { +static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, + kParallelize3DTile2DRangeJ - start_j)); + EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile1DTileK); - EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, - kParallelize3DTile1DRangeK - start_k)); + EXPECT_LE(tile_k, kParallelize3DTile2DTileK); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, + kParallelize3DTile2DRangeK - start_k)); } 
-TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolUniformTiling) { +TEST(Parallelize3DTile2D, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckTiling3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckTiling3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, 0 /* flags */); } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { +TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5217,45 +6585,45 @@ TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), CheckTiling3DTile1DWithUArchWithThread, nullptr, - kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, - kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckTiling3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, 0 /* flags */); } -static void SetTrue3DTile1DWithUArchWithThread( - std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, - size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * 
kParallelize3DTile1DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, + size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } } -TEST(Parallelize3DTile1DWithUArchWithThread, - SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * - kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); +TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - SetTrue3DTile1DWithUArchWithThread), - static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(SetTrue3DTile2D), + static_cast(indicators.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; 
k++) { const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) << "Element (" << i << ", " << j << ", " << k << ") not processed"; @@ -5264,10 +6632,10 @@ TEST(Parallelize3DTile1DWithUArchWithThread, } } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * - kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); +TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5276,19 +6644,18 @@ TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - SetTrue3DTile1DWithUArchWithThread), - static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(SetTrue3DTile2D), + static_cast(indicators.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { const size_t linear_idx = - (i * 
kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) << "Element (" << i << ", " << j << ", " << k << ") not processed"; @@ -5297,38 +6664,38 @@ TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { } } -static void Increment3DTile1DWithUArchWithThread( - std::atomic_int* processed_counters, uint32_t, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); +static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, + size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } } -TEST(Parallelize3DTile1DWithUArchWithThread, - SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * - kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); +TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, 
kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) << "Element (" << i << ", " << j << ", " << k << ") was processed " @@ -5339,11 +6706,10 @@ TEST(Parallelize3DTile1DWithUArchWithThread, } } -TEST(Parallelize3DTile1DWithUArchWithThread, - MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * - kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); +TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5352,19 +6718,18 @@ TEST(Parallelize3DTile1DWithUArchWithThread, GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, 
kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) << "Element (" << i << ", " << j << ", " << k << ") was processed " @@ -5375,30 +6740,28 @@ TEST(Parallelize3DTile1DWithUArchWithThread, } } -TEST(Parallelize3DTile1DWithUArchWithThread, - SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * - kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); +TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + 
reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) @@ -5410,11 +6773,10 @@ TEST(Parallelize3DTile1DWithUArchWithThread, } } -TEST(Parallelize3DTile1DWithUArchWithThread, - MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * - kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); +TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5424,20 +6786,19 @@ TEST(Parallelize3DTile1DWithUArchWithThread, } for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(Increment3DTile2D), + 
static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { const size_t linear_idx = - (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) @@ -5449,15 +6810,17 @@ TEST(Parallelize3DTile1DWithUArchWithThread, } } -static void IncrementSame3DTile1DWithUArchWithThread( - std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); +static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { +TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -5467,33 +6830,32 @@ TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( 
threadpool.get(), - reinterpret_cast( - IncrementSame3DTile1DWithUArchWithThread), - static_cast(&num_processed_items), kDefaultUArchIndex, - kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(IncrementSame3DTile2D), + static_cast(&num_processed_items), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), - kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); } -static void WorkImbalance3DTile1DWithUArchWithThread( - std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, - size_t start_k, size_t tile_k) { - num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0) { +static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); + if (i == 0 && start_j == 0 && start_k == 0) { /* Spin-wait until all items are computed */ while (num_processed_items->load(std::memory_order_relaxed) != - kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK) { + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK) { std::atomic_thread_fence(std::memory_order_acquire); } } } -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { +TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -5503,54 +6865,32 @@ 
TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { GTEST_SKIP(); } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast( - WorkImbalance3DTile1DWithUArchWithThread), - static_cast(&num_processed_items), kDefaultUArchIndex, - kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + reinterpret_cast(WorkImbalance3DTile2D), + static_cast(&num_processed_items), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), - kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * - kParallelize3DTile1DRangeK); -} - -static void SetThreadTrue3DTile1DWithUArchWithThread( - const size_t* num_threads, uint32_t, size_t thread_index, size_t i, - size_t j, size_t start_k, size_t tile_k) { - EXPECT_LE(thread_index, *num_threads); -} - -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast( - SetThreadTrue3DTile1DWithUArchWithThread), - static_cast(&num_threads), kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, - kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 0 /* flags */); + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); } -static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, - size_t) {} +static void ComputeNothing3DTile2DDynamic(void*, size_t, size_t, size_t, size_t, + size_t) {} 
-TEST(Parallelize3DTile2D, SingleThreadPoolCompletes) { +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), ComputeNothing3DTile2D, nullptr, + threadpool.get(), ComputeNothing3DTile2DDynamic, nullptr, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } -TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolCompletes) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5559,14 +6899,15 @@ TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) { } pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), ComputeNothing3DTile2D, nullptr, + threadpool.get(), ComputeNothing3DTile2DDynamic, nullptr, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } -static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, - size_t tile_j, size_t tile_k) { +static void CheckBounds3DTile2DDynamic(void*, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { EXPECT_LT(i, kParallelize3DTile2DRangeI); EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); EXPECT_LT(start_k, kParallelize3DTile2DRangeK); @@ -5574,18 +6915,18 @@ static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); } -TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), CheckBounds3DTile2D, 
nullptr, + threadpool.get(), CheckBounds3DTile2DDynamic, nullptr, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } -TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolAllItemsInBounds) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5594,39 +6935,36 @@ TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) { } pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), CheckBounds3DTile2D, nullptr, + threadpool.get(), CheckBounds3DTile2DDynamic, nullptr, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } -static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, - size_t tile_j, size_t tile_k) { +static void CheckTiling3DTile2DDynamic(void*, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); + EXPECT_LE(tile_j, kParallelize3DTile2DRangeJ); EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, - kParallelize3DTile2DRangeJ - start_j)); EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile2DTileK); + EXPECT_LE(tile_k, kParallelize3DTile2DRangeK); EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, - kParallelize3DTile2DRangeK - start_k)); } -TEST(Parallelize3DTile2D, SingleThreadPoolUniformTiling) { +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), CheckTiling3DTile2D, nullptr, + threadpool.get(), CheckTiling3DTile2DDynamic, nullptr, 
kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } -TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolUniformTiling) { auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); ASSERT_TRUE(threadpool.get()); @@ -5635,15 +6973,15 @@ TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) { } pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), CheckTiling3DTile2D, nullptr, + threadpool.get(), CheckTiling3DTile2DDynamic, nullptr, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); } -static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, - size_t start_j, size_t start_k, size_t tile_j, - size_t tile_k) { +static void SetTrue3DTile2DDynamic(std::atomic_bool* processed_indicators, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { for (size_t j = start_j; j < start_j + tile_j; j++) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = @@ -5653,7 +6991,7 @@ static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, } } -TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); @@ -5663,7 +7001,7 @@ TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(SetTrue3DTile2D), + reinterpret_cast(SetTrue3DTile2DDynamic), static_cast(indicators.data()), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); @@ -5681,7 +7019,7 @@ 
TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) { } } -TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolAllItemsProcessed) { std::vector indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); @@ -5695,7 +7033,7 @@ TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(SetTrue3DTile2D), + reinterpret_cast(SetTrue3DTile2DDynamic), static_cast(indicators.data()), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); @@ -5713,9 +7051,9 @@ TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) { } } -static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, - size_t start_j, size_t start_k, size_t tile_j, - size_t tile_k) { +static void Increment3DTile2DDynamic(std::atomic_int* processed_counters, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { for (size_t j = start_j; j < start_j + tile_j; j++) { for (size_t k = start_k; k < start_k + tile_k; k++) { const size_t linear_idx = @@ -5725,7 +7063,7 @@ static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, } } -TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); @@ -5735,7 +7073,7 @@ TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(Increment3DTile2D), + reinterpret_cast(Increment3DTile2DDynamic), static_cast(counters.data()), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags 
*/); @@ -5755,7 +7093,7 @@ TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolEachItemProcessedOnce) { std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); @@ -5769,7 +7107,7 @@ TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(Increment3DTile2D), + reinterpret_cast(Increment3DTile2DDynamic), static_cast(counters.data()), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); @@ -5789,7 +7127,8 @@ TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) { } } -TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { +TEST(Parallelize3DTile2DDynamic, + SingleThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); @@ -5800,7 +7139,8 @@ TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(Increment3DTile2D), + reinterpret_cast( + Increment3DTile2DDynamic), static_cast(counters.data()), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); @@ -5822,7 +7162,8 @@ TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { } } -TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { +TEST(Parallelize3DTile2DDynamic, + MultiThreadPoolEachItemProcessedMultipleTimes) { std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); @@ 
-5837,7 +7178,8 @@ TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(Increment3DTile2D), + reinterpret_cast( + Increment3DTile2DDynamic), static_cast(counters.data()), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); @@ -5859,9 +7201,10 @@ TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { } } -static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, - size_t i, size_t start_j, size_t start_k, - size_t tile_j, size_t tile_k) { +static void IncrementSame3DTile2DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { for (size_t j = start_j; j < start_j + tile_j; j++) { for (size_t k = start_k; k < start_k + tile_k; k++) { num_processed_items->fetch_add(1, std::memory_order_relaxed); @@ -5869,7 +7212,7 @@ static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, } } -TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolHighContention) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -5881,7 +7224,8 @@ TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(IncrementSame3DTile2D), + reinterpret_cast( + IncrementSame3DTile2DDynamic), static_cast(&num_processed_items), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */); @@ -5890,9 +7234,10 @@ TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) { kParallelize3DTile2DRangeK); } -static void 
WorkImbalance3DTile2D(std::atomic_int* num_processed_items, - size_t i, size_t start_j, size_t start_k, - size_t tile_j, size_t tile_k) { +static void WorkImbalance3DTile2DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); if (i == 0 && start_j == 0 && start_k == 0) { /* Spin-wait until all items are computed */ @@ -5904,7 +7249,7 @@ static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, } } -TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) { +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolWorkStealing) { std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); @@ -5916,7 +7261,8 @@ TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) { pthreadpool_parallelize_3d_tile_2d( threadpool.get(), - reinterpret_cast(WorkImbalance3DTile2D), + reinterpret_cast( + WorkImbalance3DTile2DDynamic), static_cast(&num_processed_items), kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, 0 /* flags */);