diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt
index f8f1931f35547..7019f951b25fb 100644
--- a/GPU/CMakeLists.txt
+++ b/GPU/CMakeLists.txt
@@ -22,6 +22,7 @@ add_subdirectory(Common)
 add_subdirectory(Utils)
 add_subdirectory(TPCFastTransformation)
 add_subdirectory(GPUTracking)
+add_subdirectory(GPUbenchmark)
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
   add_subdirectory(Workflow)
 endif()
diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt
new file mode 100644
index 0000000000000..e008ab4cc0f41
--- /dev/null
+++ b/GPU/GPUbenchmark/CMakeLists.txt
@@ -0,0 +1,58 @@
+# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+set(HDRS_INSTALL ../Shared/Kernels.h)
+
+if(CUDA_ENABLED)
+  # add_subdirectory(cuda)
+  o2_add_executable(gpu-memory-benchmark-cuda
+                    SOURCES benchmark.cxx
+                            cuda/Kernels.cu
+                    PUBLIC_LINK_LIBRARIES Boost::program_options
+                                          ROOT::Tree
+                    TARGETVARNAME targetName)
+endif()
+
+if(HIP_ENABLED)
+  # Hipify-perl
+  set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl")
+
+  set(HIP_KERNEL "Kernels.hip.cxx")
+  set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu)
+  set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/hip/${HIP_KERNEL}")
+
+  if(EXISTS ${HIPIFY_EXECUTABLE})
+    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL})
+    message("Generating HIP kernel code ...")
+    execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}")
+  else()
+    message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...")
+  endif()
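+  # Note: hipify-perl prepends '#include "hip/hip_runtime.h"' to its output; the sed
+  # filter above drops that first line because Kernels.cu already includes the header
+  # when __HIPCC__ is defined, so the generated Kernels.hip.cxx would include it twice.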
+
+  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
+
+  set(CMAKE_CXX_EXTENSIONS OFF)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc")
+
+  o2_add_executable(gpu-memory-benchmark-hip
+                    SOURCES benchmark.cxx
+                            hip/Kernels.hip.cxx
+                    PUBLIC_LINK_LIBRARIES hip::host
+                                          Boost::program_options
+                                          ROOT::Tree
+                    TARGETVARNAME targetName)
+
+  if(HIP_AMDGPUTARGET)
+    # Need to add gpu target also to link flags due to gpu-rdc option
+    target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET})
+  endif()
+endif()
\ No newline at end of file
diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h
new file mode 100644
index 0000000000000..a4e7f71440347
--- /dev/null
+++ b/GPU/GPUbenchmark/Shared/Kernels.h
@@ -0,0 +1,86 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file Kernels.h
+/// \author: mconcas@cern.ch
+
+#ifndef GPU_BENCHMARK_KERNELS_H
+#define GPU_BENCHMARK_KERNELS_H
+
+#include "Utils.h"
+#include
+#include
+#include
+#include
+#include
+
+namespace o2
+{
+namespace benchmark
+{
+
+template <class chunk_type>
+class GPUbenchmark final
+{
+ public:
+  GPUbenchmark() = delete; // a configuration is required
+  GPUbenchmark(benchmarkOpts& opts, std::shared_ptr<ResultWriter> rWriter) : mResultWriter{rWriter}, mOptions{opts}
+  {
+  }
+  virtual ~GPUbenchmark() = default;
+  template <typename... T>
+  float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args);
+
+  // Single stream synchronous (sequential kernels) execution
+  template <typename... T>
+  float benchmarkSync(void (*kernel)(T...),
+                      int nLaunches, int blocks, int threads, T&... args);
+
+  // Multi-streams asynchronous executions on whole memory
+  template <typename... T>
+  std::vector<float> benchmarkAsync(void (*kernel)(int, T...),
+                                    int nStreams, int nLaunches, int blocks, int threads, T&... args);
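+
+  // benchmarkSync records one start/stop event pair around nLaunches back-to-back
+  // launches of a kernel on the default stream and returns the elapsed time in ms;
+  // benchmarkAsync creates one stream per chunk, performs the same nLaunches on each
+  // stream (forwarding the stream index as the chunk id) and returns one elapsed
+  // time per stream.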
+
+  // Main interface
+  void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters
+  void run();                          // Execute all specified callbacks
+  void globalFinalize();               // Cleanup
+  void printDevices();                 // Dump info
+
+  // Initializations/Finalizations of tests. Not to be measured, in principle used for report
+  void readInit();
+  void readFinalize();
+
+  void writeInit();
+  void writeFinalize();
+
+  void copyInit();
+  void copyFinalize();
+
+  // Kernel calling wrappers
+  void readSequential(SplitLevel sl);
+  void readConcurrent(SplitLevel sl, int nRegions = 2);
+
+  void writeSequential(SplitLevel sl);
+  void writeConcurrent(SplitLevel sl, int nRegions = 2);
+
+  void copySequential(SplitLevel sl);
+  void copyConcurrent(SplitLevel sl, int nRegions = 2);
+
+ private:
+  gpuState<chunk_type> mState;
+  std::shared_ptr<ResultWriter> mResultWriter;
+  benchmarkOpts mOptions;
+};
+
+} // namespace benchmark
+} // namespace o2
+#endif
\ No newline at end of file
diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h
new file mode 100644
index 0000000000000..6d3400aa9a6ec
--- /dev/null
+++ b/GPU/GPUbenchmark/Shared/Utils.h
@@ -0,0 +1,180 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file Utils.h
+/// \author: mconcas@cern.ch
+
+#ifndef GPU_BENCHMARK_UTILS_H
+#define GPU_BENCHMARK_UTILS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define KNRM "\x1B[0m"
+#define KRED "\x1B[31m"
+#define KGRN "\x1B[32m"
+#define KYEL "\x1B[33m"
+#define KBLU "\x1B[34m"
+#define KMAG "\x1B[35m"
+#define KCYN "\x1B[36m"
+#define KWHT "\x1B[37m"
+
+#define GB (1024 * 1024 * 1024)
+
+namespace o2
+{
+namespace benchmark
+{
+
+enum class SplitLevel {
+  Blocks,
+  Threads
+};
+
+struct benchmarkOpts {
+  benchmarkOpts() = default;
+
+  float chunkReservedGB = 1.f;
+  int nRegions = 2;
+  float freeMemoryFractionToAllocate = 0.95f;
+  int kernelLaunches = 1;
+  int nTests = 1;
+};
+
+template <class T>
+struct gpuState {
+  int getMaxChunks()
+  {
+    return (double)scratchSize / (chunkReservedGB * GB);
+  }
+
+  void computeScratchPtrs()
+  {
+    partAddrOnHost.resize(getMaxChunks());
+    for (size_t iBuffAddress{0}; iBuffAddress < getMaxChunks(); ++iBuffAddress) {
+      partAddrOnHost[iBuffAddress] = reinterpret_cast<T*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * iBuffAddress);
+    }
+  }
+
+  size_t getPartitionCapacity()
+  {
+    return static_cast<size_t>(GB * chunkReservedGB / sizeof(T));
+  }
+
+  std::vector<T*> getScratchPtrs()
+  {
+    return partAddrOnHost;
+  }
+
+  std::vector<std::vector<T>>& getHostBuffers()
+  {
+    return gpuBuffersHost;
+  }
+
+  int getNKernelLaunches() { return iterations; }
+
+  // Configuration
+  size_t nMaxThreadsPerDimension;
+  int iterations;
+
+  float chunkReservedGB; // Size of each partition (GB)
+
+  // General containers and state
+  T* scratchPtr;                              // Pointer to scratch buffer
+  size_t scratchSize;                         // Size of scratch area (B)
+  std::vector<T*> partAddrOnHost;             // Pointers to scratch partitions on host vector
+  std::vector<std::vector<T>> gpuBuffersHost; // Host-based vector-ized data
+  T* deviceReadResultsPtr;                    // Results of the read test (single variable) on GPU
+  std::vector<T> hostReadResultsVector;       // Results of the read test (single variable) on host
+  T* deviceWriteResultsPtr;                   // Results of the write test (single variable) on GPU
+  std::vector<T> hostWriteResultsVector;      // Results of the write test (single variable) on host
+  T* deviceCopyInputsPtr;                     // Inputs of the copy test (single variable) on GPU
+  std::vector<T> hostCopyInputsVector;        // Inputs of the copy test (single variable) on host
+
+  // Static info
+  size_t totalMemory;
+  size_t nMultiprocessors;
+  size_t nMaxThreadsPerBlock;
+};
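+
+// Example (illustrative numbers): with the default chunkReservedGB = 1 and a scratch
+// buffer of ~15.2 GB (95% of the free memory on a 16 GB card), getMaxChunks() gives
+// 15 chunks; for T = int each chunk holds getPartitionCapacity() = 1 GB / 4 B =
+// 268435456 elements, and chunk i starts at scratchPtr advanced by i * 1 GB.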
+
+// Interface class to stream results to root file
+class ResultWriter
+{
+ public:
+  explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root");
+  ~ResultWriter() = default;
+  void storeBenchmarkEntry(int chunk, float entry);
+  void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry);
+  void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks);
+  void snapshotBenchmark();
+  void saveToFile();
+
+ private:
+  std::vector<float> mBenchmarkResults;
+  std::vector<TTree*> mBenchmarkTrees;
+  TFile* mOutfile;
+};
+
+inline ResultWriter::ResultWriter(const std::string resultsTreeFilename)
+{
+  mOutfile = TFile::Open(resultsTreeFilename.data(), "recreate");
+}
+
+inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks)
+{
+  mBenchmarkTrees.emplace_back(new TTree((bName + "_" + type).data(), (bName + "_" + type).data()));
+  mBenchmarkResults.clear();
+  mBenchmarkResults.resize(nChunks);
+  mBenchmarkTrees.back()->Branch("elapsed", &mBenchmarkResults);
+}
+
+inline void ResultWriter::storeBenchmarkEntry(int chunk, float entry)
+{
+  mBenchmarkResults[chunk] = entry;
+}
+
+inline void ResultWriter::snapshotBenchmark()
+{
+  mBenchmarkTrees.back()->Fill();
+}
+
+inline void ResultWriter::saveToFile()
+{
+  mOutfile->cd();
+  for (auto t : mBenchmarkTrees) {
+    t->Write();
+  }
+  mOutfile->Close();
+}
+
+inline void ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry)
+{
+  // (*mTree)
+  //   << (benchmarkName + "_" + type + "_region_" + region).data()
+  //   << "elapsed=" << entry
+  //   << "\n";
+}
+
+} // namespace benchmark
+} // namespace o2
+
+#define failed(...)                       \
+  printf("%serror: ", KRED);              \
+  printf(__VA_ARGS__);                    \
+  printf("\n");                           \
+  printf("error: TEST FAILED\n%s", KNRM); \
+  exit(EXIT_FAILURE);
+#endif
\ No newline at end of file
diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx
new file mode 100644
index 0000000000000..7ee638594f9e3
--- /dev/null
+++ b/GPU/GPUbenchmark/benchmark.cxx
@@ -0,0 +1,78 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file benchmark.cxx
+/// \author mconcas@cern.ch
+/// \brief configuration widely inspired by / copied from SimConfig
+#include "Shared/Kernels.h"
+
+bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
+{
+  namespace bpo = boost::program_options;
+  bpo::variables_map vm;
+  bpo::options_description options("Benchmark options");
+  options.add_options()(
+    "help,h", "Print help message.")(
+    "chunkSize,c", bpo::value<float>()->default_value(1.f), "Size of scratch partitions (GB).")(
+    "regions,r", bpo::value<int>()->default_value(2), "Number of memory regions to partition RAM in.")(
+    "freeMemFraction,f", bpo::value<float>()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")(
+    "launches,l", bpo::value<int>()->default_value(10), "Number of iterations in reading kernels.")(
+    "ntests,n", bpo::value<int>()->default_value(1), "Number of times each test is run.");
+  try {
+    bpo::store(parse_command_line(argc, argv, options), vm);
+    if (vm.count("help")) {
+      std::cout << options << std::endl;
+      return false;
+    }
+
+    bpo::notify(vm);
+  } catch (const bpo::error& e) {
+    std::cerr << e.what() << "\n\n";
+    std::cerr << "Error parsing command line arguments. Available options:\n";
+
+    std::cerr << options << std::endl;
+    return false;
+  }
+
+  conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as<float>();
+  conf.chunkReservedGB = vm["chunkSize"].as<float>();
+  conf.nRegions = vm["regions"].as<int>();
+  conf.kernelLaunches = vm["launches"].as<int>();
+  conf.nTests = vm["ntests"].as<int>();
+
+  return true;
+}
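+
+// Example invocation (values are illustrative):
+//   gpu-memory-benchmark-cuda --chunkSize 0.5 --freeMemFraction 0.9 --ntests 3
+// splits the scratch buffer into 0.5 GB chunks, allocates 90% of the free GPU memory
+// and repeats every read/write/copy measurement three times.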
+
+using o2::benchmark::ResultWriter;
+
+int main(int argc, const char* argv[])
+{
+
+  o2::benchmark::benchmarkOpts opts;
+
+  if (!parseArgs(opts, argc, argv)) {
+    return -1;
+  }
+
+  std::shared_ptr<ResultWriter> writer = std::make_shared<ResultWriter>();
+
+  o2::benchmark::GPUbenchmark<char> bm_char{opts, writer};
+  bm_char.run();
+  o2::benchmark::GPUbenchmark<int> bm_int{opts, writer};
+  bm_int.run();
+  o2::benchmark::GPUbenchmark<size_t> bm_size_t{opts, writer};
+  bm_size_t.run();
+
+  // save results
+  writer.get()->saveToFile();
+
+  return 0;
+}
diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
new file mode 100644
index 0000000000000..8af91423c12e5
--- /dev/null
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -0,0 +1,845 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file Kernels.{cu, hip.cxx}
+/// \author: mconcas@cern.ch
+
+#include "../Shared/Kernels.h"
+#if defined(__HIPCC__)
+#include "hip/hip_runtime.h"
+#endif
+#include
+
+// Memory partitioning legend
+//
+// |----------------------region 0-----------------|----------------------region 1-----------------| regions -> default: 2, to test lower and upper RAM
+// |--chunk 0--|--chunk 1--|--chunk 2--|    ***    |--chunk n--|                                      chunks  -> default size: 1GB (single block pins)
+// |__________________________________________scratch______________________________________________| scratch -> default size: 95% free GPU RAM
+
+#define GPUCHECK(error)                                                                        \
+  if (error != cudaSuccess) {                                                                  \
+    printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \
+           __LINE__, KNRM);                                                                    \
+    failed("API returned error code.");                                                        \
+  }
+
+double bytesToKB(size_t s) { return (double)s / (1024.0); }
+double bytesToGB(size_t s) { return (double)s / GB; }
+
+int getCorrespondingRegionId(int Id, int nChunks, int nRegions = 1)
+{
+  return Id * nRegions / nChunks;
+}
+
+template <class T>
+std::string getType()
+{
+  if (typeid(T).name() == typeid(char).name()) {
+    return std::string{"char"};
+  }
+  if (typeid(T).name() == typeid(size_t).name()) {
+    return std::string{"unsigned_long"};
+  }
+  if (typeid(T).name() == typeid(int).name()) {
+    return std::string{"int"};
+  }
+  if (typeid(T).name() == typeid(int4).name()) {
+    return std::string{"int4"};
+  }
+  return std::string{"unknown"};
+}
+
+namespace o2
+{
+namespace benchmark
+{
+namespace gpu
+{
+
+///////////////////////////
+// Device functions go here
+template <class chunk_type>
+__host__ __device__ inline chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber)
+{
+  return reinterpret_cast<chunk_type*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * partNumber);
+}
+
+//////////////////
+// Kernels go here
+// Reading
+template <class chunk_type>
+__global__ void readChunkSBKernel(
+ int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + chunk_type sink{0}; + chunk_type* ptr = getPartPtrOnScratch(scratch, chunkReservedGB, chunkId); + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + sink += ptr[i]; + } + if (sink == static_cast(1)) { + results[chunkId] = sink; + } + } +} + +template +__global__ void readChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here + results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync + } + } +} + +// Writing +template +__global__ void writeChunkSBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } + } +} + +template +__global__ void writeChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } +} + +// Copying +template +__global__ void copyChunkSBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } + } +} + +template +__global__ void copyChunkMBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } +} + +} // namespace gpu + +void printDeviceProp(int deviceId) +{ + const int w1 = 34; + std::cout << std::left; + std::cout << std::setw(w1) + << "--------------------------------------------------------------------------------" + << std::endl; + std::cout << std::setw(w1) << "device#" << deviceId << std::endl; + + cudaDeviceProp props; + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + + std::cout << std::setw(w1) << "Name: " << props.name << std::endl; + std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl; + std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl; + std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl; + std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor + << std::endl; + std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl; + std::cout << std::setw(w1) << 
"clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl; + std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" + << std::endl; + std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl; + std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 + << " Mhz" << std::endl; + std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2) + << bytesToGB(props.totalGlobalMem) << " GB" << std::endl; +#if !defined(__CUDACC__) + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << std::endl; +#endif +#if defined(__HIPCC__) + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << std::endl; +#endif + std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl; + std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" + << std::endl; + std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl; + std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl; + std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl; + std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl; + std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl; + std::cout << std::setw(w1) << "major: " << props.major << std::endl; + std::cout << std::setw(w1) << "minor: " << props.minor << std::endl; + std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl; + std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl; + std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl; +#if defined(__HIPCC__) + std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl; + std::cout << std::setw(w1) << 
"arch.hasDoubles: " << props.arch.hasDoubles << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl; + std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl; + std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl; + std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl; + std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl; + std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl; + std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl; + std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl; +#endif + std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl; + std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl; +#if defined(__HIPCC__) + std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl; + std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl; +#endif + + int deviceCnt; + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + std::cout << std::setw(w1) << "peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); + if (isPeer) { + std::cout << "device#" << i << " "; + } + } + std::cout << std::endl; + std::cout << std::setw(w1) << "non-peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); + if (!isPeer) { + std::cout << "device#" << i << " "; + } + } + std::cout << std::endl; + + size_t free, total; + GPUCHECK(cudaMemGetInfo(&free, &total)); + + std::cout << std::fixed << std::setprecision(2); + std::cout << std::setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << std::endl; + std::cout << std::setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << std::setprecision(0) + << (float)free / total * 100.0 << "%)" << std::endl; +} + +template +template +float GPUbenchmark::benchmarkSync(void (*kernel)(T...), + int nLaunches, int blocks, int threads, T&... 
args) // run for each chunk (id is passed in variadic args) +{ + cudaEvent_t start, stop; + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + + GPUCHECK(cudaEventRecord(start)); + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(args...); + } + GPUCHECK(cudaEventRecord(stop)); // record checkpoint + + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions + float milliseconds{0.f}; + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + + return milliseconds; +} + +template +template +std::vector GPUbenchmark::benchmarkAsync(void (*kernel)(int, T...), + int nStreams, int nLaunches, int blocks, int threads, T&... args) +{ + std::vector starts(nStreams), stops(nStreams); + std::vector streams(nStreams); + std::vector results(nStreams); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { // one stream per chunk + GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); + GPUCHECK(cudaEventCreate(&(starts[iStream]))); + GPUCHECK(cudaEventCreate(&(stops[iStream]))); + } + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventRecord(starts[iStream], streams[iStream])); + + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream + (*kernel)<<>>(iStream, args...); + } + GPUCHECK(cudaEventRecord(stops[iStream], streams[iStream])); + } + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventSynchronize(stops[iStream])); + GPUCHECK(cudaEventElapsedTime(&(results.at(iStream)), starts[iStream], stops[iStream])); + } + + return results; +} + +template +void GPUbenchmark::printDevices() +{ + int deviceCnt; + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + + for (int i = 0; i < deviceCnt; i++) { + GPUCHECK(cudaSetDevice(i)); + printDeviceProp(i); + } +} + +template +void GPUbenchmark::globalInit(const int deviceId) +{ + cudaDeviceProp props; + size_t free; + + // Fetch and store features + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); + + mState.chunkReservedGB = mOptions.chunkReservedGB; + mState.iterations = mOptions.kernelLaunches; + mState.nMultiprocessors = props.multiProcessorCount; + mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; + mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; + mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); + std::cout << ">>> Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; + + // Allocate scratch on GPU + GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); + + mState.computeScratchPtrs(); + GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) + + std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl + << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) + << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" + << " ├ Number of scratch chunks: " << mState.getMaxChunks() << " of " << mOptions.chunkReservedGB << "GB each\n" + << " └ Each chunk can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl + << std::endl; +} + +/// Read +template +void GPUbenchmark::readInit() +{ + std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << 
std::endl; + mState.hostReadResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::readSequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_read_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("seq_read_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("conc_read_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::readChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case 
SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("conc_read_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::readChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::readFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostReadResultsVector.data(), mState.deviceReadResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceReadResultsPtr)); + std::cout << " └ done." << std::endl; +} + +/// Write +template +void GPUbenchmark::writeInit() +{ + std::cout << ">>> Initializing write benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostWriteResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceWriteResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::writeSequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_write_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("seq_write_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop 
over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("conc_write_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("conc_write_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostWriteResultsVector.data(), mState.deviceWriteResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceWriteResultsPtr)); + std::cout << " └ done." 
<< std::endl; +} + +/// Copy +template +void GPUbenchmark::copyInit() +{ + std::cout << ">>> Initializing copy benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostCopyInputsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceCopyInputsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); + GPUCHECK(cudaMemset(mState.deviceCopyInputsPtr, 1, mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::copySequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_copy_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("seq_copy_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("conc_copy_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by 
wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("conc_copy_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostCopyInputsVector.data(), mState.deviceCopyInputsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceCopyInputsPtr)); + std::cout << " └ done." << std::endl; +} + +template +void GPUbenchmark::globalFinalize() +{ + GPUCHECK(cudaFree(mState.scratchPtr)); +} + +template +void GPUbenchmark::run() +{ + globalInit(0); + + readInit(); + // Reading in whole memory + readSequential(SplitLevel::Blocks); + readSequential(SplitLevel::Threads); + + // Reading in memory regions + readConcurrent(SplitLevel::Blocks); + readConcurrent(SplitLevel::Threads); + readFinalize(); + + writeInit(); + // Write on whole memory + writeSequential(SplitLevel::Blocks); + writeSequential(SplitLevel::Threads); + + // Write on memory regions + writeConcurrent(SplitLevel::Blocks); + writeConcurrent(SplitLevel::Threads); + writeFinalize(); + + copyInit(); + // Copy from input buffer (size = nChunks) on whole memory + copySequential(SplitLevel::Blocks); + copySequential(SplitLevel::Threads); + + // Copy from input buffer (size = nChunks) on memory regions + copyConcurrent(SplitLevel::Blocks); + copyConcurrent(SplitLevel::Threads); + copyFinalize(); + + GPUbenchmark::globalFinalize(); +} + +template class GPUbenchmark; +template class GPUbenchmark; +template class GPUbenchmark; +// template class GPUbenchmark; + +} // namespace benchmark +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/.gitignore b/GPU/GPUbenchmark/hip/.gitignore new file mode 100644 index 0000000000000..14f27f00c53c2 --- /dev/null +++ b/GPU/GPUbenchmark/hip/.gitignore @@ -0,0 +1 @@ +*.hip.cxx \ No newline at end of file