From 244287b024ccf6b3d1a99a3390c7867b028d78ad Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 19 May 2021 23:34:32 +0200 Subject: [PATCH 01/42] Add CUDA backbone --- GPU/CMakeLists.txt | 1 + GPU/GPUBenchmark/CMakeLists.txt | 55 ++++++++++++++++++++++++ GPU/GPUBenchmark/GPUbenchmark.cu | 34 +++++++++++++++ GPU/GPUBenchmark/GPUbenchmark.h | 37 ++++++++++++++++ GPU/GPUBenchmark/macro/CMakeLists.txt | 4 ++ GPU/GPUBenchmark/macro/runGPUbenchmark.C | 9 ++++ 6 files changed, 140 insertions(+) create mode 100644 GPU/GPUBenchmark/CMakeLists.txt create mode 100644 GPU/GPUBenchmark/GPUbenchmark.cu create mode 100644 GPU/GPUBenchmark/GPUbenchmark.h create mode 100644 GPU/GPUBenchmark/macro/CMakeLists.txt create mode 100644 GPU/GPUBenchmark/macro/runGPUbenchmark.C diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index f8f1931f35547..74c3cbd6da6bc 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory(Common) add_subdirectory(Utils) add_subdirectory(TPCFastTransformation) add_subdirectory(GPUTracking) +add_subdirectory(GPUBenchmark) if(ALIGPU_BUILD_TYPE STREQUAL "O2") add_subdirectory(Workflow) endif() diff --git a/GPU/GPUBenchmark/CMakeLists.txt b/GPU/GPUBenchmark/CMakeLists.txt new file mode 100644 index 0000000000000..1274cf00407e0 --- /dev/null +++ b/GPU/GPUBenchmark/CMakeLists.txt @@ -0,0 +1,55 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +add_subdirectory(macro) +set(HDRS_INSTALL GPUbenchmark.h) + +o2_add_library(GPUbenchmarkCUDA + SOURCES GPUbenchmark.cu + PUBLIC_INCLUDE_DIRECTORIES . + PUBLIC_LINK_LIBRARIES O2::GPUCommon + TARGETVARNAME targetName) + + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) + +target_compile_definitions( + ${targetName} PRIVATE $) + +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) + +# set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) +# set(CMAKE_CXX_EXTENSIONS OFF) +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") +# # Hipify-perl +# set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") +# set(HIP_KERNEL "GPUbenchmark.hip.cxx") +# set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) +# set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/${HIP_KERNEL}") + +# if(EXISTS ${HIPIFY_EXECUTABLE}) +# # generate on-the-fly the HIP kernel +# execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") +# o2_add_library(GPUbenchmarkHIP +# SOURCES GPUbenchmark.hip.cxx +# PUBLIC_INCLUDE_DIRECTORIES . 
+# PUBLIC_LINK_LIBRARIES O2::GPUCommon +# hip::host +# hip::device +# TARGETVARNAME targetName) +# target_compile_definitions( +# ${targetName} PRIVATE $) + +# if(HIP_AMDGPUTARGET) +# # Need to add gpu target also to link flags due to gpu-rdc option +# target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) +# endif() +# elseif() +# message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") +# endif() \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.cu b/GPU/GPUBenchmark/GPUbenchmark.cu new file mode 100644 index 0000000000000..99c49558630f4 --- /dev/null +++ b/GPU/GPUBenchmark/GPUbenchmark.cu @@ -0,0 +1,34 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +/// +/// \file GPUbenchmark.cu +/// \author: mconcas@cern.ch + +#include +#include + +namespace o2 +{ +namespace benchmark +{ +namespace gpu +{ +GPUg() void helloKernel() +{ + printf("Hello World from GPU!!\n"); +} +} // namespace gpu + +void hello_util() +{ + gpu::helloKernel<<<1, 1>>>(); +} +} // namespace benchmark +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.h b/GPU/GPUBenchmark/GPUbenchmark.h new file mode 100644 index 0000000000000..fca4a05144c13 --- /dev/null +++ b/GPU/GPUBenchmark/GPUbenchmark.h @@ -0,0 +1,37 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file GPUbenchmark.h +/// \author: mconcas@cern.ch + +#include "GPUCommonDef.h" + +namespace o2 +{ +namespace benchmark +{ +void hello_util(); + +class GPUbenchmark final +{ + public: + GPUbenchmark() = default; + virtual ~GPUbenchmark() = default; + + void hello(); +}; + +// Steers +void GPUbenchmark::hello() +{ + hello_util(); +} +} // namespace benchmark +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUBenchmark/macro/CMakeLists.txt b/GPU/GPUBenchmark/macro/CMakeLists.txt new file mode 100644 index 0000000000000..dd74a1e43db7e --- /dev/null +++ b/GPU/GPUBenchmark/macro/CMakeLists.txt @@ -0,0 +1,4 @@ +o2_add_test_root_macro(runGPUbenchmark.C + PUBLIC_LINK_LIBRARIES O2::GPUbenchmarkCUDA + O2::GPUCommon + LABELS gpu COMPILE_ONLY) \ No newline at end of file diff --git a/GPU/GPUBenchmark/macro/runGPUbenchmark.C b/GPU/GPUBenchmark/macro/runGPUbenchmark.C new file mode 100644 index 0000000000000..b7e6c4138d260 --- /dev/null +++ b/GPU/GPUBenchmark/macro/runGPUbenchmark.C @@ -0,0 +1,9 @@ +#if !defined(__CLING__) || defined(__ROOTCLING__) +#include +#endif + +void runGPUbenchmark() +{ + o2::benchmark::GPUbenchmark bm{}; + bm.hello(); +} \ No newline at end of file From 4071b1156b705f22467d31f7b6fe856a2ff5fec1 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 20 May 2021 00:03:25 +0200 Subject: [PATCH 02/42] HIP breaks --- GPU/GPUBenchmark/CMakeLists.txt | 70 +++++++++++++-------------- GPU/GPUBenchmark/GPUbenchmark.hip.cxx | 35 ++++++++++++++ 2 files changed, 68 insertions(+), 37 deletions(-) create mode 100644 GPU/GPUBenchmark/GPUbenchmark.hip.cxx diff --git a/GPU/GPUBenchmark/CMakeLists.txt b/GPU/GPUBenchmark/CMakeLists.txt index 1274cf00407e0..75ca30fcf97f8 100644 --- a/GPU/GPUBenchmark/CMakeLists.txt +++ b/GPU/GPUBenchmark/CMakeLists.txt @@ -11,45 +11,41 @@ add_subdirectory(macro) set(HDRS_INSTALL GPUbenchmark.h) -o2_add_library(GPUbenchmarkCUDA - SOURCES GPUbenchmark.cu - PUBLIC_INCLUDE_DIRECTORIES . - PUBLIC_LINK_LIBRARIES O2::GPUCommon - TARGETVARNAME targetName) - - set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) +# o2_add_library(GPUbenchmarkCUDA +# SOURCES GPUbenchmark.cu +# PUBLIC_INCLUDE_DIRECTORIES . +# PUBLIC_LINK_LIBRARIES O2::GPUCommon +# TARGETVARNAME targetName) -target_compile_definitions( - ${targetName} PRIVATE $) +# set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) -# set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) -# set(CMAKE_CXX_EXTENSIONS OFF) -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") -# # Hipify-perl -# set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") -# set(HIP_KERNEL "GPUbenchmark.hip.cxx") -# set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) -# set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/${HIP_KERNEL}") +set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") +# Hipify-perl +set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") +set(HIP_KERNEL "GPUbenchmark.hip.cxx") +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) +set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") -# if(EXISTS ${HIPIFY_EXECUTABLE}) -# # generate on-the-fly the HIP kernel -# execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") -# o2_add_library(GPUbenchmarkHIP -# SOURCES GPUbenchmark.hip.cxx -# PUBLIC_INCLUDE_DIRECTORIES . 
-# PUBLIC_LINK_LIBRARIES O2::GPUCommon -# hip::host -# hip::device -# TARGETVARNAME targetName) -# target_compile_definitions( -# ${targetName} PRIVATE $) - -# if(HIP_AMDGPUTARGET) -# # Need to add gpu target also to link flags due to gpu-rdc option -# target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) -# endif() -# elseif() -# message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -# endif() \ No newline at end of file +if(EXISTS ${HIPIFY_EXECUTABLE}) +# generate on-the-fly the HIP kernel +execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") +o2_add_library(GPUbenchmarkHIP + SOURCES GPUbenchmark.hip.cxx + PUBLIC_INCLUDE_DIRECTORIES . + PUBLIC_LINK_LIBRARIES O2::GPUCommon + hip::host + hip::device + TARGETVARNAME targetName) + + if(HIP_AMDGPUTARGET) + message(FATAL_ERROR ${HIP_AMDGPUTARGET}) + # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) + endif() +elseif() + message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") +endif() \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.hip.cxx b/GPU/GPUBenchmark/GPUbenchmark.hip.cxx new file mode 100644 index 0000000000000..786bb72d5ae4e --- /dev/null +++ b/GPU/GPUBenchmark/GPUbenchmark.hip.cxx @@ -0,0 +1,35 @@ +#include "hip/hip_runtime.h" +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file GPUbenchmark.cu +/// \author: mconcas@cern.ch + +#include +#include + +namespace o2 +{ +namespace benchmark +{ +namespace gpu +{ +GPUg() void helloKernel() +{ + printf("Hello World from GPU!!\n"); +} +} // namespace gpu + +void hello_util() +{ + hipLaunchKernelGGL(gpu::helloKernel, dim3(1), dim3(1), 0, 0); +} +} // namespace benchmark +} // namespace o2 \ No newline at end of file From 04ca94bfcf5536688548144077a9cd5a10219df7 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 20 May 2021 22:52:57 +0200 Subject: [PATCH 03/42] Make two separate libraries --- GPU/CMakeLists.txt | 2 +- GPU/GPUBenchmark/GPUbenchmark.hip.cxx | 35 ------------------- GPU/GPUbenchmark/CMakeLists.txt | 13 +++++++ .../GPUbenchmark.h | 0 GPU/GPUbenchmark/cuda/CMakeLists.txt | 21 +++++++++++ .../cuda}/GPUbenchmark.cu | 0 .../hip}/CMakeLists.txt | 17 ++------- GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx | 0 .../macro/CMakeLists.txt | 0 .../macro/runGPUbenchmark.C | 0 10 files changed, 38 insertions(+), 50 deletions(-) delete mode 100644 GPU/GPUBenchmark/GPUbenchmark.hip.cxx create mode 100644 GPU/GPUbenchmark/CMakeLists.txt rename GPU/{GPUBenchmark => GPUbenchmark}/GPUbenchmark.h (100%) create mode 100644 GPU/GPUbenchmark/cuda/CMakeLists.txt rename GPU/{GPUBenchmark => GPUbenchmark/cuda}/GPUbenchmark.cu (100%) rename GPU/{GPUBenchmark => GPUbenchmark/hip}/CMakeLists.txt (74%) create mode 100644 GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx rename GPU/{GPUBenchmark => GPUbenchmark}/macro/CMakeLists.txt (100%) rename GPU/{GPUBenchmark => GPUbenchmark}/macro/runGPUbenchmark.C (100%) diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 74c3cbd6da6bc..7019f951b25fb 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -22,7 +22,7 @@ add_subdirectory(Common) add_subdirectory(Utils) add_subdirectory(TPCFastTransformation) add_subdirectory(GPUTracking) -add_subdirectory(GPUBenchmark) +add_subdirectory(GPUbenchmark) if(ALIGPU_BUILD_TYPE STREQUAL "O2") add_subdirectory(Workflow) endif() diff --git a/GPU/GPUBenchmark/GPUbenchmark.hip.cxx b/GPU/GPUBenchmark/GPUbenchmark.hip.cxx deleted file mode 100644 index 786bb72d5ae4e..0000000000000 --- a/GPU/GPUBenchmark/GPUbenchmark.hip.cxx +++ /dev/null @@ -1,35 +0,0 @@ -#include "hip/hip_runtime.h" -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". -// -// See http://alice-o2.web.cern.ch/license for full licensing information. -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file GPUbenchmark.cu -/// \author: mconcas@cern.ch - -#include -#include - -namespace o2 -{ -namespace benchmark -{ -namespace gpu -{ -GPUg() void helloKernel() -{ - printf("Hello World from GPU!!\n"); -} -} // namespace gpu - -void hello_util() -{ - hipLaunchKernelGGL(gpu::helloKernel, dim3(1), dim3(1), 0, 0); -} -} // namespace benchmark -} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt new file mode 100644 index 0000000000000..b41feca915d3d --- /dev/null +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -0,0 +1,13 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". 
+# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +add_subdirectory(macro) +add_subdirectory(cuda) +add_subdirectory(hip) \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.h b/GPU/GPUbenchmark/GPUbenchmark.h similarity index 100% rename from GPU/GPUBenchmark/GPUbenchmark.h rename to GPU/GPUbenchmark/GPUbenchmark.h diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..2eab170cde6a5 --- /dev/null +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +set(HDRS_INSTALL ../GPUbenchmark.h) + +o2_add_library(GPUbenchmarkCUDA + SOURCES GPUbenchmark.cu + PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_LINK_LIBRARIES O2::GPUCommon + TARGETVARNAME targetName) + + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) + +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.cu b/GPU/GPUbenchmark/cuda/GPUbenchmark.cu similarity index 100% rename from GPU/GPUBenchmark/GPUbenchmark.cu rename to GPU/GPUbenchmark/cuda/GPUbenchmark.cu diff --git a/GPU/GPUBenchmark/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt similarity index 74% rename from GPU/GPUBenchmark/CMakeLists.txt rename to GPU/GPUbenchmark/hip/CMakeLists.txt index 75ca30fcf97f8..f90267d09b4e9 100644 --- a/GPU/GPUBenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -8,18 +8,7 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -add_subdirectory(macro) -set(HDRS_INSTALL GPUbenchmark.h) - -# o2_add_library(GPUbenchmarkCUDA -# SOURCES GPUbenchmark.cu -# PUBLIC_INCLUDE_DIRECTORIES . -# PUBLIC_LINK_LIBRARIES O2::GPUCommon -# TARGETVARNAME targetName) - -# set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) - -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) +set(HDRS_INSTALL ../GPUbenchmark.h) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) @@ -27,7 +16,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") set(HIP_KERNEL "GPUbenchmark.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/GPUbenchmark.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) @@ -35,7 +24,7 @@ if(EXISTS ${HIPIFY_EXECUTABLE}) execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") o2_add_library(GPUbenchmarkHIP SOURCES GPUbenchmark.hip.cxx - PUBLIC_INCLUDE_DIRECTORIES . 
+ PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host hip::device diff --git a/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx b/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/GPU/GPUBenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt similarity index 100% rename from GPU/GPUBenchmark/macro/CMakeLists.txt rename to GPU/GPUbenchmark/macro/CMakeLists.txt diff --git a/GPU/GPUBenchmark/macro/runGPUbenchmark.C b/GPU/GPUbenchmark/macro/runGPUbenchmark.C similarity index 100% rename from GPU/GPUBenchmark/macro/runGPUbenchmark.C rename to GPU/GPUbenchmark/macro/runGPUbenchmark.C From aae7f42a2b56718a23319142d2f5bda0a744bdb6 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sun, 23 May 2021 21:42:12 +0200 Subject: [PATCH 04/42] Re-arrange directories --- GPU/GPUbenchmark/CMakeLists.txt | 3 +-- GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx | 15 +++++++++++++++ GPU/GPUbenchmark/Steer/BenchmarkSteer.h | 17 +++++++++++++++++ GPU/GPUbenchmark/Steer/CMakeLists.txt | 17 +++++++++++++++++ GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 12 ++++++++++++ .../{GPUbenchmark.h => Steer/Kernels/Kernels.h} | 6 ++++-- .../{ => Steer/Kernels}/cuda/CMakeLists.txt | 8 ++++---- .../Kernels/cuda/Kernels.cu} | 6 +++--- GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore | 1 + .../{ => Steer/Kernels}/hip/CMakeLists.txt | 12 +++++++----- GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx | 0 GPU/GPUbenchmark/macro/CMakeLists.txt | 2 +- GPU/GPUbenchmark/macro/runGPUbenchmark.C | 2 +- 13 files changed, 83 insertions(+), 18 deletions(-) create mode 100644 GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx create mode 100644 GPU/GPUbenchmark/Steer/BenchmarkSteer.h create mode 100644 GPU/GPUbenchmark/Steer/CMakeLists.txt create mode 100644 GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt rename GPU/GPUbenchmark/{GPUbenchmark.h => Steer/Kernels/Kernels.h} (92%) rename GPU/GPUbenchmark/{ => Steer/Kernels}/cuda/CMakeLists.txt (87%) rename GPU/GPUbenchmark/{cuda/GPUbenchmark.cu => Steer/Kernels/cuda/Kernels.cu} (88%) create mode 100644 GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore rename GPU/GPUbenchmark/{ => Steer/Kernels}/hip/CMakeLists.txt (85%) delete mode 100644 GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index b41feca915d3d..28a562412c927 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -9,5 +9,4 @@ # submit itself to any jurisdiction. add_subdirectory(macro) -add_subdirectory(cuda) -add_subdirectory(hip) \ No newline at end of file +add_subdirectory(Steer) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx b/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx new file mode 100644 index 0000000000000..fd0fc5db989cf --- /dev/null +++ b/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx @@ -0,0 +1,15 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file BenchmarkSteer.cxx +/// \author: mconcas@cern.ch + +#include + diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.h b/GPU/GPUbenchmark/Steer/BenchmarkSteer.h new file mode 100644 index 0000000000000..c246b649d834a --- /dev/null +++ b/GPU/GPUbenchmark/Steer/BenchmarkSteer.h @@ -0,0 +1,17 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +/// +/// \file BenchmarkSteer.h +/// \author: mconcas@cern.ch + +#ifndef BENCHAMARKSTEER_H +#define BENCHAMARKSTEER_H + +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt new file mode 100644 index 0000000000000..ab0ee700120e1 --- /dev/null +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +add_subdirectory(Kernels) + +o2_add_library(GPUBenchmark + SOURCES BenchmarkSteer.cxx + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + O2::CUDAbenchmark + TARGETVARNAME targetName) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt new file mode 100644 index 0000000000000..8f53feffba52f --- /dev/null +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -0,0 +1,12 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. 
+ +add_subdirectory(cuda) +add_subdirectory(hip) \ No newline at end of file diff --git a/GPU/GPUbenchmark/GPUbenchmark.h b/GPU/GPUbenchmark/Steer/Kernels/Kernels.h similarity index 92% rename from GPU/GPUbenchmark/GPUbenchmark.h rename to GPU/GPUbenchmark/Steer/Kernels/Kernels.h index fca4a05144c13..0390c93e33040 100644 --- a/GPU/GPUbenchmark/GPUbenchmark.h +++ b/GPU/GPUbenchmark/Steer/Kernels/Kernels.h @@ -10,6 +10,8 @@ /// /// \file GPUbenchmark.h /// \author: mconcas@cern.ch +#ifndef GPUBENCHMARK_H +#define GPUBENCHMARK_H #include "GPUCommonDef.h" @@ -24,7 +26,6 @@ class GPUbenchmark final public: GPUbenchmark() = default; virtual ~GPUbenchmark() = default; - void hello(); }; @@ -34,4 +35,5 @@ void GPUbenchmark::hello() hello_util(); } } // namespace benchmark -} // namespace o2 \ No newline at end of file +} // namespace o2 +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt similarity index 87% rename from GPU/GPUbenchmark/cuda/CMakeLists.txt rename to GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt index 2eab170cde6a5..f9866db53131f 100644 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt @@ -8,14 +8,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -set(HDRS_INSTALL ../GPUbenchmark.h) +set(HDRS_INSTALL ../Kernels.h) -o2_add_library(GPUbenchmarkCUDA - SOURCES GPUbenchmark.cu +o2_add_library(CUDAbenchmark + SOURCES Kernels.cu PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) \ No newline at end of file +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/GPUbenchmark.cu b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu similarity index 88% rename from GPU/GPUbenchmark/cuda/GPUbenchmark.cu rename to GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu index 99c49558630f4..b023eafcadfe2 100644 --- a/GPU/GPUbenchmark/cuda/GPUbenchmark.cu +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu @@ -8,10 +8,10 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. /// -/// \file GPUbenchmark.cu +/// \file Kernels.cu /// \author: mconcas@cern.ch -#include +#include #include namespace o2 @@ -22,7 +22,7 @@ namespace gpu { GPUg() void helloKernel() { - printf("Hello World from GPU!!\n"); + printf("Hello World from GPU!\n"); } } // namespace gpu diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore b/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore new file mode 100644 index 0000000000000..14f27f00c53c2 --- /dev/null +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore @@ -0,0 +1 @@ +*.hip.cxx \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt similarity index 85% rename from GPU/GPUbenchmark/hip/CMakeLists.txt rename to GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index f90267d09b4e9..a2aeae011e7aa 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -8,22 +8,24 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
-set(HDRS_INSTALL ../GPUbenchmark.h) +set(HDRS_INSTALL ../Kernels.h) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") + # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") -set(HIP_KERNEL "GPUbenchmark.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/GPUbenchmark.cu) +set(HIP_KERNEL "Kernels.hip.cxx") +message(FATAL_ERROR "${CMAKE_CURRENT_SOURCE_DIR}") +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) # generate on-the-fly the HIP kernel execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") -o2_add_library(GPUbenchmarkHIP - SOURCES GPUbenchmark.hip.cxx +o2_add_library(HIPbenchmark + SOURCES Kernels.hip.cxx PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host diff --git a/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx b/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/GPU/GPUbenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt index dd74a1e43db7e..ff210986e7205 100644 --- a/GPU/GPUbenchmark/macro/CMakeLists.txt +++ b/GPU/GPUbenchmark/macro/CMakeLists.txt @@ -1,4 +1,4 @@ o2_add_test_root_macro(runGPUbenchmark.C - PUBLIC_LINK_LIBRARIES O2::GPUbenchmarkCUDA + PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark O2::GPUCommon LABELS gpu COMPILE_ONLY) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/runGPUbenchmark.C b/GPU/GPUbenchmark/macro/runGPUbenchmark.C index b7e6c4138d260..0d3bf53899984 100644 --- a/GPU/GPUbenchmark/macro/runGPUbenchmark.C +++ b/GPU/GPUbenchmark/macro/runGPUbenchmark.C @@ -1,5 +1,5 @@ #if !defined(__CLING__) || defined(__ROOTCLING__) -#include +#include #endif void runGPUbenchmark() From 3cc969580d3384c60979db200a34cc18faa402fb Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sun, 23 May 2021 21:44:22 +0200 Subject: [PATCH 05/42] Add missing header --- GPU/GPUbenchmark/Steer/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt index ab0ee700120e1..83841550410b4 100644 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(Kernels) o2_add_library(GPUBenchmark SOURCES BenchmarkSteer.cxx + PUBLIC_INCLUDE_DIRECTORIES . PUBLIC_LINK_LIBRARIES O2::HIPbenchmark O2::CUDAbenchmark TARGETVARNAME targetName) \ No newline at end of file From 1f317913e6f31efd9d8a549005f6de719e14ea17 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sun, 23 May 2021 22:17:21 +0200 Subject: [PATCH 06/42] Meta library does not compile --- GPU/GPUbenchmark/Steer/CMakeLists.txt | 13 ++++++------- GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt | 3 +-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt index 83841550410b4..e0afe118e576f 100644 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -8,11 +8,10 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -add_subdirectory(Kernels) +# o2_add_library(GPUBenchmark +# SOURCES BenchmarkSteer.cxx +# PUBLIC_INCLUDE_DIRECTORIES . 
+# PUBLIC_LINK_LIBRARIES O2::HIPbenchmark +# O2::CUDAbenchmark) -o2_add_library(GPUBenchmark - SOURCES BenchmarkSteer.cxx - PUBLIC_INCLUDE_DIRECTORIES . - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - O2::CUDAbenchmark - TARGETVARNAME targetName) \ No newline at end of file +add_subdirectory(Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index a2aeae011e7aa..52eef2dd8b420 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -17,7 +17,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") set(HIP_KERNEL "Kernels.hip.cxx") -message(FATAL_ERROR "${CMAKE_CURRENT_SOURCE_DIR}") + set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") @@ -33,7 +33,6 @@ o2_add_library(HIPbenchmark TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) - message(FATAL_ERROR ${HIP_AMDGPUTARGET}) # Need to add gpu target also to link flags due to gpu-rdc option target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) endif() From 175be3f455d082d8f6d94ec951044bc45bbdc959 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 24 May 2021 18:50:29 +0200 Subject: [PATCH 07/42] Port hipInfo example to test gpu specs --- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 12 +- .../Steer/Kernels/cuda/Kernels.cu | 175 +++++++++++++++++- .../Steer/Kernels/hip/CMakeLists.txt | 13 +- GPU/GPUbenchmark/macro/CMakeLists.txt | 2 +- 4 files changed, 193 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt index 8f53feffba52f..43e9caa230d88 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -8,5 +8,13 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -add_subdirectory(cuda) -add_subdirectory(hip) \ No newline at end of file +# if(CUDA_ENABLED) + # message("Building CUDA benchmark library") + # add_subdirectory(cuda) + # target_compile_definitions(${targetName} PRIVATE CUDA_ENABLED) +# endif() +if(HIP_ENABLED) + message("Building HIP benchmark library") + add_subdirectory(hip) + # target_compile_definitions(${targetName} PRIVATE HIP_ENABLED) +endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu index b023eafcadfe2..ae8651f916f7a 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu @@ -12,7 +12,48 @@ /// \author: mconcas@cern.ch #include -#include +#include +#include + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define failed(...) 
\ + printf("%serror: ", KRED); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + printf("error: TEST FAILED\n%s", KNRM); \ + exit(EXIT_FAILURE); + +#define GPUCHECK(error) \ + if (error != cudaSuccess) { \ + printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ + __LINE__, KNRM); \ + failed("API returned error code."); \ + } + +void printCompilerInfo() +{ +#ifdef __NVCC__ + printf("compiler: nvcc\n"); +#endif +} + +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } + +#define printLimit(w1, limit, units) \ + { \ + size_t val; \ + cudaDeviceGetLimit(&val, limit); \ + std::cout << setw(w1) << #limit ": " << val << " " << units << std::endl; \ + } namespace o2 { @@ -25,10 +66,140 @@ GPUg() void helloKernel() printf("Hello World from GPU!\n"); } } // namespace gpu +void printDeviceProp(int deviceId) +{ + using namespace std; + const int w1 = 34; + cout << left; + cout << setw(w1) + << "--------------------------------------------------------------------------------" + << endl; + cout << setw(w1) << "device#" << deviceId << endl; + + cudaDeviceProp props; + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + + cout << setw(w1) << "Name: " << props.name << endl; + cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; + cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; + cout << setw(w1) << "pciDomainID: " << props.pciDomainID << endl; + cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor + << endl; + cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; + cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" + << endl; + cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; + cout << setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 + << " Mhz" << endl; + cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) + << bytesToGB(props.totalGlobalMem) << " GB" << endl; +#if !defined(__CUDACC__) + cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) + << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << endl; +#endif +#if defined(__HIPCC__) + cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl; +#endif + cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; + cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" + << endl; + cout << setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << endl; + cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize: " << props.warpSize << endl; + cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl; + cout << setw(w1) << "computeMode: " << props.computeMode << endl; + cout << setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << endl; + cout << setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << endl; + cout 
<< setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << endl; + cout << setw(w1) << "major: " << props.major << endl; + cout << setw(w1) << "minor: " << props.minor << endl; + cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl; + cout << setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << endl; + cout << setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << endl; +#if defined(__HIPCC__) + cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl; + cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch + << endl; + cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl; + cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch + << endl; + cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl; + cout << setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl; + cout << setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl; + cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl; + cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl; + cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl; + cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl; + cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl; + cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl; + cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; + cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; + cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; + cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; + cout << setw(w1) << "gcnArchName: " << props.gcnArchName << endl; +#endif + cout << setw(w1) << "isIntegrated: " << props.integrated << endl; + cout << setw(w1) << "maxTexture1D: " << props.maxTexture1D << endl; + cout << setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << endl; + cout << setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << endl; + cout << setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << endl; + cout << setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << endl; + cout << setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << endl; +#if defined(__HIPCC__) + cout << setw(w1) << "isLargeBar: " << props.isLargeBar << endl; + cout << setw(w1) << "asicRevision: " << props.asicRevision << endl; +#endif + + int deviceCnt; + cudaGetDeviceCount(&deviceCnt); + cout << setw(w1) << "peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + cudaDeviceCanAccessPeer(&isPeer, i, deviceId); + if (isPeer) { + cout << "device#" << i << " "; + } + } + cout << endl; + cout << setw(w1) << "non-peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + cudaDeviceCanAccessPeer(&isPeer, i, deviceId); + if (!isPeer) { + cout << "device#" << i << " "; + } + } + cout << endl; + + size_t free, total; + cudaMemGetInfo(&free, &total); + + cout << fixed << setprecision(2); + cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl; + cout << setw(w1) << "memInfo.free: " << 
bytesToGB(free) << " GB (" << setprecision(0) + << (float)free / total * 100.0 << "%)" << endl; +} void hello_util() { - gpu::helloKernel<<<1, 1>>>(); + int deviceCnt; + + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + + for (int i = 0; i < deviceCnt; i++) { + cudaSetDevice(i); + printDeviceProp(i); + } + + // gpu::helloKernel<<<1, 1>>>(); + // displayCard(); } } // namespace benchmark } // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index 52eef2dd8b420..a40cc6a77204a 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -18,12 +18,14 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") set(HIP_KERNEL "Kernels.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) -# generate on-the-fly the HIP kernel -execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") + +# Generate on-the-fly the HIP kernel +message("Generating HIP kernel code on the fly...") +execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx PUBLIC_INCLUDE_DIRECTORIES ../ @@ -34,8 +36,11 @@ o2_add_library(HIPbenchmark if(HIP_AMDGPUTARGET) # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) endif() elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -endif() \ No newline at end of file +endif() + +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt index ff210986e7205..556948e819a65 100644 --- a/GPU/GPUbenchmark/macro/CMakeLists.txt +++ b/GPU/GPUbenchmark/macro/CMakeLists.txt @@ -1,4 +1,4 @@ o2_add_test_root_macro(runGPUbenchmark.C - PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark O2::GPUCommon LABELS gpu COMPILE_ONLY) \ No newline at end of file From ed9ade90d8f903c95a3ad2b3ae9385bbb65c2702 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 25 May 2021 16:35:56 +0200 Subject: [PATCH 08/42] Fix compilation to test build on EPN --- GPU/GPUbenchmark/Steer/CMakeLists.txt | 6 +++--- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt index e0afe118e576f..aeca995eb458f 100644 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -8,10 +8,10 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. +add_subdirectory(Kernels) + # o2_add_library(GPUBenchmark # SOURCES BenchmarkSteer.cxx # PUBLIC_INCLUDE_DIRECTORIES . 
# PUBLIC_LINK_LIBRARIES O2::HIPbenchmark -# O2::CUDAbenchmark) - -add_subdirectory(Kernels) \ No newline at end of file +# O2::CUDAbenchmark) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt index 43e9caa230d88..3626924ac5cc2 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -8,11 +8,11 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -# if(CUDA_ENABLED) - # message("Building CUDA benchmark library") - # add_subdirectory(cuda) +if(CUDA_ENABLED) + message("Building CUDA benchmark library") + add_subdirectory(cuda) # target_compile_definitions(${targetName} PRIVATE CUDA_ENABLED) -# endif() +endif() if(HIP_ENABLED) message("Building HIP benchmark library") add_subdirectory(hip) From d74f0b437412082235195c7972688bfa0f795fb4 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 27 May 2021 11:41:08 +0200 Subject: [PATCH 09/42] Produce two separate executables --- GPU/GPUbenchmark/Steer/BenchmarkSteer.h | 17 -------------- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 17 ++++++++++++-- .../benchmark.cxx} | 12 ++++++++-- .../Steer/Kernels/cuda/Kernels.cu | 22 ++++++++++++++++++- .../Steer/Kernels/hip/CMakeLists.txt | 8 +++---- 5 files changed, 50 insertions(+), 26 deletions(-) delete mode 100644 GPU/GPUbenchmark/Steer/BenchmarkSteer.h rename GPU/GPUbenchmark/Steer/{BenchmarkSteer.cxx => Kernels/benchmark.cxx} (73%) diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.h b/GPU/GPUbenchmark/Steer/BenchmarkSteer.h deleted file mode 100644 index c246b649d834a..0000000000000 --- a/GPU/GPUbenchmark/Steer/BenchmarkSteer.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". -// -// See http://alice-o2.web.cern.ch/license for full licensing information. -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
-/// -/// \file BenchmarkSteer.h -/// \author: mconcas@cern.ch - -#ifndef BENCHAMARKSTEER_H -#define BENCHAMARKSTEER_H - -#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt index 3626924ac5cc2..911d567cda350 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -11,10 +11,23 @@ if(CUDA_ENABLED) message("Building CUDA benchmark library") add_subdirectory(cuda) - # target_compile_definitions(${targetName} PRIVATE CUDA_ENABLED) + o2_add_executable(memory-benchmark-cuda + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + TARGETVARNAME targetName) endif() + if(HIP_ENABLED) message("Building HIP benchmark library") add_subdirectory(hip) - # target_compile_definitions(${targetName} PRIVATE HIP_ENABLED) + set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") + + o2_add_executable(memory-benchmark-hip + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + hip::host + hip::device + TARGETVARNAME targetName) endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx b/GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx similarity index 73% rename from GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx rename to GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx index fd0fc5db989cf..19cd67d354942 100644 --- a/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx +++ b/GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx @@ -8,8 +8,16 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. /// -/// \file BenchmarkSteer.cxx +/// \file benchmark.cxx /// \author: mconcas@cern.ch -#include +#include +#include +int main() +{ + std::cout << "HELLO WORLD" << std::endl; + o2::benchmark::GPUbenchmark bm{}; + bm.hello(); + return 0; +} diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu index ae8651f916f7a..d9aa3ac88359c 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu @@ -65,6 +65,7 @@ GPUg() void helloKernel() { printf("Hello World from GPU!\n"); } + } // namespace gpu void printDeviceProp(int deviceId) { @@ -202,4 +203,23 @@ void hello_util() // displayCard(); } } // namespace benchmark -} // namespace o2 \ No newline at end of file +} // namespace o2 + +/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. +I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. + +Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. +For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. +We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. +We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. + +For the tests I want to run in the segments, I think these should be: +- Linear read in a multithreaded way: i.e. 
the standard GPU for loop: +for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; +In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. +- Then I'd do the same with some stride, i.e.: +foo += array[i * stride]; +- I'd try a random access with some simple linear congruence RNG per thread to determine the address. +- Then I'd do the same with writing memory, and with copying memory. +- Finally the data type should be flexible, going from char to uint4. +That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index a40cc6a77204a..93192c2c5a3d0 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -34,11 +34,11 @@ o2_add_library(HIPbenchmark hip::device TARGETVARNAME targetName) - if(HIP_AMDGPUTARGET) - # Need to add gpu target also to link flags due to gpu-rdc option + # if(HIP_AMDGPUTARGET) + # # Need to add gpu target also to link flags due to gpu-rdc option - target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) - endif() + # target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) + # endif() elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() From 9454b545ee5893e45afe84a7eafd66df6b46ce3f Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 27 May 2021 14:47:12 +0200 Subject: [PATCH 10/42] Flatten dir tree a bit --- GPU/GPUbenchmark/CMakeLists.txt | 25 ++++++++++++-- .../{Steer/Kernels => Shared}/Kernels.h | 0 GPU/GPUbenchmark/Steer/CMakeLists.txt | 17 ---------- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 33 ------------------- .../{Steer/Kernels => }/benchmark.cxx | 0 .../{Steer/Kernels => }/cuda/CMakeLists.txt | 6 ++-- .../{Steer/Kernels => }/cuda/Kernels.cu | 0 .../{Steer/Kernels => }/hip/.gitignore | 0 .../{Steer/Kernels => }/hip/CMakeLists.txt | 11 ++----- GPU/GPUbenchmark/macro/CMakeLists.txt | 4 --- GPU/GPUbenchmark/macro/runGPUbenchmark.C | 9 ----- 11 files changed, 29 insertions(+), 76 deletions(-) rename GPU/GPUbenchmark/{Steer/Kernels => Shared}/Kernels.h (100%) delete mode 100644 GPU/GPUbenchmark/Steer/CMakeLists.txt delete mode 100644 GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt rename GPU/GPUbenchmark/{Steer/Kernels => }/benchmark.cxx (100%) rename GPU/GPUbenchmark/{Steer/Kernels => }/cuda/CMakeLists.txt (80%) rename GPU/GPUbenchmark/{Steer/Kernels => }/cuda/Kernels.cu (100%) rename GPU/GPUbenchmark/{Steer/Kernels => }/hip/.gitignore (100%) rename GPU/GPUbenchmark/{Steer/Kernels => }/hip/CMakeLists.txt (80%) delete mode 100644 GPU/GPUbenchmark/macro/CMakeLists.txt delete mode 100644 GPU/GPUbenchmark/macro/runGPUbenchmark.C diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 28a562412c927..911d567cda350 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -8,5 +8,26 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
-add_subdirectory(macro) -add_subdirectory(Steer) \ No newline at end of file +if(CUDA_ENABLED) + message("Building CUDA benchmark library") + add_subdirectory(cuda) + o2_add_executable(memory-benchmark-cuda + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + TARGETVARNAME targetName) +endif() + +if(HIP_ENABLED) + message("Building HIP benchmark library") + add_subdirectory(hip) + set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") + + o2_add_executable(memory-benchmark-hip + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + hip::host + hip::device + TARGETVARNAME targetName) +endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/Kernels.h rename to GPU/GPUbenchmark/Shared/Kernels.h diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt deleted file mode 100644 index aeca995eb458f..0000000000000 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. - -add_subdirectory(Kernels) - -# o2_add_library(GPUBenchmark -# SOURCES BenchmarkSteer.cxx -# PUBLIC_INCLUDE_DIRECTORIES . -# PUBLIC_LINK_LIBRARIES O2::HIPbenchmark -# O2::CUDAbenchmark) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt deleted file mode 100644 index 911d567cda350..0000000000000 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. 
- -if(CUDA_ENABLED) - message("Building CUDA benchmark library") - add_subdirectory(cuda) - o2_add_executable(memory-benchmark-cuda - SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark - TARGETVARNAME targetName) -endif() - -if(HIP_ENABLED) - message("Building HIP benchmark library") - add_subdirectory(hip) - set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) - set(CMAKE_CXX_EXTENSIONS OFF) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") - - o2_add_executable(memory-benchmark-hip - SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - hip::host - hip::device - TARGETVARNAME targetName) -endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx rename to GPU/GPUbenchmark/benchmark.cxx diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt similarity index 80% rename from GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt rename to GPU/GPUbenchmark/cuda/CMakeLists.txt index f9866db53131f..3ce3990534ece 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -8,14 +8,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -set(HDRS_INSTALL ../Kernels.h) +set(HDRS_INSTALL ../Shared/Kernels.h) o2_add_library(CUDAbenchmark SOURCES Kernels.cu - PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file +# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu rename to GPU/GPUbenchmark/cuda/Kernels.cu diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore b/GPU/GPUbenchmark/hip/.gitignore similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore rename to GPU/GPUbenchmark/hip/.gitignore diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt similarity index 80% rename from GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt rename to GPU/GPUbenchmark/hip/CMakeLists.txt index 93192c2c5a3d0..27e8a15efdc20 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -8,7 +8,7 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
-set(HDRS_INSTALL ../Kernels.h) +set(HDRS_INSTALL ../Shared/Kernels.h) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) @@ -28,19 +28,14 @@ message("Generating HIP kernel code on the fly...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx - PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host hip::device TARGETVARNAME targetName) - # if(HIP_AMDGPUTARGET) - # # Need to add gpu target also to link flags due to gpu-rdc option - - # target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) - # endif() elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file +# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt deleted file mode 100644 index 556948e819a65..0000000000000 --- a/GPU/GPUbenchmark/macro/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -o2_add_test_root_macro(runGPUbenchmark.C - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - O2::GPUCommon - LABELS gpu COMPILE_ONLY) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/runGPUbenchmark.C b/GPU/GPUbenchmark/macro/runGPUbenchmark.C deleted file mode 100644 index 0d3bf53899984..0000000000000 --- a/GPU/GPUbenchmark/macro/runGPUbenchmark.C +++ /dev/null @@ -1,9 +0,0 @@ -#if !defined(__CLING__) || defined(__ROOTCLING__) -#include -#endif - -void runGPUbenchmark() -{ - o2::benchmark::GPUbenchmark bm{}; - bm.hello(); -} \ No newline at end of file From 84f8534610d10f112ec17c60afae8955ad12f6d6 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 27 May 2021 15:30:25 +0200 Subject: [PATCH 11/42] Cleanup --- GPU/GPUbenchmark/Shared/Common.h | 44 +++++ GPU/GPUbenchmark/Shared/Kernels.h | 37 ++++- GPU/GPUbenchmark/benchmark.cxx | 3 +- GPU/GPUbenchmark/cuda/CMakeLists.txt | 4 +- GPU/GPUbenchmark/cuda/Kernels.cu | 238 +++++++++++---------------- GPU/GPUbenchmark/hip/CMakeLists.txt | 6 +- 6 files changed, 170 insertions(+), 162 deletions(-) create mode 100644 GPU/GPUbenchmark/Shared/Common.h diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h new file mode 100644 index 0000000000000..b98d01923dc61 --- /dev/null +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -0,0 +1,44 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file Common.h +/// \author: mconcas@cern.ch + +#ifndef GPUBENCHMARK_COMMON_H +#define GPUBENCHMARK_COMMON_H +#if defined (__HIPCC__) +#include "hip/hip_runtime.h" +#endif +#if defined (__HIPCC__) +#define AUTO_DISCARD "auto discard =" +#else +#define AUTO_DISCARD +#endif + +#include +#include +#include "GPUCommonDef.h" + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define failed(...) \ + printf("%serror: ", KRED); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + printf("error: TEST FAILED\n%s", KNRM); \ + exit(EXIT_FAILURE); +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 0390c93e33040..514eede070e73 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -8,10 +8,11 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. /// -/// \file GPUbenchmark.h +/// \file Kernels.h /// \author: mconcas@cern.ch -#ifndef GPUBENCHMARK_H -#define GPUBENCHMARK_H + +#ifndef GPU_BENCHMARK_KERNELS_H +#define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" @@ -19,21 +20,41 @@ namespace o2 { namespace benchmark { -void hello_util(); +void printDevices(); class GPUbenchmark final { public: GPUbenchmark() = default; virtual ~GPUbenchmark() = default; - void hello(); + void run(); }; // Steers -void GPUbenchmark::hello() +void GPUbenchmark::run() { - hello_util(); + printDevices(); } } // namespace benchmark } // namespace o2 -#endif \ No newline at end of file +#endif + + +/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. +I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. + +Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. +For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. +We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. +We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. + +For the tests I want to run in the segments, I think these should be: +- Linear read in a multithreaded way: i.e. the standard GPU for loop: +for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; +In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. +- Then I'd do the same with some stride, i.e.: +foo += array[i * stride]; +- I'd try a random access with some simple linear congruence RNG per thread to determine the address. +- Then I'd do the same with writing memory, and with copying memory. +- Finally the data type should be flexible, going from char to uint4. 
+That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 19cd67d354942..aaaa0ffbbe390 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -16,8 +16,7 @@ int main() { - std::cout << "HELLO WORLD" << std::endl; o2::benchmark::GPUbenchmark bm{}; - bm.hello(); + bm.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt index 3ce3990534ece..0e8415cef1262 100644 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -16,6 +16,4 @@ o2_add_library(CUDAbenchmark PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) - set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) - -# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index d9aa3ac88359c..7d2070d7065d8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -12,24 +12,7 @@ /// \author: mconcas@cern.ch #include -#include -#include - -#define KNRM "\x1B[0m" -#define KRED "\x1B[31m" -#define KGRN "\x1B[32m" -#define KYEL "\x1B[33m" -#define KBLU "\x1B[34m" -#define KMAG "\x1B[35m" -#define KCYN "\x1B[36m" -#define KWHT "\x1B[37m" - -#define failed(...) \ - printf("%serror: ", KRED); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - printf("error: TEST FAILED\n%s", KNRM); \ - exit(EXIT_FAILURE); +#include #define GPUCHECK(error) \ if (error != cudaSuccess) { \ @@ -38,13 +21,6 @@ failed("API returned error code."); \ } -void printCompilerInfo() -{ -#ifdef __NVCC__ - printf("compiler: nvcc\n"); -#endif -} - double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } @@ -61,165 +37,137 @@ namespace benchmark { namespace gpu { -GPUg() void helloKernel() -{ - printf("Hello World from GPU!\n"); -} - +// Kernels here } // namespace gpu void printDeviceProp(int deviceId) { - using namespace std; const int w1 = 34; - cout << left; - cout << setw(w1) - << "--------------------------------------------------------------------------------" - << endl; - cout << setw(w1) << "device#" << deviceId << endl; + std::cout << std::left; + std::cout << std::setw(w1) + << "--------------------------------------------------------------------------------" + << std::endl; + std::cout << std::setw(w1) << "device#" << deviceId << std::endl; cudaDeviceProp props; GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); - cout << setw(w1) << "Name: " << props.name << endl; - cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; - cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; - cout << setw(w1) << "pciDomainID: " << props.pciDomainID << endl; - cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; - cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor - << endl; - cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; - cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; - cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" - << endl; - cout << setw(w1) << 
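
The design note added to Kernels.h above lists the access patterns the benchmark is meant to cover: a grid-stride linear read, a strided read, and a random read driven by a simple per-thread linear congruential generator, each accumulating into a value that is finally written out so the compiler cannot drop the loads. Below is a minimal CUDA sketch of what such kernels could look like; the kernel names, the sink buffer (one element per launched thread) and the 64-bit LCG constants are illustrative assumptions, not code taken from these patches.

  #include <cstddef>
  #include <cstdint>

  // Grid-stride linear read over one segment; sink holds one element per launched thread.
  template <class T>
  __global__ void readLinear(const T* segment, size_t nElements, T* sink)
  {
    T foo{0};
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nElements; i += blockDim.x * gridDim.x) {
      foo += segment[i];
    }
    sink[blockIdx.x * blockDim.x + threadIdx.x] = foo; // write the result so the reads are not optimized away
  }

  // Same loop with a configurable stride (stride >= 1, in elements).
  template <class T>
  __global__ void readStrided(const T* segment, size_t nElements, size_t stride, T* sink)
  {
    T foo{0};
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i * stride < nElements; i += blockDim.x * gridDim.x) {
      foo += segment[i * stride];
    }
    sink[blockIdx.x * blockDim.x + threadIdx.x] = foo;
  }

  // Random access: a per-thread 64-bit LCG picks the next index to read.
  template <class T>
  __global__ void readRandom(const T* segment, size_t nElements, size_t nReads, T* sink)
  {
    uint64_t state = blockIdx.x * blockDim.x + threadIdx.x + 1;
    T foo{0};
    for (size_t r = 0; r < nReads; ++r) {
      state = state * 6364136223846793005ull + 1442695040888963407ull; // commonly used 64-bit LCG constants
      foo += segment[state % nElements];
    }
    sink[blockIdx.x * blockDim.x + threadIdx.x] = foo;
  }

Per the note, each kernel would be launched with one block per multiprocessor (for example readLinear<<<props.multiProcessorCount, 256>>>(segment, nElements, sink)) so that blocks map one to one onto the SMs, and run long enough that the initial scheduling becomes irrelevant.
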
"memoryBusWidth: " << props.memoryBusWidth << endl; - cout << setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 - << " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) - << bytesToGB(props.totalGlobalMem) << " GB" << endl; + std::cout << std::setw(w1) << "Name: " << props.name << std::endl; + std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl; + std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl; + std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl; + std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor + << std::endl; + std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl; + std::cout << std::setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl; + std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" + << std::endl; + std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl; + std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 + << " Mhz" << std::endl; + std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2) + << bytesToGB(props.totalGlobalMem) << " GB" << std::endl; #if !defined(__CUDACC__) - cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) - << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << endl; + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << std::endl; #endif #if defined(__HIPCC__) - cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) - << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl; + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << std::endl; #endif - cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; - cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" - << endl; - cout << setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << endl; - cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl; - cout << setw(w1) << "warpSize: " << props.warpSize << endl; - cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl; - cout << setw(w1) << "computeMode: " << props.computeMode << endl; - cout << setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << endl; - cout << setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << endl; - cout << setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << endl; - cout << setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << endl; - cout << setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << endl; - cout << setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << endl; - cout << setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << endl; - cout << setw(w1) << "major: " << props.major << endl; - cout << setw(w1) << "minor: " << props.minor << endl; - cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl; - cout << setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << endl; - cout 
<< setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << endl; + std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl; + std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" + << std::endl; + std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl; + std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl; + std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl; + std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl; + std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl; + std::cout << std::setw(w1) << "major: " << props.major << std::endl; + std::cout << std::setw(w1) << "minor: " << props.minor << std::endl; + std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl; + std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl; + std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl; #if defined(__HIPCC__) - cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl; - cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch - << endl; - cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl; - cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch - << endl; - cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl; - cout << setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl; - cout << setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl; - cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl; - cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl; - cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl; - cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl; - cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl; - cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl; - cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; - cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; - cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; - cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; - cout << setw(w1) << "gcnArchName: " << props.gcnArchName << endl; + std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << 
std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl; + std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl; + std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl; + std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl; + std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl; + std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl; + std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl; + std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl; #endif - cout << setw(w1) << "isIntegrated: " << props.integrated << endl; - cout << setw(w1) << "maxTexture1D: " << props.maxTexture1D << endl; - cout << setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << endl; - cout << setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << endl; - cout << setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << endl; - cout << setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << endl; - cout << setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << endl; + std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl; + std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl; #if defined(__HIPCC__) - cout << setw(w1) << "isLargeBar: " << props.isLargeBar << endl; - cout << setw(w1) << "asicRevision: " << props.asicRevision << endl; + std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl; + std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl; #endif int deviceCnt; - cudaGetDeviceCount(&deviceCnt); - cout << setw(w1) << "peers: "; + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + std::cout << std::setw(w1) << "peers: "; for (int i = 0; i < deviceCnt; i++) { int isPeer; - cudaDeviceCanAccessPeer(&isPeer, i, 
deviceId); + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); if (isPeer) { - cout << "device#" << i << " "; + std::cout << "device#" << i << " "; } } - cout << endl; - cout << setw(w1) << "non-peers: "; + std::cout << std::endl; + std::cout << std::setw(w1) << "non-peers: "; for (int i = 0; i < deviceCnt; i++) { int isPeer; - cudaDeviceCanAccessPeer(&isPeer, i, deviceId); + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); if (!isPeer) { - cout << "device#" << i << " "; + std::cout << "device#" << i << " "; } } - cout << endl; + std::cout << std::endl; size_t free, total; - cudaMemGetInfo(&free, &total); + GPUCHECK(cudaMemGetInfo(&free, &total)); - cout << fixed << setprecision(2); - cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl; - cout << setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << setprecision(0) - << (float)free / total * 100.0 << "%)" << endl; + std::cout << std::fixed << std::setprecision(2); + std::cout << std::setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << std::endl; + std::cout << std::setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << std::setprecision(0) + << (float)free / total * 100.0 << "%)" << std::endl; } -void hello_util() +void printDevices() { int deviceCnt; - GPUCHECK(cudaGetDeviceCount(&deviceCnt)); for (int i = 0; i < deviceCnt; i++) { - cudaSetDevice(i); + GPUCHECK(cudaSetDevice(i)); printDeviceProp(i); } - - // gpu::helloKernel<<<1, 1>>>(); - // displayCard(); } } // namespace benchmark -} // namespace o2 - -/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. -I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. - -Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. -For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. -We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. -We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. - -For the tests I want to run in the segments, I think these should be: -- Linear read in a multithreaded way: i.e. the standard GPU for loop: -for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; -In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. -- Then I'd do the same with some stride, i.e.: -foo += array[i * stride]; -- I'd try a random access with some simple linear congruence RNG per thread to determine the address. -- Then I'd do the same with writing memory, and with copying memory. -- Finally the data type should be flexible, going from char to uint4. 
-That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index 27e8a15efdc20..1447441c9da1a 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -24,7 +24,7 @@ set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) # Generate on-the-fly the HIP kernel -message("Generating HIP kernel code on the fly...") +message("Generating HIP kernel code ....") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx @@ -36,6 +36,4 @@ o2_add_library(HIPbenchmark elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -endif() - -# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file +endif() \ No newline at end of file From 7e2ef6ae0e3589569070b0d58aa5cb4f3f37d3fc Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 28 May 2021 17:27:38 +0200 Subject: [PATCH 12/42] Add CMake forced re-configuration --- GPU/GPUbenchmark/Shared/Common.h | 7 +------ GPU/GPUbenchmark/Shared/Kernels.h | 23 ++++++++++++++++++++++- GPU/GPUbenchmark/cuda/Kernels.cu | 10 ++++++++++ GPU/GPUbenchmark/hip/CMakeLists.txt | 3 ++- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index b98d01923dc61..38f34cf4c1902 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -13,14 +13,9 @@ #ifndef GPUBENCHMARK_COMMON_H #define GPUBENCHMARK_COMMON_H -#if defined (__HIPCC__) +#if defined(__HIPCC__) #include "hip/hip_runtime.h" #endif -#if defined (__HIPCC__) -#define AUTO_DISCARD "auto discard =" -#else -#define AUTO_DISCARD -#endif #include #include diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 514eede070e73..5640f07a78e57 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,12 +15,32 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" +#include +#include +#include namespace o2 { namespace benchmark { void printDevices(); +void init(); +template +float measure(void (*task)(T...), const char* taskName, T&&... 
args) +{ + float diff{0.f}; + + auto start = std::chrono::high_resolution_clock::now(); + (*task)(std::forward(args)...); + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration diff_t{end - start}; + diff = diff_t.count(); + + std::cout << std::setw(2) << " - " << taskName << " completed in: " << diff << " ms" << std::endl; + + return diff; +} class GPUbenchmark final { @@ -33,7 +53,8 @@ class GPUbenchmark final // Steers void GPUbenchmark::run() { - printDevices(); + // printDevices(); + measure(&init, "Init"); } } // namespace benchmark } // namespace o2 diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 7d2070d7065d8..e555385c600f9 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -169,5 +169,15 @@ void printDevices() printDeviceProp(i); } } + +void init() +{ + size_t free, total; + GPUCHECK(cudaMemGetInfo(&free, &total)); + + void* devicePtr; + GPUCHECK(cudaMalloc(&devicePtr, total)); +} + } // namespace benchmark } // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index 1447441c9da1a..a785a74ec40c5 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -22,9 +22,10 @@ set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) +set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) # Generate on-the-fly the HIP kernel -message("Generating HIP kernel code ....") +message("Generating HIP kernel code ...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx From 664759fe46879ad435ce2bc2c595c06758d87852 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 31 May 2021 19:13:27 +0200 Subject: [PATCH 13/42] HIP can't find symbols --- GPU/GPUbenchmark/Shared/Common.h | 1 - GPU/GPUbenchmark/Shared/Kernels.h | 66 +++++++++++++------- GPU/GPUbenchmark/benchmark.cxx | 2 +- GPU/GPUbenchmark/cuda/Kernels.cu | 94 +++++++++++++++++++++++------ GPU/GPUbenchmark/hip/CMakeLists.txt | 7 +++ 5 files changed, 130 insertions(+), 40 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 38f34cf4c1902..c831ac46882d8 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -19,7 +19,6 @@ #include #include -#include "GPUCommonDef.h" #define KNRM "\x1B[0m" #define KRED "\x1B[31m" diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 5640f07a78e57..f8983139718b9 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,52 +15,76 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" +#include #include #include #include +#define PARTITION_SIZE_GB 1 +#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.99f +#define GB 1073741824 + +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } + namespace o2 { namespace benchmark { -void printDevices(); -void init(); -template -float measure(void (*task)(T...), const char* taskName, T&&... 
args) -{ - float diff{0.f}; - auto start = std::chrono::high_resolution_clock::now(); - (*task)(std::forward(args)...); - auto end = std::chrono::high_resolution_clock::now(); +template +struct gpuState { + int getMaxSegments() + { + return bytesToGB(allocatedMemory); + } - std::chrono::duration diff_t{end - start}; - diff = diff_t.count(); + void computeBufferPointers() + { + addresses.resize(getMaxSegments()); + for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { + addresses[iBuffAddress] = scratchPtr + GB * PARTITION_SIZE_GB * iBuffAddress; + } + } - std::cout << std::setw(2) << " - " << taskName << " completed in: " << diff << " ms" << std::endl; + std::vector getBuffersPointers() + { + return addresses; + } - return diff; -} + std::vector addresses; + size_t allocatedMemory; + T* scratchPtr; + + //Static info + size_t totalMemory; + size_t nMultiprocessors; + size_t nMaxThreadsPerBlock; +}; +template class GPUbenchmark final { public: GPUbenchmark() = default; virtual ~GPUbenchmark() = default; + template + float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); + + void init(const int deviceId); void run(); + void finalize(); + void readingBenchmark(); + void printDevices(); + + private: + gpuState mState; }; -// Steers -void GPUbenchmark::run() -{ - // printDevices(); - measure(&init, "Init"); -} } // namespace benchmark } // namespace o2 #endif - /*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index aaaa0ffbbe390..610028a8e16f6 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -16,7 +16,7 @@ int main() { - o2::benchmark::GPUbenchmark bm{}; + o2::benchmark::GPUbenchmark bm{}; bm.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e555385c600f9..3bc8566ef3191 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -21,24 +21,27 @@ failed("API returned error code."); \ } -double bytesToKB(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } - -#define printLimit(w1, limit, units) \ - { \ - size_t val; \ - cudaDeviceGetLimit(&val, limit); \ - std::cout << setw(w1) << #limit ": " << val << " " << units << std::endl; \ - } - namespace o2 { namespace benchmark { namespace gpu { -// Kernels here +// Kernels go here +template +GPUg() void readerKernel( + // buffer_type* buffer, + // size_t bufferSize) +) +{ + printf("ciao"); + // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { + // if (i == 0) { + // } + // } +} } // namespace gpu + void printDeviceProp(int deviceId) { const int w1 = 34; @@ -159,7 +162,22 @@ void printDeviceProp(int deviceId) << (float)free / total * 100.0 << "%)" << std::endl; } -void printDevices() +template +template +float GPUbenchmark::measure(void (GPUbenchmark::*task)(T...), const char* taskName, T&&... 
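
One detail of computeBufferPointers() above is worth spelling out: scratchPtr is a T*, so scratchPtr + GB * PARTITION_SIZE_GB * iBuffAddress advances in units of sizeof(T) rather than bytes; a later patch in this series switches to a char* cast to obtain byte offsets. The following is a small host-side sketch of that byte-based partitioning, with illustrative names (partitionScratch, GiB) that do not appear in the patches.

  #include <cstddef>
  #include <vector>

  constexpr size_t GiB = 1024ull * 1024ull * 1024ull;

  // Split a scratch allocation into 1-GiB partitions; offsets are computed in bytes,
  // so the partition boundaries do not depend on sizeof(T).
  template <class T>
  std::vector<T*> partitionScratch(T* scratchPtr, size_t scratchSizeBytes)
  {
    std::vector<T*> partitions;
    for (size_t offset = 0; offset + GiB <= scratchSizeBytes; offset += GiB) {
      partitions.push_back(reinterpret_cast<T*>(reinterpret_cast<char*>(scratchPtr) + offset));
    }
    return partitions;
  }

Each partition then holds GiB / sizeof(T) elements of the benchmarked type, which is what getArrayLength() reports later in the series.
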
args) +{ + float diff{0.f}; + auto start = std::chrono::high_resolution_clock::now(); + (this->*task)(std::forward(args)...); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff_t{end - start}; + diff = diff_t.count(); + std::cout << std::setw(2) << ">>> " << taskName << " completed in: " << diff << " ms" << std::endl; + return diff; +} + +template +void GPUbenchmark::printDevices() { int deviceCnt; GPUCHECK(cudaGetDeviceCount(&deviceCnt)); @@ -170,14 +188,56 @@ void printDevices() } } -void init() +template +void GPUbenchmark::init(const int deviceId) { - size_t free, total; - GPUCHECK(cudaMemGetInfo(&free, &total)); + cudaDeviceProp props; + size_t free; + + // Fetch and store traits + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); - void* devicePtr; - GPUCHECK(cudaMalloc(&devicePtr, total)); + mState.nMultiprocessors = props.multiProcessorCount; + mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; + mState.allocatedMemory = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); + + // Setup + GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.allocatedMemory)); +} + +template +void GPUbenchmark::readingBenchmark() +{ + dim3 nBlocks(mState.nMultiprocessors); + dim3 nThreads(mState.nMaxThreadsPerBlock); + gpu::readerKernel<<<1, 1>>>(); +} + +template +void GPUbenchmark::finalize() +{ + GPUCHECK(cudaFree(mState.scratchPtr)); +} + +template +void GPUbenchmark::run() +{ + printDevices(); + measure(&GPUbenchmark::init, "Init", 0); + std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory + << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; + std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; + mState.computeBufferPointers(); + + // for (auto& addr : mState.getBuffersPointers()) { + // std::cout << (void*)addr << std::endl; + // } + measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + GPUbenchmark::finalize(); } +template class GPUbenchmark; + } // namespace benchmark } // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index a785a74ec40c5..6d41c72148374 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -35,6 +35,13 @@ o2_add_library(HIPbenchmark hip::device TARGETVARNAME targetName) +target_compile_definitions(${targetName} PRIVATE $) + +if(HIP_AMDGPUTARGET) + # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) +endif() + elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() \ No newline at end of file From 1d352ab03ab27f6c011de64d698a1ea85c53d3b9 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Jun 2021 11:54:08 +0200 Subject: [PATCH 14/42] Checkpoint before radical change --- GPU/GPUbenchmark/CMakeLists.txt | 12 ++- GPU/GPUbenchmark/cuda/CMakeLists.txt | 4 +- GPU/GPUbenchmark/cuda/Kernels.cu | 107 ++++++++++++++++++++++----- GPU/GPUbenchmark/hip/CMakeLists.txt | 14 ++-- 4 files changed, 102 insertions(+), 35 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 911d567cda350..30969425203a9 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -9,7 +9,6 @@ # submit itself to any jurisdiction. 
if(CUDA_ENABLED) - message("Building CUDA benchmark library") add_subdirectory(cuda) o2_add_executable(memory-benchmark-cuda SOURCES benchmark.cxx @@ -18,16 +17,21 @@ if(CUDA_ENABLED) endif() if(HIP_ENABLED) - message("Building HIP benchmark library") add_subdirectory(hip) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") o2_add_executable(memory-benchmark-hip SOURCES benchmark.cxx PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - hip::host - hip::device + hip::host TARGETVARNAME targetName) + +if(HIP_AMDGPUTARGET) +# Need to add gpu target also to link flags due to gpu-rdc option +target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) +endif() endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt index 0e8415cef1262..89a88969c2a3f 100644 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -14,6 +14,4 @@ o2_add_library(CUDAbenchmark SOURCES Kernels.cu PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon - TARGETVARNAME targetName) - - set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) \ No newline at end of file + TARGETVARNAME targetName) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 3bc8566ef3191..20e5187ba69f1 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -13,6 +13,7 @@ #include #include +#include #define GPUCHECK(error) \ if (error != cudaSuccess) { \ @@ -21,6 +22,15 @@ failed("API returned error code."); \ } +#define CHECK(cmd) \ + { \ + cudaError_t error = cmd; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } + namespace o2 { namespace benchmark @@ -28,18 +38,33 @@ namespace benchmark namespace gpu { // Kernels go here -template -GPUg() void readerKernel( - // buffer_type* buffer, - // size_t bufferSize) -) +/* + * Square each element in the array A and write to array C. 
+ */ +template +__global__ void + vector_square(T* C_d, T* A_d, size_t N) { - printf("ciao"); - // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { - // if (i == 0) { - // } - // } + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = offset; i < N; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } } + +// template +// GPUg() void readerKernel( +// // buffer_type* buffer, +// // size_t bufferSize) +// ) +// { +// printf("ciao"); +// // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { +// // if (i == 0) { +// // } +// // } +// } } // namespace gpu void printDeviceProp(int deviceId) @@ -209,9 +234,51 @@ void GPUbenchmark::init(const int deviceId) template void GPUbenchmark::readingBenchmark() { - dim3 nBlocks(mState.nMultiprocessors); - dim3 nThreads(mState.nMaxThreadsPerBlock); - gpu::readerKernel<<<1, 1>>>(); + // dim3 nBlocks(mState.nMultiprocessors); + // dim3 nThreads(mState.nMaxThreadsPerBlock); + // gpu::readerKernel<<<1, 1>>>(); + float *A_d, *C_d; + float *A_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + + cudaDeviceProp props; + CHECK(cudaGetDeviceProperties(&props, 0 /*deviceID*/)); + printf("info: running on device %s\n", props.name); + + printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); + A_h = (float*)malloc(Nbytes); + CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess); + C_h = (float*)malloc(Nbytes); + CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess); + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + } + + printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); + CHECK(cudaMalloc(&A_d, Nbytes)); + CHECK(cudaMalloc(&C_d, Nbytes)); + + printf("info: copy Host2Device\n"); + CHECK(cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + + printf("info: launch 'vector_square' kernel\n"); + gpu::vector_square<<>>(C_d, A_d, N); + + printf("info: copy Device2Host\n"); + CHECK(cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost)); + + printf("info: check result\n"); + for (size_t i = 0; i < N; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + CHECK(cudaErrorUnknown); + } + } + printf("PASSED!\n"); } template @@ -223,18 +290,18 @@ void GPUbenchmark::finalize() template void GPUbenchmark::run() { - printDevices(); - measure(&GPUbenchmark::init, "Init", 0); - std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory - << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; - std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; - mState.computeBufferPointers(); + // printDevices(); + // measure(&GPUbenchmark::init, "Init", 0); + // std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory + // << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; + // std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; + // mState.computeBufferPointers(); // for (auto& addr : mState.getBuffersPointers()) { // std::cout << (void*)addr << std::endl; // } measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); - GPUbenchmark::finalize(); + // GPUbenchmark::finalize(); } template class GPUbenchmark; diff --git 
a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index 6d41c72148374..b599db0de6cc6 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -10,33 +10,31 @@ set(HDRS_INSTALL ../Shared/Kernels.h) -set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") -set(HIP_KERNEL "Kernels.hip.cxx") +set(HIP_KERNEL "Kernels.hip.cxx") set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) - -# Generate on-the-fly the HIP kernel message("Generating HIP kernel code ...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon - hip::host - hip::device + # hip::host + # hip::device TARGETVARNAME targetName) -target_compile_definitions(${targetName} PRIVATE $) - if(HIP_AMDGPUTARGET) # Need to add gpu target also to link flags due to gpu-rdc option target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) From 12c5d394f61e0ec0d2375ca260668d52741ad25e Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Jun 2021 12:39:12 +0200 Subject: [PATCH 15/42] Create single executable --- GPU/GPUbenchmark/CMakeLists.txt | 36 ++++++++++++++++------ GPU/GPUbenchmark/Shared/Common.h | 5 ++-- GPU/GPUbenchmark/Shared/Kernels.h | 5 +--- GPU/GPUbenchmark/benchmark.cxx | 2 +- GPU/GPUbenchmark/cuda/CMakeLists.txt | 17 ----------- GPU/GPUbenchmark/cuda/Kernels.cu | 7 +++-- GPU/GPUbenchmark/hip/CMakeLists.txt | 45 ---------------------------- 7 files changed, 37 insertions(+), 80 deletions(-) delete mode 100644 GPU/GPUbenchmark/cuda/CMakeLists.txt delete mode 100644 GPU/GPUbenchmark/hip/CMakeLists.txt diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 30969425203a9..0829637fee964 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -8,17 +8,34 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
+set(HDRS_INSTALL ../Shared/Kernels.h) + if(CUDA_ENABLED) - add_subdirectory(cuda) + # add_subdirectory(cuda) o2_add_executable(memory-benchmark-cuda SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + cuda/Kernels.cu + PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) endif() if(HIP_ENABLED) - add_subdirectory(hip) - set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + # Hipify-perl + set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") + + set(HIP_KERNEL "Kernels.hip.cxx") + set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) + set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/hip/${HIP_KERNEL}") + + if(EXISTS ${HIPIFY_EXECUTABLE}) + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) + message("Generating HIP kernel code ...") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") + elseif() + message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") + endif() + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) @@ -26,12 +43,13 @@ if(HIP_ENABLED) o2_add_executable(memory-benchmark-hip SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + hip/Kernels.hip.cxx + PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host TARGETVARNAME targetName) -if(HIP_AMDGPUTARGET) -# Need to add gpu target also to link flags due to gpu-rdc option -target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) -endif() + if(HIP_AMDGPUTARGET) + # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) + endif() endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index c831ac46882d8..99db8e114aee6 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -11,8 +11,8 @@ /// \file Common.h /// \author: mconcas@cern.ch -#ifndef GPUBENCHMARK_COMMON_H -#define GPUBENCHMARK_COMMON_H +#ifndef GPU_BENCHMARK_COMMON_H +#define GPU_BENCHMARK_COMMON_H #if defined(__HIPCC__) #include "hip/hip_runtime.h" #endif @@ -35,4 +35,5 @@ printf("\n"); \ printf("error: TEST FAILED\n%s", KNRM); \ exit(EXIT_FAILURE); + #endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index f8983139718b9..32d93159ac958 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -24,9 +24,6 @@ #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.99f #define GB 1073741824 -double bytesToKB(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } - namespace o2 { namespace benchmark @@ -36,7 +33,7 @@ template struct gpuState { int getMaxSegments() { - return bytesToGB(allocatedMemory); + return (double)allocatedMemory / (1024.0 * 1024.0 * 1024.0); } void computeBufferPointers() diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 610028a8e16f6..89c2b3d79be76 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -12,7 +12,7 @@ /// \author: mconcas@cern.ch #include -#include +#include "Shared/Kernels.h" int main() { diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt deleted file mode 100644 index 89a88969c2a3f..0000000000000 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright CERN and copyright 
holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. - -set(HDRS_INSTALL ../Shared/Kernels.h) - -o2_add_library(CUDAbenchmark - SOURCES Kernels.cu - PUBLIC_INCLUDE_DIRECTORIES ../Shared - PUBLIC_LINK_LIBRARIES O2::GPUCommon - TARGETVARNAME targetName) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 20e5187ba69f1..a2953c5b48d9f 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -11,10 +11,13 @@ /// \file Kernels.cu /// \author: mconcas@cern.ch -#include -#include +#include "../Shared/Kernels.h" +#include "../Shared/Common.h" #include +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } + #define GPUCHECK(error) \ if (error != cudaSuccess) { \ printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt deleted file mode 100644 index b599db0de6cc6..0000000000000 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. 
- -set(HDRS_INSTALL ../Shared/Kernels.h) - -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") - -# Hipify-perl -set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") - -set(HIP_KERNEL "Kernels.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) -set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") - -if(EXISTS ${HIPIFY_EXECUTABLE}) -set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) -message("Generating HIP kernel code ...") -execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") -o2_add_library(HIPbenchmark - SOURCES Kernels.hip.cxx - PUBLIC_INCLUDE_DIRECTORIES ../Shared - PUBLIC_LINK_LIBRARIES O2::GPUCommon - # hip::host - # hip::device - TARGETVARNAME targetName) - -if(HIP_AMDGPUTARGET) - # Need to add gpu target also to link flags due to gpu-rdc option - target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) -endif() - -elseif() - message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -endif() \ No newline at end of file From 0a7ae2ec7dfc1c4d536e942b904b914115c1ab46 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Jun 2021 17:27:27 +0200 Subject: [PATCH 16/42] Update --- GPU/GPUbenchmark/CMakeLists.txt | 4 +- GPU/GPUbenchmark/Shared/Kernels.h | 5 ++ GPU/GPUbenchmark/benchmark.cxx | 7 +- GPU/GPUbenchmark/cuda/Kernels.cu | 130 +++++++++++------------------- 4 files changed, 58 insertions(+), 88 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 0829637fee964..71ba08edeb975 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -12,7 +12,7 @@ set(HDRS_INSTALL ../Shared/Kernels.h) if(CUDA_ENABLED) # add_subdirectory(cuda) - o2_add_executable(memory-benchmark-cuda + o2_add_executable(gpu-memory-benchmark-cuda SOURCES benchmark.cxx cuda/Kernels.cu PUBLIC_LINK_LIBRARIES O2::GPUCommon @@ -41,7 +41,7 @@ if(HIP_ENABLED) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") - o2_add_executable(memory-benchmark-hip + o2_add_executable(gpu-memory-benchmark-hip SOURCES benchmark.cxx hip/Kernels.hip.cxx PUBLIC_LINK_LIBRARIES O2::GPUCommon diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 32d93159ac958..a27938ad320ab 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -44,6 +44,11 @@ struct gpuState { } } + size_t getArrayLength() + { + return static_cast(GB * PARTITION_SIZE_GB / sizeof(T)); + } + std::vector getBuffersPointers() { return addresses; diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 89c2b3d79be76..7a83888bd57ed 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -16,7 +16,10 @@ int main() { - o2::benchmark::GPUbenchmark bm{}; - bm.run(); + o2::benchmark::GPUbenchmark bm_char{}; + bm_char.run(); + o2::benchmark::GPUbenchmark bm_int{}; + bm_int.run(); + return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index a2953c5b48d9f..ffb0af087c147 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -25,14 +25,14 @@ double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } failed("API returned error code."); \ } -#define CHECK(cmd) 
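
getArrayLength() above converts the fixed 1 GiB partition size from bytes into elements of the benchmarked type. A compile-time check of that arithmetic, assuming the usual 64-bit sizes (sizeof(int) == 4, sizeof(size_t) == 8); the helper name arrayLength is illustrative:

  #include <cstddef>

  constexpr size_t GB = 1073741824;       // same constant as in Kernels.h
  constexpr size_t PARTITION_SIZE_GB = 1;

  template <class T>
  constexpr size_t arrayLength() { return GB * PARTITION_SIZE_GB / sizeof(T); }

  static_assert(arrayLength<char>() == 1073741824, "1 GiB of 1-byte elements");
  static_assert(arrayLength<int>() == 268435456, "1 GiB of 4-byte elements");
  static_assert(arrayLength<size_t>() == 134217728, "1 GiB of 8-byte elements");
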
\ - { \ - cudaError_t error = cmd; \ - if (error != cudaSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } +// #define CHECK(cmd) \ +// { \ +// cudaError_t error = cmd; \ +// if (error != cudaSuccess) { \ +// fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ +// exit(EXIT_FAILURE); \ +// } \ +// } namespace o2 { @@ -44,30 +44,26 @@ namespace gpu /* * Square each element in the array A and write to array C. */ -template -__global__ void - vector_square(T* C_d, T* A_d, size_t N) -{ - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x; +// template +// __global__ void +// vector_square(T* C_d, T* A_d, size_t N) +// { +// size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); +// size_t stride = blockDim.x * gridDim.x; - for (size_t i = offset; i < N; i += stride) { - C_d[i] = A_d[i] * A_d[i]; +// for (size_t i = offset; i < N; i += stride) { +// C_d[i] = A_d[i] * A_d[i]; +// } +// } + +template +GPUg() void readerKernel( + buffer_type* buffer, + size_t bufferSize) +{ + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { } } - -// template -// GPUg() void readerKernel( -// // buffer_type* buffer, -// // size_t bufferSize) -// ) -// { -// printf("ciao"); -// // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { -// // if (i == 0) { -// // } -// // } -// } } // namespace gpu void printDeviceProp(int deviceId) @@ -232,56 +228,22 @@ void GPUbenchmark::init(const int deviceId) // Setup GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.allocatedMemory)); -} -template -void GPUbenchmark::readingBenchmark() -{ - // dim3 nBlocks(mState.nMultiprocessors); - // dim3 nThreads(mState.nMaxThreadsPerBlock); - // gpu::readerKernel<<<1, 1>>>(); - float *A_d, *C_d; - float *A_h, *C_h; - size_t N = 1000000; - size_t Nbytes = N * sizeof(float); + mState.computeBufferPointers(); - cudaDeviceProp props; - CHECK(cudaGetDeviceProperties(&props, 0 /*deviceID*/)); - printf("info: running on device %s\n", props.name); - - printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); - A_h = (float*)malloc(Nbytes); - CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess); - C_h = (float*)malloc(Nbytes); - CHECK(C_h == 0 ? 
cudaErrorMemoryAllocation : cudaSuccess); - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - } - - printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); - CHECK(cudaMalloc(&A_d, Nbytes)); - CHECK(cudaMalloc(&C_d, Nbytes)); - - printf("info: copy Host2Device\n"); - CHECK(cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice)); + for (size_t iAddr{0}; iAddr < mState.getBuffersPointers().size(); ++iAddr) { - const unsigned blocks = 512; - const unsigned threadsPerBlock = 256; + } - printf("info: launch 'vector_square' kernel\n"); - gpu::vector_square<<>>(C_d, A_d, N); - printf("info: copy Device2Host\n"); - CHECK(cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost)); +} - printf("info: check result\n"); - for (size_t i = 0; i < N; i++) { - if (C_h[i] != A_h[i] * A_h[i]) { - CHECK(cudaErrorUnknown); - } - } - printf("PASSED!\n"); +template +void GPUbenchmark::readingBenchmark() +{ + dim3 nBlocks(mState.nMultiprocessors); + dim3 nThreads(mState.nMaxThreadsPerBlock); + // gpu::readerKernel<<>>(); } template @@ -294,20 +256,20 @@ template void GPUbenchmark::run() { // printDevices(); - // measure(&GPUbenchmark::init, "Init", 0); - // std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory - // << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; - // std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; - // mState.computeBufferPointers(); - - // for (auto& addr : mState.getBuffersPointers()) { - // std::cout << (void*)addr << std::endl; - // } - measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); - // GPUbenchmark::finalize(); + measure(&GPUbenchmark::init, "Init", 0); + std::cout << " ├ Allocated: " << mState.allocatedMemory << "/" << mState.totalMemory + << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; + std::cout << " ├ Can do: " << mState.getMaxSegments() << " segments of " << PARTITION_SIZE_GB << "GB each\n"; + std::cout << " └ Length of arrays in segments: " << mState.getArrayLength() << std::endl; + + + // measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + GPUbenchmark::finalize(); } template class GPUbenchmark; +template class GPUbenchmark; +template class GPUbenchmark; } // namespace benchmark } // namespace o2 \ No newline at end of file From 6c352b15ce004b36df534f5c7babfef678f591ac Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 21 Jun 2021 17:28:26 +0200 Subject: [PATCH 17/42] Add first dummy benchmark --- GPU/GPUbenchmark/Shared/Common.h | 6 +- GPU/GPUbenchmark/Shared/Kernels.h | 70 +++++++++---------- GPU/GPUbenchmark/benchmark.cxx | 11 +-- GPU/GPUbenchmark/cuda/Kernels.cu | 109 +++++++++++++++++------------- 4 files changed, 106 insertions(+), 90 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 99db8e114aee6..b9af3be3ed966 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -13,9 +13,9 @@ #ifndef GPU_BENCHMARK_COMMON_H #define GPU_BENCHMARK_COMMON_H -#if defined(__HIPCC__) -#include "hip/hip_runtime.h" -#endif +// #if defined(__HIPCC__) +// #include "hip/hip_runtime.h" +// #endif #include #include diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index a27938ad320ab..3c030cf9cae01 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -21,7 +21,7 @@ #include 
#define PARTITION_SIZE_GB 1 -#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.99f +#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f #define GB 1073741824 namespace o2 @@ -33,32 +33,46 @@ template struct gpuState { int getMaxSegments() { - return (double)allocatedMemory / (1024.0 * 1024.0 * 1024.0); + return (double)scratchSize / (1024.0 * 1024.0 * 1024.0); } - void computeBufferPointers() + void computeScratchPtrs() { addresses.resize(getMaxSegments()); for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { - addresses[iBuffAddress] = scratchPtr + GB * PARTITION_SIZE_GB * iBuffAddress; + addresses[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + GB * iBuffAddress); } } - size_t getArrayLength() + static constexpr size_t getArraySize() { return static_cast(GB * PARTITION_SIZE_GB / sizeof(T)); } - std::vector getBuffersPointers() + std::vector getScratchPtrs() { return addresses; } - std::vector addresses; - size_t allocatedMemory; - T* scratchPtr; + std::vector>& getHostBuffers() + { + return gpuBuffersHost; + } + + // General containers and state + T* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector addresses; // Pointers to scratch partitions + std::vector> gpuBuffersHost; // Host-based vector-ized data + + // Test-specific containers + std::vector deviceReadingResultsPtrs; // Results of the reading test (single variable) on GPU + std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host - //Static info + // Configuration + size_t nMaxThreadsPerDimension; + + // Static info size_t totalMemory; size_t nMultiprocessors; size_t nMaxThreadsPerBlock; @@ -73,11 +87,18 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); - void init(const int deviceId); - void run(); - void finalize(); + // Main interface + void generalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters + void run(); // Execute all specified callbacks + void generalFinalize(); // Cleanup + void printDevices(); // Dump info + + // Initializations/Finalizations of tests. Not to be measured, in principle used for report + void readingInit(); + void readingFinalize(); + + // Benchmark kernel callbacks void readingBenchmark(); - void printDevices(); private: gpuState mState; @@ -85,23 +106,4 @@ class GPUbenchmark final } // namespace benchmark } // namespace o2 -#endif - -/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. -I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. - -Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. -For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. -We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. -We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. - -For the tests I want to run in the segments, I think these should be: -- Linear read in a multithreaded way: i.e. 
the standard GPU for loop: -for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; -In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. -- Then I'd do the same with some stride, i.e.: -foo += array[i * stride]; -- I'd try a random access with some simple linear congruence RNG per thread to determine the address. -- Then I'd do the same with writing memory, and with copying memory. -- Finally the data type should be flexible, going from char to uint4. -That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 7a83888bd57ed..80d3d04595f3d 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -11,15 +11,16 @@ /// \file benchmark.cxx /// \author: mconcas@cern.ch -#include #include "Shared/Kernels.h" int main() { - o2::benchmark::GPUbenchmark bm_char{}; - bm_char.run(); - o2::benchmark::GPUbenchmark bm_int{}; - bm_int.run(); + // o2::benchmark::GPUbenchmark bm_char{}; + // bm_char.run(); + o2::benchmark::GPUbenchmark bm_size_t{}; + bm_size_t.run(); + // o2::benchmark::GPUbenchmark bm_int{}; + // bm_int.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index ffb0af087c147..b18d926e42b4e 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -25,45 +25,30 @@ double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } failed("API returned error code."); \ } -// #define CHECK(cmd) \ -// { \ -// cudaError_t error = cmd; \ -// if (error != cudaSuccess) { \ -// fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ -// exit(EXIT_FAILURE); \ -// } \ -// } - namespace o2 { namespace benchmark { namespace gpu { -// Kernels go here -/* - * Square each element in the array A and write to array C. - */ -// template -// __global__ void -// vector_square(T* C_d, T* A_d, size_t N) -// { -// size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); -// size_t stride = blockDim.x * gridDim.x; -// for (size_t i = offset; i < N; i += stride) { -// C_d[i] = A_d[i] * A_d[i]; -// } -// } +/////////////////// +/// Kernels go here template GPUg() void readerKernel( + size_t Ntimes, + buffer_type* result, buffer_type* buffer, size_t bufferSize) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { + buffer_type result{0}; + result += buffer[i]; } } +/////////////////// + } // namespace gpu void printDeviceProp(int deviceId) @@ -191,12 +176,13 @@ template float GPUbenchmark::measure(void (GPUbenchmark::*task)(T...), const char* taskName, T&&... 
args) { float diff{0.f}; + std::cout << std::setw(2) << ">>> " << taskName; auto start = std::chrono::high_resolution_clock::now(); (this->*task)(std::forward(args)...); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff_t{end - start}; diff = diff_t.count(); - std::cout << std::setw(2) << ">>> " << taskName << " completed in: " << diff << " ms" << std::endl; + std::cout << std::setw(2) << " completed in: \x1B[32m" << diff << " ms\x1B[0m" << std::endl; return diff; } @@ -213,41 +199,71 @@ void GPUbenchmark::printDevices() } template -void GPUbenchmark::init(const int deviceId) +void GPUbenchmark::generalInit(const int deviceId) { cudaDeviceProp props; size_t free; - // Fetch and store traits + // Fetch and store features GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; - mState.allocatedMemory = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); - - // Setup - GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.allocatedMemory)); - - mState.computeBufferPointers(); - - for (size_t iAddr{0}; iAddr < mState.getBuffersPointers().size(); ++iAddr) { + mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; + mState.scratchSize = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); + std::cout << ">>> Running on: " << props.name << std::endl; + // Allocate scratch on GPU + GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); + mState.computeScratchPtrs(); + + // Initialize corresponding buffers on host and copy content on GPU + mState.getHostBuffers().resize(mState.getScratchPtrs().size()); + for (size_t iScratchPart{0}; iScratchPart < mState.getScratchPtrs().size(); ++iScratchPart) { + mState.getHostBuffers()[iScratchPart].resize(gpuState::getArraySize()); + GPUCHECK(cudaMemcpy(mState.getScratchPtrs()[iScratchPart], mState.getHostBuffers()[iScratchPart].data(), gpuState::getArraySize() * sizeof(buffer_type), cudaMemcpyHostToDevice)); + } + std::cout << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) + << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" + << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << PARTITION_SIZE_GB << "GB each\n" + << " ├ Size of arrays in segments: " << gpuState::getArraySize() << " elements" << std::endl + << " └ Memory buffers copied from host to device" + << std::endl; +} +template +void GPUbenchmark::readingInit() +{ + mState.deviceReadingResultsPtrs.resize(mState.getMaxSegments()); + mState.hostReadingResultsVector.resize(mState.getMaxSegments()); + for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtrs[iScratchPart])), sizeof(buffer_type))); } +} +template +void GPUbenchmark::readingBenchmark() +{ + dim3 nBlocks{static_cast(mState.nMultiprocessors / mState.getMaxSegments())}; + dim3 nThreads{static_cast(std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock))}; + for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { + gpu::readerKernel<<>>(1, mState.deviceReadingResultsPtrs[iScratchPart], mState.getScratchPtrs()[iScratchPart], gpuState::getArraySize()); + } + GPUCHECK(cudaDeviceSynchronize()); } 
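The reading kernel above implements the first access pattern sketched in the design notes dropped from Kernels.h: a plain multithreaded linear read whose accumulator must be written out so the compiler cannot discard the loads. For reference, a minimal, self-contained CUDA sketch of that pattern, together with the LCG-driven random-access variant the notes also call for, could look as follows; the kernel names, the results layout and the LCG constants are illustrative assumptions and not part of these patches.

// Hypothetical sketch of the linear and random read patterns from the design notes.
// Not part of the patch series; names and constants are illustrative only.
#include <cstddef>
#include <cstdint>

template <class buffer_type>
__global__ void linearReadSketch(buffer_type* results, const buffer_type* partition, size_t partitionSize)
{
  buffer_type sum{0};
  // Standard grid-stride read over one scratch partition.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < partitionSize; i += blockDim.x * gridDim.x) {
    sum += partition[i];
  }
  // Write the accumulator to global memory so the reads cannot be optimized away.
  results[blockIdx.x * blockDim.x + threadIdx.x] = sum;
}

template <class buffer_type>
__global__ void randomReadSketch(buffer_type* results, const buffer_type* partition, size_t partitionSize, size_t nAccesses)
{
  // Simple per-thread linear congruential generator to pick the next address.
  uint32_t lcg = 1u + blockIdx.x * blockDim.x + threadIdx.x;
  buffer_type sum{0};
  for (size_t j = 0; j < nAccesses; ++j) {
    lcg = 1664525u * lcg + 1013904223u; // common LCG constants, assumed here
    sum += partition[lcg % partitionSize];
  }
  results[blockIdx.x * blockDim.x + threadIdx.x] = sum;
}

Both sketches expect results to hold one element per launched thread; the strided variant from the notes would simply index partition[i * stride] instead of partition[i].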
template -void GPUbenchmark::readingBenchmark() +void GPUbenchmark::readingFinalize() { - dim3 nBlocks(mState.nMultiprocessors); - dim3 nThreads(mState.nMaxThreadsPerBlock); - // gpu::readerKernel<<>>(); + for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { + GPUCHECK(cudaMemcpy(&mState.hostReadingResultsVector[iScratchPart], mState.deviceReadingResultsPtrs[iScratchPart], sizeof(buffer_type), cudaMemcpyDeviceToHost)); + std::cout << "result " << iScratchPart << ": " << mState.hostReadingResultsVector[iScratchPart] << std::endl; + } } template -void GPUbenchmark::finalize() +void GPUbenchmark::generalFinalize() { GPUCHECK(cudaFree(mState.scratchPtr)); } @@ -256,19 +272,16 @@ template void GPUbenchmark::run() { // printDevices(); - measure(&GPUbenchmark::init, "Init", 0); - std::cout << " ├ Allocated: " << mState.allocatedMemory << "/" << mState.totalMemory - << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; - std::cout << " ├ Can do: " << mState.getMaxSegments() << " segments of " << PARTITION_SIZE_GB << "GB each\n"; - std::cout << " └ Length of arrays in segments: " << mState.getArrayLength() << std::endl; - + generalInit(0); - // measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); - GPUbenchmark::finalize(); + readingInit(); + measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + GPUbenchmark::generalFinalize(); } template class GPUbenchmark; -template class GPUbenchmark; +// template class GPUbenchmark; +template class GPUbenchmark; template class GPUbenchmark; } // namespace benchmark From 377365589e2ad89cd283dcc811dc9cd8a9c039da Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 23 Jun 2021 14:21:34 +0200 Subject: [PATCH 18/42] Assign a block to each scratch segment --- GPU/GPUbenchmark/CMakeLists.txt | 4 +- GPU/GPUbenchmark/Shared/Common.h | 71 ++++++++++++++++++++++- GPU/GPUbenchmark/Shared/Kernels.h | 62 +++------------------ GPU/GPUbenchmark/benchmark.cxx | 55 +++++++++++++++--- GPU/GPUbenchmark/cuda/Kernels.cu | 93 +++++++++++++++++++------------ 5 files changed, 185 insertions(+), 100 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 71ba08edeb975..77e0e63509936 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -16,6 +16,7 @@ if(CUDA_ENABLED) SOURCES benchmark.cxx cuda/Kernels.cu PUBLIC_LINK_LIBRARIES O2::GPUCommon + Boost::program_options TARGETVARNAME targetName) endif() @@ -30,7 +31,7 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() @@ -46,6 +47,7 @@ if(HIP_ENABLED) hip/Kernels.hip.cxx PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host + Boost::program_options TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index b9af3be3ed966..74e7115b8623a 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -13,12 +13,11 @@ #ifndef GPU_BENCHMARK_COMMON_H #define GPU_BENCHMARK_COMMON_H -// #if defined(__HIPCC__) -// #include "hip/hip_runtime.h" -// #endif #include #include 
+#include +#include #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -29,6 +28,72 @@ #define KCYN "\x1B[36m" #define KWHT "\x1B[37m" +#define GB (1024 * 1024 * 1024) + +namespace o2 +{ +namespace benchmark +{ +struct benchmarkOpts { + benchmarkOpts() = default; + + float partitionSizeGB = 1.f; + float freeMemoryFractionToAllocate = 0.95f; +}; + +template +struct gpuState { + int getMaxSegments() + { + return (double)scratchSize / (partitionSizeGB * GB); + } + + void computeScratchPtrs() + { + partAddrOnHost.resize(getMaxSegments()); + for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { + partAddrOnHost[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partitionSizeGB) * iBuffAddress); + } + } + + size_t getPartitionCapacity() + { + return static_cast(GB * partitionSizeGB / sizeof(T)); + } + + std::vector getScratchPtrs() + { + return partAddrOnHost; + } + + std::vector>& getHostBuffers() + { + return gpuBuffersHost; + } + + // Configuration + size_t nMaxThreadsPerDimension; + float partitionSizeGB; // Size of each partition (GB) + + // General containers and state + T* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector partAddrOnHost; // Pointers to scratch partitions on host vector + std::vector> gpuBuffersHost; // Host-based vector-ized data + + // Test-specific containers + T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU + std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host + + // Static info + size_t totalMemory; + size_t nMultiprocessors; + size_t nMaxThreadsPerBlock; +}; + +} // namespace benchmark +} // namespace o2 + #define failed(...) \ printf("%serror: ", KRED); \ printf(__VA_ARGS__); \ diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 3c030cf9cae01..07c89836eabcc 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,74 +15,29 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" +#include "Common.h" #include #include #include #include -#define PARTITION_SIZE_GB 1 -#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f -#define GB 1073741824 +// #define PARTITION_SIZE_GB 1 +// #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f + namespace o2 { namespace benchmark { -template -struct gpuState { - int getMaxSegments() - { - return (double)scratchSize / (1024.0 * 1024.0 * 1024.0); - } - - void computeScratchPtrs() - { - addresses.resize(getMaxSegments()); - for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { - addresses[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + GB * iBuffAddress); - } - } - - static constexpr size_t getArraySize() - { - return static_cast(GB * PARTITION_SIZE_GB / sizeof(T)); - } - - std::vector getScratchPtrs() - { - return addresses; - } - - std::vector>& getHostBuffers() - { - return gpuBuffersHost; - } - - // General containers and state - T* scratchPtr; // Pointer to scratch buffer - size_t scratchSize; // Size of scratch area (B) - std::vector addresses; // Pointers to scratch partitions - std::vector> gpuBuffersHost; // Host-based vector-ized data - - // Test-specific containers - std::vector deviceReadingResultsPtrs; // Results of the reading test (single variable) on GPU - std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host - - // Configuration - size_t nMaxThreadsPerDimension; - - // Static info - 
size_t totalMemory; - size_t nMultiprocessors; - size_t nMaxThreadsPerBlock; -}; - template class GPUbenchmark final { public: - GPUbenchmark() = default; + GPUbenchmark() = delete; // need for a configuration + GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} { + + } virtual ~GPUbenchmark() = default; template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); @@ -102,6 +57,7 @@ class GPUbenchmark final private: gpuState mState; + benchmarkOpts mOptions; }; } // namespace benchmark diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 80d3d04595f3d..2824bdc8bd07f 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -9,18 +9,57 @@ // or submit itself to any jurisdiction. /// /// \file benchmark.cxx -/// \author: mconcas@cern.ch - +/// \author mconcas@cern.ch +/// \brief configuration widely inspired/copied by SimConfig #include "Shared/Kernels.h" -int main() +bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) +{ + namespace bpo = boost::program_options; + bpo::variables_map vm; + bpo::options_description options("Benchmark options"); + options.add_options()( + "help,h", "Print help message.")( + "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( + "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f)."); + try { + bpo::store(parse_command_line(argc, argv, options), vm); + if (vm.count("help")) { + std::cout << options << std::endl; + return false; + } + + bpo::notify(vm); + } catch (const bpo::error& e) { + std::cerr << e.what() << "\n\n"; + std::cerr << "Error parsing command line arguments. Available options:\n"; + + std::cerr << options << std::endl; + return false; + } + + conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); + conf.partitionSizeGB = vm["chunkSize"].as(); + + return true; +} + +int main(int argc, const char* argv[]) { - // o2::benchmark::GPUbenchmark bm_char{}; + + o2::benchmark::benchmarkOpts opts; + if (argc > 1) { + if (!parseArgs(opts, argc, argv)) { + return -1; + } + } + + // o2::benchmark::GPUbenchmark bm_char{opts}; // bm_char.run(); - o2::benchmark::GPUbenchmark bm_size_t{}; - bm_size_t.run(); - // o2::benchmark::GPUbenchmark bm_int{}; - // bm_int.run(); + // o2::benchmark::GPUbenchmark bm_size_t{opts}; + // bm_size_t.run(); + o2::benchmark::GPUbenchmark bm_int{opts}; + bm_int.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index b18d926e42b4e..d0c7c59b86495 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -12,12 +12,8 @@ /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" -#include "../Shared/Common.h" #include -double bytesToKB(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } - #define GPUCHECK(error) \ if (error != cudaSuccess) { \ printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ @@ -25,6 +21,9 @@ double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } failed("API returned error code."); \ } +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / GB; } + namespace o2 { namespace benchmark @@ -33,24 +32,52 @@ namespace gpu { /////////////////// -/// Kernels go here +/// Kernels and device functions go here +template +GPUhd() buffer_type* 
getPartPtrOnScratch(buffer_type* scratchPtr, float partSizeGB, size_t partNumber) +{ + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partSizeGB) * partNumber); +} template GPUg() void readerKernel( - size_t Ntimes, - buffer_type* result, - buffer_type* buffer, - size_t bufferSize) + buffer_type* results, + buffer_type* scratch, + size_t iterations, + size_t bufferSize, + float partitionSize = 1.f) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { - buffer_type result{0}; - result += buffer[i]; + for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { + buffer_type tmpResult{0}; + for (size_t j{0}; j < iterations; ++j) { + tmpResult += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; + } + results[blockIdx.x] += tmpResult; // FIXME: do something with data w/o data racing condition (avoid compiler optimizations) + // atomicAdd(reinterpret_cast(&(results[blockIdx.x])), tmpResult); // Does not work in CUDA } } /////////////////// } // namespace gpu +template +char* getType() +{ + if (typeid(T).name() == typeid(char).name()) { + return const_cast("\e[1mchar\e[0m"); + } + if (typeid(T).name() == typeid(size_t).name()) { + return const_cast("\e[1msize_t\e[0m"); + } + if (typeid(T).name() == typeid(int).name()) { + return const_cast("\e[1mint\e[0m"); + } + if (typeid(T).name() == typeid(int4).name()) { + return const_cast("\e[1mint4\e[0m"); + } + return const_cast("\e[1m unknown\e[0m"); +} + void printDeviceProp(int deviceId) { const int w1 = 34; @@ -208,25 +235,24 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); + mState.partitionSizeGB = mOptions.partitionSizeGB; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; - mState.scratchSize = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); - std::cout << ">>> Running on: " << props.name << std::endl; + mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); + std::cout << ">>> Running benchmark on : " << props.name << std::endl; + // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); + mState.computeScratchPtrs(); + GPUCHECK(cudaMemset(mState.scratchPtr, 1, mState.scratchSize)) - // Initialize corresponding buffers on host and copy content on GPU - mState.getHostBuffers().resize(mState.getScratchPtrs().size()); - for (size_t iScratchPart{0}; iScratchPart < mState.getScratchPtrs().size(); ++iScratchPart) { - mState.getHostBuffers()[iScratchPart].resize(gpuState::getArraySize()); - GPUCHECK(cudaMemcpy(mState.getScratchPtrs()[iScratchPart], mState.getHostBuffers()[iScratchPart].data(), gpuState::getArraySize() * sizeof(buffer_type), cudaMemcpyHostToDevice)); - } - std::cout << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) + std::cout << " ├ Buffer type: " << getType() << std::endl + << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" - << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << PARTITION_SIZE_GB << "GB each\n" - << " ├ Size 
of arrays in segments: " << gpuState::getArraySize() << " elements" << std::endl + << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" + << " ├ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl << " └ Memory buffers copied from host to device" << std::endl; } @@ -234,31 +260,27 @@ void GPUbenchmark::generalInit(const int deviceId) template void GPUbenchmark::readingInit() { - mState.deviceReadingResultsPtrs.resize(mState.getMaxSegments()); mState.hostReadingResultsVector.resize(mState.getMaxSegments()); - for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { - GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtrs[iScratchPart])), sizeof(buffer_type))); - } + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template void GPUbenchmark::readingBenchmark() { - dim3 nBlocks{static_cast(mState.nMultiprocessors / mState.getMaxSegments())}; - dim3 nThreads{static_cast(std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock))}; + auto nBlocks{mState.getMaxSegments()}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { - gpu::readerKernel<<>>(1, mState.deviceReadingResultsPtrs[iScratchPart], mState.getScratchPtrs()[iScratchPart], gpuState::getArraySize()); - } + gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1000, mState.getPartitionCapacity(), mState.partitionSizeGB); GPUCHECK(cudaDeviceSynchronize()); } template void GPUbenchmark::readingFinalize() { - for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { - GPUCHECK(cudaMemcpy(&mState.hostReadingResultsVector[iScratchPart], mState.deviceReadingResultsPtrs[iScratchPart], sizeof(buffer_type), cudaMemcpyDeviceToHost)); - std::cout << "result " << iScratchPart << ": " << mState.hostReadingResultsVector[iScratchPart] << std::endl; + + GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); + for (auto r : mState.hostReadingResultsVector) { + std::cout << "Result " << r << std::endl; } } @@ -276,6 +298,7 @@ void GPUbenchmark::run() readingInit(); measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + readingFinalize(); GPUbenchmark::generalFinalize(); } From 8933ff1a073e90ef42b3fa871652918301be8a8d Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 23 Jun 2021 14:25:19 +0200 Subject: [PATCH 19/42] Fix copyright --- GPU/GPUbenchmark/CMakeLists.txt | 13 +++++++------ GPU/GPUbenchmark/Shared/Common.h | 9 +++++---- GPU/GPUbenchmark/Shared/Kernels.h | 9 +++++---- GPU/GPUbenchmark/benchmark.cxx | 9 +++++---- GPU/GPUbenchmark/cuda/Kernels.cu | 9 +++++---- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 77e0e63509936..5b821a09d4fd7 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -1,12 +1,13 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. # -# See http://alice-o2.web.cern.ch/license for full licensing information. +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". # # In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. set(HDRS_INSTALL ../Shared/Kernels.h) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 74e7115b8623a..269847f7b483c 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". // // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 07c89836eabcc..0325e13fd3782 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". // // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 2824bdc8bd07f..6bdbd6c7e1237 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
// // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index d0c7c59b86495..0743bda8ee616 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". // // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization From fd8041b6a42421be7ccdd95d87111242a8d1c164 Mon Sep 17 00:00:00 2001 From: ALICE Builder Date: Wed, 23 Jun 2021 14:33:15 +0200 Subject: [PATCH 20/42] Please consider the following formatting changes (#16) --- GPU/GPUbenchmark/Shared/Common.h | 2 +- GPU/GPUbenchmark/Shared/Kernels.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 269847f7b483c..5acc4eca56d62 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -83,7 +83,7 @@ struct gpuState { std::vector> gpuBuffersHost; // Host-based vector-ized data // Test-specific containers - T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU + T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host // Static info diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 0325e13fd3782..54b4a057bc53b 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -25,7 +25,6 @@ // #define PARTITION_SIZE_GB 1 // #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f - namespace o2 { namespace benchmark @@ -36,8 +35,8 @@ class GPUbenchmark final { public: GPUbenchmark() = delete; // need for a configuration - GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} { - + GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} + { } virtual ~GPUbenchmark() = default; template From f80c6843d02731ea212d0fdabbad973b46161f80 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 23 Jun 2021 17:39:23 +0200 Subject: [PATCH 21/42] Set configurable iterations --- GPU/GPUbenchmark/Shared/Common.h | 5 +++++ GPU/GPUbenchmark/Shared/Kernels.h | 2 +- GPU/GPUbenchmark/benchmark.cxx | 12 +++++++----- GPU/GPUbenchmark/cuda/Kernels.cu | 15 +++++++++------ 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 5acc4eca56d62..33067d6f4b8c0 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -40,6 +40,7 @@ struct benchmarkOpts { float partitionSizeGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; + size_t iterations = 1; }; template @@ -72,8 +73,12 @@ struct gpuState { return gpuBuffersHost; } 
+ size_t getNiterations() { return iterations; } + // Configuration size_t nMaxThreadsPerDimension; + size_t iterations; + float partitionSizeGB; // Size of each partition (GB) // General containers and state diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 54b4a057bc53b..8d77dd9feabac 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -53,7 +53,7 @@ class GPUbenchmark final void readingFinalize(); // Benchmark kernel callbacks - void readingBenchmark(); + void readingBenchmark(size_t iterations); private: gpuState mState; diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 6bdbd6c7e1237..8455b1921eb0b 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -22,7 +22,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) options.add_options()( "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( - "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f)."); + "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( + "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -41,6 +42,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); + conf.iterations = vm["iterations"].as(); return true; } @@ -55,10 +57,10 @@ int main(int argc, const char* argv[]) } } - // o2::benchmark::GPUbenchmark bm_char{opts}; - // bm_char.run(); - // o2::benchmark::GPUbenchmark bm_size_t{opts}; - // bm_size_t.run(); + o2::benchmark::GPUbenchmark bm_char{opts}; + bm_char.run(); + o2::benchmark::GPUbenchmark bm_size_t{opts}; + bm_size_t.run(); o2::benchmark::GPUbenchmark bm_int{opts}; bm_int.run(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 0743bda8ee616..326f993999ae8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -237,11 +237,12 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.partitionSizeGB = mOptions.partitionSizeGB; + mState.iterations = mOptions.iterations; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); - std::cout << ">>> Running benchmark on : " << props.name << std::endl; + std::cout << ">>> Running on : " << props.name << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); @@ -261,17 +262,19 @@ void GPUbenchmark::generalInit(const int deviceId) template void GPUbenchmark::readingInit() { + std::cout << ">>> Initializing reading benchmark with \e[1m" << mState.iterations << "\e[0m iterations." 
<< std::endl; mState.hostReadingResultsVector.resize(mState.getMaxSegments()); GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template -void GPUbenchmark::readingBenchmark() +void GPUbenchmark::readingBenchmark(size_t iterations) { auto nBlocks{mState.getMaxSegments()}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - - gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1000, mState.getPartitionCapacity(), mState.partitionSizeGB); + for (auto iteration{iterations}; iteration--;) { + gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1, mState.getPartitionCapacity(), mState.partitionSizeGB); + } GPUCHECK(cudaDeviceSynchronize()); } @@ -281,7 +284,7 @@ void GPUbenchmark::readingFinalize() GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); for (auto r : mState.hostReadingResultsVector) { - std::cout << "Result " << r << std::endl; + // std::cout << "Result " << r << std::endl; } } @@ -298,7 +301,7 @@ void GPUbenchmark::run() generalInit(0); readingInit(); - measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); readingFinalize(); GPUbenchmark::generalFinalize(); } From 3896a71aa13394f6d1b61b882d353865f5ff3819 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 24 Jun 2021 18:06:57 +0200 Subject: [PATCH 22/42] Improve busy fucntion + streaming results on file --- GPU/GPUbenchmark/CMakeLists.txt | 2 + GPU/GPUbenchmark/Shared/Kernels.h | 6 +- GPU/GPUbenchmark/Shared/{Common.h => Utils.h} | 38 ++++++++++++- GPU/GPUbenchmark/benchmark.cxx | 14 +++-- GPU/GPUbenchmark/cuda/Kernels.cu | 57 +++++++++---------- 5 files changed, 77 insertions(+), 40 deletions(-) rename GPU/GPUbenchmark/Shared/{Common.h => Utils.h} (74%) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 5b821a09d4fd7..9151acc8bc478 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -18,6 +18,7 @@ if(CUDA_ENABLED) cuda/Kernels.cu PUBLIC_LINK_LIBRARIES O2::GPUCommon Boost::program_options + O2::CommonUtils TARGETVARNAME targetName) endif() @@ -49,6 +50,7 @@ if(HIP_ENABLED) PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host Boost::program_options + O2::CommonUtils TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 8d77dd9feabac..66ae5aa05e176 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -16,10 +16,11 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" -#include "Common.h" +#include "Utils.h" #include #include #include +#include #include // #define PARTITION_SIZE_GB 1 @@ -35,7 +36,7 @@ class GPUbenchmark final { public: GPUbenchmark() = delete; // need for a configuration - GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} + GPUbenchmark(benchmarkOpts& opts, std::shared_ptr streamer) : mStreamer{streamer}, mOptions{opts} { } virtual ~GPUbenchmark() = default; @@ -57,6 +58,7 @@ class GPUbenchmark final private: gpuState mState; + std::shared_ptr mStreamer; benchmarkOpts mOptions; }; diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Utils.h similarity index 74% rename from GPU/GPUbenchmark/Shared/Common.h rename to GPU/GPUbenchmark/Shared/Utils.h index 
33067d6f4b8c0..0fc1d9ef24d6f 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -12,13 +12,14 @@ /// \file Common.h /// \author: mconcas@cern.ch -#ifndef GPU_BENCHMARK_COMMON_H -#define GPU_BENCHMARK_COMMON_H +#ifndef GPU_BENCHMARK_UTILS_H +#define GPU_BENCHMARK_UTILS_H #include #include #include #include +#include "CommonUtils/TreeStreamRedirector.h" #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -97,6 +98,39 @@ struct gpuState { size_t nMaxThreadsPerBlock; }; +// Interface class to stream results to root file +class ResultStreamer +{ + public: + explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); + ~ResultStreamer(); + void storeBenchmarkEntry(std::string benchmarkName, float entry); + + private: + std::string mDebugTreeFileName = "benchmark_results.root"; // output filename + o2::utils::TreeStreamRedirector* mTreeStream; // observer +}; + +inline ResultStreamer::ResultStreamer(const std::string debugTreeFileName) +{ + mDebugTreeFileName = debugTreeFileName; + mTreeStream = new o2::utils::TreeStreamRedirector(debugTreeFileName.data(), "recreate"); +} + +inline ResultStreamer::~ResultStreamer() +{ + delete mTreeStream; +} + +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, float entry) +{ + (*mTreeStream) + << "Benchmarks" + << benchmarkName.data() + << "elapsed=" << entry + << "\n"; +} + } // namespace benchmark } // namespace o2 diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 8455b1921eb0b..4e71cb5c1e7d6 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -21,9 +21,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) bpo::options_description options("Benchmark options"); options.add_options()( "help,h", "Print help message.")( - "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( - "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); + "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")("freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")("iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -47,6 +45,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) return true; } +using o2::benchmark::ResultStreamer; + int main(int argc, const char* argv[]) { @@ -57,11 +57,13 @@ int main(int argc, const char* argv[]) } } - o2::benchmark::GPUbenchmark bm_char{opts}; + std::shared_ptr streamer = std::make_shared(); + + o2::benchmark::GPUbenchmark bm_char{opts, streamer}; bm_char.run(); - o2::benchmark::GPUbenchmark bm_size_t{opts}; + o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; bm_size_t.run(); - o2::benchmark::GPUbenchmark bm_int{opts}; + o2::benchmark::GPUbenchmark bm_int{opts, streamer}; bm_int.run(); return 0; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 326f993999ae8..5fce52d6a445e 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -25,6 +25,24 @@ double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / 
GB; } +template +char* getType() +{ + if (typeid(T).name() == typeid(char).name()) { + return const_cast("\e[1mchar\e[0m"); + } + if (typeid(T).name() == typeid(size_t).name()) { + return const_cast("\e[1msize_t\e[0m"); + } + if (typeid(T).name() == typeid(int).name()) { + return const_cast("\e[1mint\e[0m"); + } + if (typeid(T).name() == typeid(int4).name()) { + return const_cast("\e[1mint4\e[0m"); + } + return const_cast("\e[1m unknown\e[0m"); +} + namespace o2 { namespace benchmark @@ -44,41 +62,22 @@ template GPUg() void readerKernel( buffer_type* results, buffer_type* scratch, - size_t iterations, + size_t innerIterations, size_t bufferSize, float partitionSize = 1.f) { for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - buffer_type tmpResult{0}; - for (size_t j{0}; j < iterations; ++j) { - tmpResult += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; + for (size_t j{0}; j < innerIterations; ++j) { + if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen threads and should be always in sync + } } - results[blockIdx.x] += tmpResult; // FIXME: do something with data w/o data racing condition (avoid compiler optimizations) - // atomicAdd(reinterpret_cast(&(results[blockIdx.x])), tmpResult); // Does not work in CUDA } } /////////////////// } // namespace gpu -template -char* getType() -{ - if (typeid(T).name() == typeid(char).name()) { - return const_cast("\e[1mchar\e[0m"); - } - if (typeid(T).name() == typeid(size_t).name()) { - return const_cast("\e[1msize_t\e[0m"); - } - if (typeid(T).name() == typeid(int).name()) { - return const_cast("\e[1mint\e[0m"); - } - if (typeid(T).name() == typeid(int4).name()) { - return const_cast("\e[1mint4\e[0m"); - } - return const_cast("\e[1m unknown\e[0m"); -} - void printDeviceProp(int deviceId) { const int w1 = 34; @@ -242,13 +241,13 @@ void GPUbenchmark::generalInit(const int deviceId) mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); - std::cout << ">>> Running on : " << props.name << std::endl; + std::cout << ">>> Running on: \e[1m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); mState.computeScratchPtrs(); - GPUCHECK(cudaMemset(mState.scratchPtr, 1, mState.scratchSize)) + GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) std::cout << " ├ Buffer type: " << getType() << std::endl << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) @@ -283,9 +282,6 @@ void GPUbenchmark::readingFinalize() { GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); - for (auto r : mState.hostReadingResultsVector) { - // std::cout << "Result " << r << std::endl; - } } template @@ -301,7 +297,8 @@ void GPUbenchmark::run() generalInit(0); readingInit(); - measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); + auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); + mStreamer.get()->storeBenchmarkEntry("readingBenchmark", result); readingFinalize(); GPUbenchmark::generalFinalize(); } 
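At this point measure() returns the wall-clock time of a full set of kernel launches and the ResultStreamer persists it to a ROOT file. A figure one would typically derive from those entries offline is the effective read throughput; a minimal sketch of that post-processing, under the assumption that every kernel launch reads every scratch partition exactly once, could be (the helper name and the assumption are not part of the patches):

// Illustrative helper, not part of the patch series.
// Assumes each of the 'kernelLaunches' launches reads all 'nPartitions' partitions once.
double effectiveReadThroughputGBs(float elapsedMs, float partitionSizeGB, int nPartitions, int kernelLaunches)
{
  const double dataReadGB = static_cast<double>(partitionSizeGB) * nPartitions * kernelLaunches;
  return dataReadGB / (elapsedMs / 1000.0); // GB per second
}

For example, reading 15 partitions of 1 GB each over 50 launches in 3000 ms would correspond to roughly 250 GB/s.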
From cf1276e001a6769afd2b1d02caee6c386429c0ae Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 25 Jun 2021 14:13:40 +0200 Subject: [PATCH 23/42] Fix bug in CLI params --- GPU/GPUbenchmark/Shared/Utils.h | 7 +++---- GPU/GPUbenchmark/benchmark.cxx | 17 +++++++++-------- GPU/GPUbenchmark/cuda/Kernels.cu | 21 +++++++++++---------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 0fc1d9ef24d6f..bd10aedfbf108 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -104,7 +104,7 @@ class ResultStreamer public: explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); - void storeBenchmarkEntry(std::string benchmarkName, float entry); + void storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -122,11 +122,10 @@ inline ResultStreamer::~ResultStreamer() delete mTreeStream; } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, float entry) +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry) { (*mTreeStream) - << "Benchmarks" - << benchmarkName.data() + << (benchmarkName + type).data() << "elapsed=" << entry << "\n"; } diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 4e71cb5c1e7d6..10cd63537282d 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -21,7 +21,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) bpo::options_description options("Benchmark options"); options.add_options()( "help,h", "Print help message.")( - "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")("freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")("iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); + "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( + "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( + "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -40,7 +42,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); - conf.iterations = vm["iterations"].as(); + conf.iterations = vm["iterations"].as(); return true; } @@ -51,20 +53,19 @@ int main(int argc, const char* argv[]) { o2::benchmark::benchmarkOpts opts; - if (argc > 1) { + if (!parseArgs(opts, argc, argv)) { return -1; } - } std::shared_ptr streamer = std::make_shared(); o2::benchmark::GPUbenchmark bm_char{opts, streamer}; bm_char.run(); - o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; - bm_size_t.run(); - o2::benchmark::GPUbenchmark bm_int{opts, streamer}; - bm_int.run(); + // o2::benchmark::GPUbenchmark bm_int{opts, streamer}; + // bm_int.run(); + // o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; + // bm_size_t.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5fce52d6a445e..17093307e2944 100644 --- 
a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -26,21 +26,21 @@ double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } template -char* getType() +std::string getType() { if (typeid(T).name() == typeid(char).name()) { - return const_cast("\e[1mchar\e[0m"); + return std::string{"char"}; } if (typeid(T).name() == typeid(size_t).name()) { - return const_cast("\e[1msize_t\e[0m"); + return std::string{"unsigned long"}; } if (typeid(T).name() == typeid(int).name()) { - return const_cast("\e[1mint\e[0m"); + return std::string{"int"}; } if (typeid(T).name() == typeid(int4).name()) { - return const_cast("\e[1mint4\e[0m"); + return std::string{"int4"}; } - return const_cast("\e[1m unknown\e[0m"); + return std::string{"unknown"}; } namespace o2 @@ -249,7 +249,7 @@ void GPUbenchmark::generalInit(const int deviceId) mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) - std::cout << " ├ Buffer type: " << getType() << std::endl + std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" @@ -293,13 +293,14 @@ void GPUbenchmark::generalFinalize() template void GPUbenchmark::run() { - // printDevices(); generalInit(0); - + // Test calls go here + // - Reading readingInit(); auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); - mStreamer.get()->storeBenchmarkEntry("readingBenchmark", result); + mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); readingFinalize(); + GPUbenchmark::generalFinalize(); } From 2b7e27ec668869d23837894b1eaa1a1098563498 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 25 Jun 2021 17:54:43 +0200 Subject: [PATCH 24/42] Add configurable number of tests --- GPU/GPUbenchmark/Shared/Utils.h | 3 ++- GPU/GPUbenchmark/benchmark.cxx | 7 +++++-- GPU/GPUbenchmark/cuda/Kernels.cu | 26 +++++++++++++------------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index bd10aedfbf108..6ea03f2971052 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -41,7 +41,8 @@ struct benchmarkOpts { float partitionSizeGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; - size_t iterations = 1; + size_t kernelLaunches = 1; + size_t nTests = 1; }; template diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 10cd63537282d..ab108aa839acc 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -23,7 +23,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); + "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( + "ntests,n", bpo::value()->default_value(1), 
"Number of times each test is run." + ); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -42,7 +44,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); - conf.iterations = vm["iterations"].as(); + conf.kernelLaunches = vm["launches"].as(); + conf.nTests = vm["ntests"].as(); return true; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 17093307e2944..ab175b91fe8e8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -62,14 +62,14 @@ template GPUg() void readerKernel( buffer_type* results, buffer_type* scratch, - size_t innerIterations, size_t bufferSize, - float partitionSize = 1.f) + float partitionSize = 1.f, + size_t innerLoops = 1) { for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - for (size_t j{0}; j < innerIterations; ++j) { + for (size_t j{0}; j < innerLoops; ++j) { if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen threads and should be always in sync + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync } } } @@ -236,7 +236,6 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.partitionSizeGB = mOptions.partitionSizeGB; - mState.iterations = mOptions.iterations; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -253,26 +252,25 @@ void GPUbenchmark::generalInit(const int deviceId) << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" - << " ├ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl - << " └ Memory buffers copied from host to device" + << " └ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl << std::endl; } template void GPUbenchmark::readingInit() { - std::cout << ">>> Initializing reading benchmark with \e[1m" << mState.iterations << "\e[0m iterations." 
<< std::endl; + std::cout << ">>> Initializing reading benchmark with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; mState.hostReadingResultsVector.resize(mState.getMaxSegments()); GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template -void GPUbenchmark::readingBenchmark(size_t iterations) +void GPUbenchmark::readingBenchmark(size_t kernelLaunches) { auto nBlocks{mState.getMaxSegments()}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (auto iteration{iterations}; iteration--;) { - gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1, mState.getPartitionCapacity(), mState.partitionSizeGB); + for (auto launch{kernelLaunches}; launch--;) { + gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, mState.getPartitionCapacity(), mState.partitionSizeGB); } GPUCHECK(cudaDeviceSynchronize()); } @@ -297,8 +295,10 @@ void GPUbenchmark::run() // Test calls go here // - Reading readingInit(); - auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); - mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); + for (auto measures{mOptions.nTests}; measures--;) { + auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); + mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); + } readingFinalize(); GPUbenchmark::generalFinalize(); From 134b31db1f796a7a2d00204943aacbf1684e45d5 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 25 Jun 2021 18:29:44 +0200 Subject: [PATCH 25/42] Fix undefined behaviour insetting nLaunches --- GPU/GPUbenchmark/Shared/Kernels.h | 2 +- GPU/GPUbenchmark/Shared/Utils.h | 8 ++++---- GPU/GPUbenchmark/benchmark.cxx | 8 ++++---- GPU/GPUbenchmark/cuda/Kernels.cu | 15 ++++++--------- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 66ae5aa05e176..5b872e521a173 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -54,7 +54,7 @@ class GPUbenchmark final void readingFinalize(); // Benchmark kernel callbacks - void readingBenchmark(size_t iterations); + void readingBenchmark(int iterations); private: gpuState mState; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 6ea03f2971052..8551419efd4ff 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -41,8 +41,8 @@ struct benchmarkOpts { float partitionSizeGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; - size_t kernelLaunches = 1; - size_t nTests = 1; + int kernelLaunches = 1; + int nTests = 1; }; template @@ -75,11 +75,11 @@ struct gpuState { return gpuBuffersHost; } - size_t getNiterations() { return iterations; } + int getNiterations() { return iterations; } // Configuration size_t nMaxThreadsPerDimension; - size_t iterations; + int iterations; float partitionSizeGB; // Size of each partition (GB) diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index ab108aa839acc..ad4a9fe88cddb 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -23,8 +23,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of 
scratch partitions (GB).")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( - "ntests,n", bpo::value()->default_value(1), "Number of times each test is run." + "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( + "ntests,n", bpo::value()->default_value(1), "Number of times each test is run." ); try { bpo::store(parse_command_line(argc, argv, options), vm); @@ -44,8 +44,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); - conf.kernelLaunches = vm["launches"].as(); - conf.nTests = vm["ntests"].as(); + conf.kernelLaunches = vm["launches"].as(); + conf.nTests = vm["ntests"].as(); return true; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index ab175b91fe8e8..b8f0ac6691e9f 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -63,14 +63,11 @@ GPUg() void readerKernel( buffer_type* results, buffer_type* scratch, size_t bufferSize, - float partitionSize = 1.f, - size_t innerLoops = 1) + float partitionSize = 1.f) { for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - for (size_t j{0}; j < innerLoops; ++j) { - if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync - } + if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync } } } @@ -236,6 +233,7 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.partitionSizeGB = mOptions.partitionSizeGB; + mState.iterations = mOptions.kernelLaunches; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -265,11 +263,11 @@ void GPUbenchmark::readingInit() } template -void GPUbenchmark::readingBenchmark(size_t kernelLaunches) +void GPUbenchmark::readingBenchmark(int kernelLaunches) { auto nBlocks{mState.getMaxSegments()}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (auto launch{kernelLaunches}; launch--;) { + for (auto launch{0}; launch < kernelLaunches; ++launch) { gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, mState.getPartitionCapacity(), mState.partitionSizeGB); } GPUCHECK(cudaDeviceSynchronize()); @@ -278,7 +276,6 @@ void GPUbenchmark::readingBenchmark(size_t kernelLaunches) template void GPUbenchmark::readingFinalize() { - GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); } From 3e961257777cd7f1ddadc93cad9a50069536d103 Mon Sep 17 00:00:00 2001 From: ALICE Builder Date: Fri, 25 Jun 2021 18:32:20 +0200 Subject: [PATCH 26/42] Please consider the following formatting changes (#17) --- GPU/GPUbenchmark/benchmark.cxx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/GPU/GPUbenchmark/benchmark.cxx 
b/GPU/GPUbenchmark/benchmark.cxx index ad4a9fe88cddb..0abd0d52be1ba 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -24,8 +24,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( - "ntests,n", bpo::value()->default_value(1), "Number of times each test is run." - ); + "ntests,n", bpo::value()->default_value(1), "Number of times each test is run."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -57,9 +56,9 @@ int main(int argc, const char* argv[]) o2::benchmark::benchmarkOpts opts; - if (!parseArgs(opts, argc, argv)) { - return -1; - } + if (!parseArgs(opts, argc, argv)) { + return -1; + } std::shared_ptr streamer = std::make_shared(); From d928a4cb7de0db7d81de82b82ed79d511e076ade Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 30 Jun 2021 12:06:49 +0200 Subject: [PATCH 27/42] Streamline ker benchmarking w/ events --- GPU/GPUbenchmark/CMakeLists.txt | 2 +- GPU/GPUbenchmark/Shared/Kernels.h | 17 +++- GPU/GPUbenchmark/Shared/Utils.h | 14 ++- GPU/GPUbenchmark/cuda/Kernels.cu | 159 +++++++++++++++++++++++++----- 4 files changed, 160 insertions(+), 32 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 9151acc8bc478..9ed33e179cc84 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -33,7 +33,7 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 5b872e521a173..09272d37d8ccd 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -43,18 +43,25 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); + template + float benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args); + + template + std::vector benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nSplits, int nLaunches, int blocks, int threads, T&... args); + // Main interface - void generalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters - void run(); // Execute all specified callbacks - void generalFinalize(); // Cleanup - void printDevices(); // Dump info + void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters + void run(); // Execute all specified callbacks + void globalFinalize(); // Cleanup + void printDevices(); // Dump info // Initializations/Finalizations of tests. 
Not to be measured, in principle used for report void readingInit(); void readingFinalize(); // Benchmark kernel callbacks - void readingBenchmark(int iterations); + void readingSequential(SplitLevel sl); + void readingConcurrent(SplitLevel sl); private: gpuState mState; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 8551419efd4ff..9682e69b2e5c3 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -36,6 +36,12 @@ namespace o2 { namespace benchmark { + +enum class SplitLevel { + Blocks, + Threads +}; + struct benchmarkOpts { benchmarkOpts() = default; @@ -75,7 +81,7 @@ struct gpuState { return gpuBuffersHost; } - int getNiterations() { return iterations; } + int getNKernelLaunches() { return iterations; } // Configuration size_t nMaxThreadsPerDimension; @@ -105,7 +111,7 @@ class ResultStreamer public: explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); - void storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry); + void storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -123,10 +129,10 @@ inline ResultStreamer::~ResultStreamer() delete mTreeStream; } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry) +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry) { (*mTreeStream) - << (benchmarkName + type).data() + << (benchmarkName + "_" + type + "_" + split).data() << "elapsed=" << entry << "\n"; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index b8f0ac6691e9f..1612437149c4a 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -9,10 +9,13 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
/// -/// \file Kernels.cu +/// \file Kernels.{cu, hip.cxx} /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" +#if defined(__HIPCC__) +#include "hip/hip_runtime.h" +#endif #include #define GPUCHECK(error) \ @@ -58,16 +61,41 @@ GPUhd() buffer_type* getPartPtrOnScratch(buffer_type* scratchPtr, float partSize return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partSizeGB) * partNumber); } +GPUhd() int getCorrespondingSplitId(int blockId, int nPartitions, int nSplits = 1) +{ + return blockId * nSplits / nPartitions; +} + +template +GPUg() void read_single_segment_k( + int segmentId, + buffer_type* results, + buffer_type* scratch, + size_t bufferSize, + float partitionSize = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { + if (getPartPtrOnScratch(scratch, partitionSize, segmentId)[i] == static_cast(1)) { + results[segmentId] += getPartPtrOnScratch(scratch, partitionSize, segmentId)[i]; // should never happen and threads should be always in sync + } + } +} + template -GPUg() void readerKernel( +GPUg() void split_read_k( + int split, // Id of split partition + int nsplits, + int npartitions, buffer_type* results, buffer_type* scratch, size_t bufferSize, float partitionSize = 1.f) { - for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync + if (split == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { + if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync + } } } } @@ -210,6 +238,55 @@ float GPUbenchmark::measure(void (GPUbenchmark::*task) return diff; } +template +template +float GPUbenchmark::benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args) +{ + cudaEvent_t start, stop; + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + + GPUCHECK(cudaEventRecord(start)); + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { + (*kernel)<<>>(args...); // Stream is 0 by default, so that we don't have to convert cudaStream_t it in HIP header + } + GPUCHECK(cudaEventRecord(stop)); + + GPUCHECK(cudaEventSynchronize(stop)); + float milliseconds{0.f}; + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + + return milliseconds; +} + +template +template +std::vector GPUbenchmark::benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nStreams, int nLaunches, int blocks, int threads, T&... 
args) +{ + std::vector splitStarts(nStreams), splitStops(nStreams); + std::vector streams(nStreams); + std::vector splitResults(nStreams); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); + GPUCHECK(cudaEventCreate(&(splitStarts[iStream]))); + GPUCHECK(cudaEventCreate(&(splitStops[iStream]))); + GPUCHECK(cudaEventRecord(splitStarts[iStream], streams[iStream])); + + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive lanuches on the same stream + (*kernel)<<>>(iStream, nStreams, args...); + } + GPUCHECK(cudaEventRecord(splitStops[iStream], streams[iStream])); + } + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventSynchronize(splitStops[iStream])); + GPUCHECK(cudaEventElapsedTime(&(splitResults.at(iStream)), splitStarts[iStream], splitStops[iStream])); + } + + return splitResults; +} + template void GPUbenchmark::printDevices() { @@ -223,7 +300,7 @@ void GPUbenchmark::printDevices() } template -void GPUbenchmark::generalInit(const int deviceId) +void GPUbenchmark::globalInit(const int deviceId) { cudaDeviceProp props; size_t free; @@ -257,30 +334,69 @@ void GPUbenchmark::generalInit(const int deviceId) template void GPUbenchmark::readingInit() { - std::cout << ">>> Initializing reading benchmark with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; mState.hostReadingResultsVector.resize(mState.getMaxSegments()); GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template -void GPUbenchmark::readingBenchmark(int kernelLaunches) +void GPUbenchmark::readingSequential(SplitLevel sl) { - auto nBlocks{mState.getMaxSegments()}; - auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (auto launch{0}; launch < kernelLaunches; ++launch) { - gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, mState.getPartitionCapacity(), mState.partitionSizeGB); + switch (sl) { + case SplitLevel::Blocks: + break; + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads"; + for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately + auto result = benchmarkSynchExecution(&gpu::read_single_segment_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iSegment), getType(), result); + } + } + break; + } + } + std::cout << " completed." 
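
[Editor's sketch] The timing scheme introduced in this patch wraps the whole launch loop between a single pair of CUDA events on the default stream, so the reported number is device time for all nLaunches launches rather than host wall-clock time as in the chrono-based measure() helper. A minimal standalone sketch of that pattern, with a hypothetical dummyKernel standing in for the benchmark kernels (devBuffer is assumed to hold at least blocks*threads ints):

#include <cuda_runtime.h>

__global__ void dummyKernel(int* out)
{
  out[blockIdx.x * blockDim.x + threadIdx.x] = 1;
}

// Time nLaunches back-to-back launches on the default stream with CUDA events.
float timeLaunchesMs(int nLaunches, int blocks, int threads, int* devBuffer)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start); // checkpoint enqueued before the first launch
  for (int i = 0; i < nLaunches; ++i) {
    dummyKernel<<<blocks, threads>>>(devBuffer);
  }
  cudaEventRecord(stop);      // checkpoint enqueued after the last launch
  cudaEventSynchronize(stop); // block the host until everything in between has executed

  float ms{0.f};
  cudaEventElapsedTime(&ms, start, stop); // elapsed device time in milliseconds
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}
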
<< std::endl; +} + +template +void GPUbenchmark::readingConcurrent(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto segments{mState.getMaxSegments()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurements{mOptions.nTests}; measurements--;) { + std::cout << std::setw(2) << ">>> Concurrent read benchmark, splitting on blocks"; + auto results = benchmarkAsynchExecution(&gpu::split_read_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); + } + } + break; + } + case SplitLevel::Threads: + break; } - GPUCHECK(cudaDeviceSynchronize()); + std::cout << " completed." << std::endl; } template void GPUbenchmark::readingFinalize() { GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceReadingResultsPtr)); } template -void GPUbenchmark::generalFinalize() +void GPUbenchmark::globalFinalize() { GPUCHECK(cudaFree(mState.scratchPtr)); } @@ -288,17 +404,16 @@ void GPUbenchmark::generalFinalize() template void GPUbenchmark::run() { - generalInit(0); - // Test calls go here - // - Reading + globalInit(0); + // Test calls go here: readingInit(); - for (auto measures{mOptions.nTests}; measures--;) { - auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); - mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); - } + // - Reading + readingSequential(SplitLevel::Threads); + // - Split reading + readingConcurrent(SplitLevel::Blocks); readingFinalize(); - GPUbenchmark::generalFinalize(); + GPUbenchmark::globalFinalize(); } template class GPUbenchmark; From bc8e75b7e321c1ddb81578e8155f5db1ee6c36e0 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 1 Jul 2021 15:21:07 +0200 Subject: [PATCH 28/42] Tidy up kernels and improve output --- GPU/GPUbenchmark/cuda/Kernels.cu | 86 +++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 1612437149c4a..5be6250c42825 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -67,22 +67,54 @@ GPUhd() int getCorrespondingSplitId(int blockId, int nPartitions, int nSplits = } template -GPUg() void read_single_segment_k( +GPUd() void read_segment_singleblock(size_t threadId, + buffer_type* scratch, + buffer_type* results, + size_t blockDim, + size_t bufferSize, + float partSizeGB, + size_t segmentId) +{ + for (size_t i = threadId; i < bufferSize; i += blockDim) { + if (getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i] == static_cast(1)) { // actual read operation is performed here + results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync + } + } +} + +template +GPUd() void read_segment_multiblock(size_t threadId, + size_t blockId, + buffer_type* scratch, + buffer_type* results, + size_t blockDim, + size_t gridDim, + size_t bufferSize, + float partSizeGB, + size_t 
segmentId) +{ + for (int i = blockId * blockDim + threadId; i < bufferSize; i += blockDim * gridDim) { + if (getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i] == static_cast(1)) { // actual read operation is performed here + results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync + } + } +} + +template +GPUg() void read_segment_singleblock_k( int segmentId, buffer_type* results, buffer_type* scratch, size_t bufferSize, float partitionSize = 1.f) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { - if (getPartPtrOnScratch(scratch, partitionSize, segmentId)[i] == static_cast(1)) { - results[segmentId] += getPartPtrOnScratch(scratch, partitionSize, segmentId)[i]; // should never happen and threads should be always in sync - } + if (segmentId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + read_segment_singleblock(threadIdx.x, scratch, results, blockDim.x, bufferSize, partitionSize, segmentId); } } template -GPUg() void split_read_k( +GPUg() void split_read_singleblock_k( int split, // Id of split partition int nsplits, int npartitions, @@ -99,6 +131,18 @@ GPUg() void split_read_k( } } } + +template +GPUg() void read_single_segment_multiblock_k( + int segmentId, + buffer_type* results, + buffer_type* scratch, + size_t bufferSize, + float partitionSize = 1.f) +{ + read_segment_multiblock(threadIdx.x, blockIdx.x, scratch, results, blockDim.x, gridDim.x, bufferSize, partitionSize, segmentId); +} + /////////////////// } // namespace gpu @@ -315,7 +359,7 @@ void GPUbenchmark::globalInit(const int deviceId) mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); - std::cout << ">>> Running on: \e[1m" << props.name << "\e[0m" << std::endl; + std::cout << ">>> Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); @@ -343,24 +387,38 @@ template void GPUbenchmark::readingSequential(SplitLevel sl) { switch (sl) { - case SplitLevel::Blocks: + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on blocks:"; + for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately + auto result = benchmarkSynchExecution(&gpu::read_segment_singleblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iSegment), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } break; + } + case SplitLevel::Threads: { auto nBlocks{mState.nMultiprocessors}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - 
std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads"; + std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads:"; for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately - auto result = benchmarkSynchExecution(&gpu::read_single_segment_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + auto result = benchmarkSynchExecution(&gpu::read_single_segment_multiblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iSegment), getType(), result); } + std::cout << "\033[1;32m complete\033[0m" << std::endl; } break; } } - std::cout << " completed." << std::endl; } template @@ -375,7 +433,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl) for (auto measurements{mOptions.nTests}; measurements--;) { std::cout << std::setw(2) << ">>> Concurrent read benchmark, splitting on blocks"; - auto results = benchmarkAsynchExecution(&gpu::split_read_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + auto results = benchmarkAsynchExecution(&gpu::split_read_singleblock_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); } @@ -409,8 +467,10 @@ void GPUbenchmark::run() readingInit(); // - Reading readingSequential(SplitLevel::Threads); + readingSequential(SplitLevel::Blocks); + // - Split reading - readingConcurrent(SplitLevel::Blocks); + // readingConcurrent(SplitLevel::Blocks); readingFinalize(); GPUbenchmark::globalFinalize(); From 71408e894498a162c7f9d2738db1d9c9a2dfaf8c Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 7 Jul 2021 09:47:39 +0200 Subject: [PATCH 29/42] Update read test --- GPU/GPUbenchmark/Shared/Kernels.h | 21 +- GPU/GPUbenchmark/Shared/Utils.h | 22 +- GPU/GPUbenchmark/benchmark.cxx | 2 +- GPU/GPUbenchmark/cuda/Kernels.cu | 323 ++++++++++++++++-------------- 4 files changed, 194 insertions(+), 174 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 09272d37d8ccd..2f6f06764d373 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -23,15 +23,12 @@ #include #include -// #define PARTITION_SIZE_GB 1 -// #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f - namespace o2 { namespace benchmark { -template +template class GPUbenchmark final { public: @@ -43,12 +40,20 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); + // Single stream synchronous (sequential kernels) execution template - float benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args); + float benchmarkSync(void (*kernel)(T...), + int nLaunches, int blocks, int threads, T&... 
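
[Editor's sketch] The two read strategies above differ only in how threads stride over a chunk: with SplitLevel::Blocks a single block walks the whole chunk with a blockDim.x stride, while with SplitLevel::Threads the entire grid shares one chunk through a grid-stride loop. The real kernels guard each read with a compare against 1 that never fires because the scratch buffer is memset to zero, which keeps the loads from being optimised away; the condensed sketch below (hypothetical names, int elements) uses an equivalent sink variable for the same purpose:

// One block reads the whole chunk: thread t touches t, t + blockDim.x, t + 2*blockDim.x, ...
__global__ void readBlockStride(const int* chunk, size_t n, int* sink)
{
  int acc = 0;
  for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
    acc += chunk[i];
  }
  if (acc == -1) {
    *sink = acc; // never taken for zero-initialised data; keeps the reads observable
  }
}

// The whole grid reads one chunk: classic grid-stride loop.
__global__ void readGridStride(const int* chunk, size_t n, int* sink)
{
  int acc = 0;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)blockDim.x * gridDim.x) {
    acc += chunk[i];
  }
  if (acc == -1) {
    *sink = acc;
  }
}
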
args); + // Multi-streams asynchronous executions on whole memory template - std::vector benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nSplits, int nLaunches, int blocks, int threads, T&... args); + std::vector benchmarkAsync(void (*kernel)(int, T...), + int nStreams, int nLaunches, int blocks, int threads, T&... args); + // Per-memory region benchmarking + template + std::vector benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), + int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... args); // Main interface void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks @@ -61,10 +66,10 @@ class GPUbenchmark final // Benchmark kernel callbacks void readingSequential(SplitLevel sl); - void readingConcurrent(SplitLevel sl); + void readingConcurrent(SplitLevel sl, int nRegions = 2); private: - gpuState mState; + gpuState mState; std::shared_ptr mStreamer; benchmarkOpts mOptions; }; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 9682e69b2e5c3..8f43647090a8b 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -45,7 +45,7 @@ enum class SplitLevel { struct benchmarkOpts { benchmarkOpts() = default; - float partitionSizeGB = 1.f; + float chunkReservedGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; int kernelLaunches = 1; int nTests = 1; @@ -53,22 +53,22 @@ struct benchmarkOpts { template struct gpuState { - int getMaxSegments() + int getMaxChunks() { - return (double)scratchSize / (partitionSizeGB * GB); + return (double)scratchSize / (chunkReservedGB * GB); } void computeScratchPtrs() { - partAddrOnHost.resize(getMaxSegments()); - for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { - partAddrOnHost[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partitionSizeGB) * iBuffAddress); + partAddrOnHost.resize(getMaxChunks()); + for (size_t iBuffAddress{0}; iBuffAddress < getMaxChunks(); ++iBuffAddress) { + partAddrOnHost[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * iBuffAddress); } } size_t getPartitionCapacity() { - return static_cast(GB * partitionSizeGB / sizeof(T)); + return static_cast(GB * chunkReservedGB / sizeof(T)); } std::vector getScratchPtrs() @@ -87,7 +87,7 @@ struct gpuState { size_t nMaxThreadsPerDimension; int iterations; - float partitionSizeGB; // Size of each partition (GB) + float chunkReservedGB; // Size of each partition (GB) // General containers and state T* scratchPtr; // Pointer to scratch buffer @@ -111,7 +111,7 @@ class ResultStreamer public: explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); - void storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry); + void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -129,10 +129,10 @@ inline ResultStreamer::~ResultStreamer() delete mTreeStream; } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry) +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) { (*mTreeStream) - << (benchmarkName + "_" + type + "_" + split).data() + << (benchmarkName + "_" + 
type + "_" + chunk).data() << "elapsed=" << entry << "\n"; } diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 0abd0d52be1ba..070f43cc94add 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -42,7 +42,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) } conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); - conf.partitionSizeGB = vm["chunkSize"].as(); + conf.chunkReservedGB = vm["chunkSize"].as(); conf.kernelLaunches = vm["launches"].as(); conf.nTests = vm["ntests"].as(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5be6250c42825..78e2cbb4c82ca 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -18,6 +18,12 @@ #endif #include +// Memory partition legend +// +// |----------------------region 0-----------------|----------------------region 1-----------------| regions -> deafult: 2, to test lower and upper RAM +// |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (single block pins) +// |__________________________________________scratch______________________________________________| scratch -> default size: 95% free GPU RAM + #define GPUCHECK(error) \ if (error != cudaSuccess) { \ printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ @@ -28,6 +34,11 @@ double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } +int getCorrespondingRegionId(int Id, int nChunks, int nRegions = 1) +{ + return Id * nRegions / nChunks; +} + template std::string getType() { @@ -53,98 +64,47 @@ namespace benchmark namespace gpu { -/////////////////// -/// Kernels and device functions go here -template -GPUhd() buffer_type* getPartPtrOnScratch(buffer_type* scratchPtr, float partSizeGB, size_t partNumber) +/////////////////////////// +// Device functions go here +template +GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber) { - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partSizeGB) * partNumber); -} - -GPUhd() int getCorrespondingSplitId(int blockId, int nPartitions, int nSplits = 1) -{ - return blockId * nSplits / nPartitions; -} - -template -GPUd() void read_segment_singleblock(size_t threadId, - buffer_type* scratch, - buffer_type* results, - size_t blockDim, - size_t bufferSize, - float partSizeGB, - size_t segmentId) -{ - for (size_t i = threadId; i < bufferSize; i += blockDim) { - if (getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i] == static_cast(1)) { // actual read operation is performed here - results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync - } - } + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); } -template -GPUd() void read_segment_multiblock(size_t threadId, - size_t blockId, - buffer_type* scratch, - buffer_type* results, - size_t blockDim, - size_t gridDim, - size_t bufferSize, - float partSizeGB, - size_t segmentId) +////////////////// +// Kernels go here +template +GPUg() void readChunkSBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) { - for (int i = blockId * blockDim + threadId; i < bufferSize; i += blockDim * gridDim) { - if (getPartPtrOnScratch(scratch, partSizeGB, 
segmentId)[i] == static_cast(1)) { // actual read operation is performed here - results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here + results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync + } } } } -template -GPUg() void read_segment_singleblock_k( - int segmentId, - buffer_type* results, - buffer_type* scratch, - size_t bufferSize, - float partitionSize = 1.f) +template +GPUg() void readChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) { - if (segmentId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split - read_segment_singleblock(threadIdx.x, scratch, results, blockDim.x, bufferSize, partitionSize, segmentId); - } -} - -template -GPUg() void split_read_singleblock_k( - int split, // Id of split partition - int nsplits, - int npartitions, - buffer_type* results, - buffer_type* scratch, - size_t bufferSize, - float partitionSize = 1.f) -{ - if (split == blockIdx.x) { // runs only if blockIdx.x is allowed in given split - for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync - } + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here + results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync } } } - -template -GPUg() void read_single_segment_multiblock_k( - int segmentId, - buffer_type* results, - buffer_type* scratch, - size_t bufferSize, - float partitionSize = 1.f) -{ - read_segment_multiblock(threadIdx.x, blockIdx.x, scratch, results, blockDim.x, gridDim.x, bufferSize, partitionSize, segmentId); -} - -/////////////////// - } // namespace gpu void printDeviceProp(int deviceId) @@ -267,72 +227,98 @@ void printDeviceProp(int deviceId) << (float)free / total * 100.0 << "%)" << std::endl; } -template -template -float GPUbenchmark::measure(void (GPUbenchmark::*task)(T...), const char* taskName, T&&... args) -{ - float diff{0.f}; - std::cout << std::setw(2) << ">>> " << taskName; - auto start = std::chrono::high_resolution_clock::now(); - (this->*task)(std::forward(args)...); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff_t{end - start}; - diff = diff_t.count(); - std::cout << std::setw(2) << " completed in: \x1B[32m" << diff << " ms\x1B[0m" << std::endl; - return diff; -} - -template +template template -float GPUbenchmark::benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args) +float GPUbenchmark::benchmarkSync(void (*kernel)(T...), + int nLaunches, int blocks, int threads, T&... 
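
[Editor's sketch] All chunk addressing is plain pointer arithmetic on the single scratch allocation: chunk i starts i * chunkReservedGB gigabytes past the base pointer, and one chunk holds GB * chunkReservedGB / sizeof(chunk_type) elements, which is what getPartPtrOnScratch and getPartitionCapacity compute. A host-side restatement of the same arithmetic with illustrative names (GB taken as 1024^3, in line with bytesToKB/bytesToGB above):

#include <cstddef>

constexpr double GB = 1024. * 1024. * 1024.;

// Pointer to the first element of chunk `chunkId`, with every chunk reserving `chunkReservedGB` GB.
template <class chunk_t>
chunk_t* chunkPtr(chunk_t* scratch, float chunkReservedGB, size_t chunkId)
{
  return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratch) +
                                    static_cast<size_t>(GB * chunkReservedGB) * chunkId);
}

// Number of chunk_t elements that fit into one chunk.
template <class chunk_t>
size_t chunkCapacity(float chunkReservedGB)
{
  return static_cast<size_t>(GB * chunkReservedGB / sizeof(chunk_t));
}
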
args) // run for each chunk (id is passed in variadic args) { cudaEvent_t start, stop; GPUCHECK(cudaEventCreate(&start)); GPUCHECK(cudaEventCreate(&stop)); GPUCHECK(cudaEventRecord(start)); - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { - (*kernel)<<>>(args...); // Stream is 0 by default, so that we don't have to convert cudaStream_t it in HIP header + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(args...); } - GPUCHECK(cudaEventRecord(stop)); + GPUCHECK(cudaEventRecord(stop)); // record checkpoint - GPUCHECK(cudaEventSynchronize(stop)); + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions float milliseconds{0.f}; GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); return milliseconds; } -template +template template -std::vector GPUbenchmark::benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nStreams, int nLaunches, int blocks, int threads, T&... args) +std::vector GPUbenchmark::benchmarkAsync(void (*kernel)(int, T...), + int nStreams, int nLaunches, int blocks, int threads, T&... args) { - std::vector splitStarts(nStreams), splitStops(nStreams); + std::vector starts(nStreams), stops(nStreams); std::vector streams(nStreams); - std::vector splitResults(nStreams); + std::vector results(nStreams); - for (auto iStream{0}; iStream < nStreams; ++iStream) { + for (auto iStream{0}; iStream < nStreams; ++iStream) { // one stream per chunk GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); - GPUCHECK(cudaEventCreate(&(splitStarts[iStream]))); - GPUCHECK(cudaEventCreate(&(splitStops[iStream]))); - GPUCHECK(cudaEventRecord(splitStarts[iStream], streams[iStream])); + GPUCHECK(cudaEventCreate(&(starts[iStream]))); + GPUCHECK(cudaEventCreate(&(stops[iStream]))); + } - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive lanuches on the same stream - (*kernel)<<>>(iStream, nStreams, args...); + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventRecord(starts[iStream], streams[iStream])); + + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream + (*kernel)<<>>(iStream, args...); } - GPUCHECK(cudaEventRecord(splitStops[iStream], streams[iStream])); + GPUCHECK(cudaEventRecord(stops[iStream], streams[iStream])); } for (auto iStream{0}; iStream < nStreams; ++iStream) { - GPUCHECK(cudaEventSynchronize(splitStops[iStream])); - GPUCHECK(cudaEventElapsedTime(&(splitResults.at(iStream)), splitStarts[iStream], splitStops[iStream])); + GPUCHECK(cudaEventSynchronize(stops[iStream])); + GPUCHECK(cudaEventElapsedTime(&(results.at(iStream)), starts[iStream], stops[iStream])); + } + + return results; +} + +template +template +std::vector GPUbenchmark::benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), + int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... 
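
[Editor's sketch] benchmarkAsync gives every chunk its own stream and its own start/stop event pair, enqueues all launches before synchronising anything, and only afterwards collects the per-stream elapsed times, so the chunks are read concurrently. A condensed standalone sketch of that scheduling, with a hypothetical touchChunk kernel and a simplified argument list:

#include <vector>
#include <cuda_runtime.h>

__global__ void touchChunk(int chunkId, char* scratch)
{
  // placeholder body: a real benchmark kernel would stream through chunk #chunkId of scratch here
}

std::vector<float> timePerStreamMs(int nStreams, int nLaunches, int blocks, int threads, char* scratch)
{
  std::vector<cudaStream_t> streams(nStreams);
  std::vector<cudaEvent_t> starts(nStreams), stops(nStreams);
  std::vector<float> ms(nStreams);

  for (int s = 0; s < nStreams; ++s) { // one stream and one event pair per chunk
    cudaStreamCreate(&streams[s]);
    cudaEventCreate(&starts[s]);
    cudaEventCreate(&stops[s]);
  }
  for (int s = 0; s < nStreams; ++s) { // enqueue everything first: work on different streams overlaps
    cudaEventRecord(starts[s], streams[s]);
    for (int l = 0; l < nLaunches; ++l) {
      touchChunk<<<blocks, threads, 0, streams[s]>>>(s, scratch);
    }
    cudaEventRecord(stops[s], streams[s]);
  }
  for (int s = 0; s < nStreams; ++s) { // only now synchronise and read back the timings
    cudaEventSynchronize(stops[s]);
    cudaEventElapsedTime(&ms[s], starts[s], stops[s]);
    cudaStreamDestroy(streams[s]);
    cudaEventDestroy(starts[s]);
    cudaEventDestroy(stops[s]);
  }
  return ms;
}
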
args) +{ + std::vector starts(nRegions), stops(nRegions); // I want one event per region + std::vector streams(nStreams); + std::vector results(nRegions); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); } - return splitResults; + for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { + GPUCHECK(cudaEventCreate(&(starts[iRegion]))); + GPUCHECK(cudaEventCreate(&(stops[iRegion]))); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + if (getCorrespondingRegionId(iStream, nStreams, nRegions) == iRegion) { + std::cout << "DEBUG: stream " << iStream << " " << getCorrespondingRegionId(iStream, nStreams, nRegions) << std::endl; + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream + (*kernel)<<>>(iStream, nStreams, args...); + } + } + GPUCHECK(cudaEventRecord(stops[iRegion], streams[iRegion])); + } + } + + for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { + GPUCHECK(cudaEventSynchronize(stops[iRegion])); + GPUCHECK(cudaEventElapsedTime(&(results.at(iRegion)), starts[iRegion], stops[iRegion])); + } + + return results; } -template -void GPUbenchmark::printDevices() +template +void GPUbenchmark::printDevices() { int deviceCnt; GPUCHECK(cudaGetDeviceCount(&deviceCnt)); @@ -343,8 +329,8 @@ void GPUbenchmark::printDevices() } } -template -void GPUbenchmark::globalInit(const int deviceId) +template +void GPUbenchmark::globalInit(const int deviceId) { cudaDeviceProp props; size_t free; @@ -353,7 +339,7 @@ void GPUbenchmark::globalInit(const int deviceId) GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); - mState.partitionSizeGB = mOptions.partitionSizeGB; + mState.chunkReservedGB = mOptions.chunkReservedGB; mState.iterations = mOptions.kernelLaunches; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; @@ -367,24 +353,24 @@ void GPUbenchmark::globalInit(const int deviceId) mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) - std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl + std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" - << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" - << " └ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl + << " ├ Number of scratch chunks: " << mState.getMaxChunks() << " of " << mOptions.chunkReservedGB << "GB each\n" + << " └ Each chunk can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl << std::endl; } -template -void GPUbenchmark::readingInit() +template +void GPUbenchmark::readingInit() { std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; - mState.hostReadingResultsVector.resize(mState.getMaxSegments()); - GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); + mState.hostReadingResultsVector.resize(mState.getMaxChunks()); + 
GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); } -template -void GPUbenchmark::readingSequential(SplitLevel sl) +template +void GPUbenchmark::readingSequential(SplitLevel sl) { switch (sl) { case SplitLevel::Blocks: { @@ -393,10 +379,18 @@ void GPUbenchmark::readingSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on blocks:"; - for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately - auto result = benchmarkSynchExecution(&gpu::read_segment_singleblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iSegment), getType(), result); + std::cout << std::setw(2) << ">>> Sequential read benchmark, one block per chunk:"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadingResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -410,9 +404,17 @@ void GPUbenchmark::readingSequential(SplitLevel sl) for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads:"; - for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately - auto result = benchmarkSynchExecution(&gpu::read_single_segment_multiblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iSegment), getType(), result); + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadingResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -421,59 +423,72 @@ void GPUbenchmark::readingSequential(SplitLevel sl) } } -template -void GPUbenchmark::readingConcurrent(SplitLevel sl) +template +void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) { switch (sl) { case SplitLevel::Blocks: { auto nBlocks{mState.nMultiprocessors}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - auto segments{mState.getMaxSegments()}; + auto chunks{mState.getMaxChunks()}; auto capacity{mState.getPartitionCapacity()}; for (auto measurements{mOptions.nTests}; measurements--;) { - std::cout << std::setw(2) << ">>> Concurrent read benchmark, splitting on blocks"; - auto results = 
benchmarkAsynchExecution(&gpu::split_read_singleblock_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + std::cout << std::setw(2) << ">>> Concurrent read benchmark, one block per chunk"; + auto results = benchmarkAsync(&gpu::readChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { - mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); + mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); } } break; } - case SplitLevel::Threads: + case SplitLevel::Threads: { + // auto nBlocks{mState.nMultiprocessors}; + // auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + // auto chunks{mState.getMaxChunks()}; + // auto capacity{mState.getPartitionCapacity()}; break; + } } std::cout << " completed." << std::endl; } -template -void GPUbenchmark::readingFinalize() +template +void GPUbenchmark::readingFinalize() { - GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); GPUCHECK(cudaFree(mState.deviceReadingResultsPtr)); } -template -void GPUbenchmark::globalFinalize() +template +void GPUbenchmark::globalFinalize() { GPUCHECK(cudaFree(mState.scratchPtr)); } -template -void GPUbenchmark::run() +template +void GPUbenchmark::run() { globalInit(0); // Test calls go here: readingInit(); - // - Reading + // - Reading whole memory readingSequential(SplitLevel::Threads); readingSequential(SplitLevel::Blocks); - // - Split reading - // readingConcurrent(SplitLevel::Blocks); + // - Reading memory partitions + readingConcurrent(SplitLevel::Blocks); readingFinalize(); - GPUbenchmark::globalFinalize(); + GPUbenchmark::globalFinalize(); } template class GPUbenchmark; From 05cc7ae4b1b76801b2bff45b2430d9b1d3bad305 Mon Sep 17 00:00:00 2001 From: ALICE Builder Date: Wed, 7 Jul 2021 20:32:53 +0200 Subject: [PATCH 30/42] Please consider the following formatting changes (#18) --- GPU/GPUbenchmark/Shared/Kernels.h | 6 +++--- GPU/GPUbenchmark/cuda/Kernels.cu | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 2f6f06764d373..fc64c414bf391 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -43,17 +43,17 @@ class GPUbenchmark final // Single stream synchronous (sequential kernels) execution template float benchmarkSync(void (*kernel)(T...), - int nLaunches, int blocks, int threads, T&... args); + int nLaunches, int blocks, int threads, T&... args); // Multi-streams asynchronous executions on whole memory template std::vector benchmarkAsync(void (*kernel)(int, T...), - int nStreams, int nLaunches, int blocks, int threads, T&... args); + int nStreams, int nLaunches, int blocks, int threads, T&... 
args); // Per-memory region benchmarking template std::vector benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), - int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... args); + int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... args); // Main interface void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 78e2cbb4c82ca..5d894ca524fa7 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -436,14 +436,14 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) for (auto measurements{mOptions.nTests}; measurements--;) { std::cout << std::setw(2) << ">>> Concurrent read benchmark, one block per chunk"; auto results = benchmarkAsync(&gpu::readChunkSBKernel, - mState.getMaxChunks(), // nStreams - mState.getNKernelLaunches(), - nBlocks, - nThreads, - mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) - mState.scratchPtr, - capacity, - mState.chunkReservedGB); + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); } From 4ae3ae4c766e2aef10aff442bb045e82d7927694 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 19:06:20 +0200 Subject: [PATCH 31/42] CP --- GPU/GPUbenchmark/Shared/Kernels.h | 4 -- GPU/GPUbenchmark/Shared/Utils.h | 10 ++++ GPU/GPUbenchmark/benchmark.cxx | 4 +- GPU/GPUbenchmark/cuda/Kernels.cu | 85 +++++++++++++------------------ 4 files changed, 47 insertions(+), 56 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index fc64c414bf391..c741ce31a7f0c 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -50,10 +50,6 @@ class GPUbenchmark final std::vector benchmarkAsync(void (*kernel)(int, T...), int nStreams, int nLaunches, int blocks, int threads, T&... args); - // Per-memory region benchmarking - template - std::vector benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), - int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... 
args); // Main interface void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 8f43647090a8b..931ffbfc1fc75 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -46,6 +46,7 @@ struct benchmarkOpts { benchmarkOpts() = default; float chunkReservedGB = 1.f; + int nRegions = 2; float freeMemoryFractionToAllocate = 0.95f; int kernelLaunches = 1; int nTests = 1; @@ -112,6 +113,7 @@ class ResultStreamer explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); + void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -137,6 +139,14 @@ inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std:: << "\n"; } +inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) +{ + (*mTreeStream) + << (benchmarkName + "_" + type + "_region_" + region).data() + << "elapsed=" << entry + << "\n"; +} + } // namespace benchmark } // namespace o2 diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 070f43cc94add..4dfc039c34efe 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -22,8 +22,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) options.add_options()( "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( + "regions,r", bpo::value()->default_value(2), "Number of memory regions to partition RAM in.")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( + "launches,l", bpo::value()->default_value(10), "Number of iterations in reading kernels.")( "ntests,n", bpo::value()->default_value(1), "Number of times each test is run."); try { bpo::store(parse_command_line(argc, argv, options), vm); @@ -43,6 +44,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.chunkReservedGB = vm["chunkSize"].as(); + conf.nRegions = vm["regions"].as(); conf.kernelLaunches = vm["launches"].as(); conf.nTests = vm["ntests"].as(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5d894ca524fa7..6510d153c5a9d 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -281,42 +281,6 @@ std::vector GPUbenchmark::benchmarkAsync(void (*kernel)(int, return results; } -template -template -std::vector GPUbenchmark::benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), - int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... 
args) -{ - std::vector starts(nRegions), stops(nRegions); // I want one event per region - std::vector streams(nStreams); - std::vector results(nRegions); - - for (auto iStream{0}; iStream < nStreams; ++iStream) { - GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); - } - - for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { - GPUCHECK(cudaEventCreate(&(starts[iRegion]))); - GPUCHECK(cudaEventCreate(&(stops[iRegion]))); - - for (auto iStream{0}; iStream < nStreams; ++iStream) { - if (getCorrespondingRegionId(iStream, nStreams, nRegions) == iRegion) { - std::cout << "DEBUG: stream " << iStream << " " << getCorrespondingRegionId(iStream, nStreams, nRegions) << std::endl; - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream - (*kernel)<<>>(iStream, nStreams, args...); - } - } - GPUCHECK(cudaEventRecord(stops[iRegion], streams[iRegion])); - } - } - - for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { - GPUCHECK(cudaEventSynchronize(stops[iRegion])); - GPUCHECK(cudaEventElapsedTime(&(results.at(iRegion)), starts[iRegion], stops[iRegion])); - } - - return results; -} - template void GPUbenchmark::printDevices() { @@ -378,8 +342,8 @@ void GPUbenchmark::readingSequential(SplitLevel sl) auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; - for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read benchmark, one block per chunk:"; + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkSBKernel, mState.getNKernelLaunches(), @@ -402,8 +366,8 @@ void GPUbenchmark::readingSequential(SplitLevel sl) auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; - for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads:"; + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkMBKernel, mState.getNKernelLaunches(), @@ -424,7 +388,7 @@ void GPUbenchmark::readingSequential(SplitLevel sl) } template -void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) +void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) { switch (sl) { case SplitLevel::Blocks: { @@ -433,8 +397,8 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) auto chunks{mState.getMaxChunks()}; auto capacity{mState.getPartitionCapacity()}; - for (auto measurements{mOptions.nTests}; measurements--;) { - std::cout << std::setw(2) << ">>> Concurrent read benchmark, one block per chunk"; + for (auto measurement{0}; measurement < mOptions.nTests; 
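The region loop of the benchmarkAsyncVsRegion body removed here starts at iRegion{nRegions} and tests iRegion < nRegions, so it never executes; replacing it with plain benchmarkAsync plus a host-side mapping of per-stream results onto regions is therefore a correctness fix as well as a simplification. For reference, a minimal stand-in for the per-stream timing pattern benchmarkAsync relies on (names and error handling simplified, not the implementation in Kernels.cu) could look like:

#include <cuda_runtime.h>
#include <vector>

// Illustrative only: launch nLaunches back-to-back kernels on each of nStreams
// streams, time each stream with its own event pair, return elapsed ms per stream.
template <typename... T>
std::vector<float> timePerStream(void (*kernel)(int, T...),
                                 int nStreams, int nLaunches, int blocks, int threads, T&... args)
{
  std::vector<cudaStream_t> streams(nStreams);
  std::vector<cudaEvent_t> starts(nStreams), stops(nStreams);
  std::vector<float> elapsedMs(nStreams);

  for (int iStream = 0; iStream < nStreams; ++iStream) {
    cudaStreamCreate(&streams[iStream]);
    cudaEventCreate(&starts[iStream]);
    cudaEventCreate(&stops[iStream]);
    cudaEventRecord(starts[iStream], streams[iStream]);
    for (int iLaunch = 0; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream
      (*kernel)<<<blocks, threads, 0, streams[iStream]>>>(iStream, args...);
    }
    cudaEventRecord(stops[iStream], streams[iStream]);
  }

  for (int iStream = 0; iStream < nStreams; ++iStream) {
    cudaEventSynchronize(stops[iStream]);
    cudaEventElapsedTime(&elapsedMs[iStream], starts[iStream], stops[iStream]);
    cudaEventDestroy(starts[iStream]);
    cudaEventDestroy(stops[iStream]);
    cudaStreamDestroy(streams[iStream]);
  }
  return elapsedMs;
}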
++measurement) { + std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkSBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -445,20 +409,39 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) capacity, mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { - mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]); } + std::cout << "\033[1;32m complete\033[0m" << std::endl; } break; } case SplitLevel::Threads: { - // auto nBlocks{mState.nMultiprocessors}; - // auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - // auto chunks{mState.getMaxChunks()}; - // auto capacity{mState.getPartitionCapacity()}; + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::readChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } break; } } - std::cout << " completed." 
<< std::endl; } template @@ -481,8 +464,8 @@ void GPUbenchmark::run() // Test calls go here: readingInit(); // - Reading whole memory - readingSequential(SplitLevel::Threads); - readingSequential(SplitLevel::Blocks); + // readingSequential(SplitLevel::Threads); + // readingSequential(SplitLevel::Blocks); // - Reading memory partitions readingConcurrent(SplitLevel::Blocks); From 195aae766adfb8e4b38cd9d5acb1fbba400dc197 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 22:13:19 +0200 Subject: [PATCH 32/42] Add last read test --- GPU/GPUbenchmark/benchmark.cxx | 8 ++++---- GPU/GPUbenchmark/cuda/Kernels.cu | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 4dfc039c34efe..6b1d5338db5c1 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -66,10 +66,10 @@ int main(int argc, const char* argv[]) o2::benchmark::GPUbenchmark bm_char{opts, streamer}; bm_char.run(); - // o2::benchmark::GPUbenchmark bm_int{opts, streamer}; - // bm_int.run(); - // o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; - // bm_size_t.run(); + o2::benchmark::GPUbenchmark bm_int{opts, streamer}; + bm_int.run(); + o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; + bm_size_t.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 6510d153c5a9d..e182d3889ded4 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -423,7 +423,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << ">>> Concurrent read, split on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkMBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -464,11 +464,12 @@ void GPUbenchmark::run() // Test calls go here: readingInit(); // - Reading whole memory - // readingSequential(SplitLevel::Threads); - // readingSequential(SplitLevel::Blocks); + readingSequential(SplitLevel::Threads); + readingSequential(SplitLevel::Blocks); // - Reading memory partitions readingConcurrent(SplitLevel::Blocks); + readingConcurrent(SplitLevel::Threads); readingFinalize(); GPUbenchmark::globalFinalize(); From 33b19bec7a405490b8425d4e53418a5851fb2edf Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 22:17:22 +0200 Subject: [PATCH 33/42] Add last read test --- GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e182d3889ded4..e4784da0bb5d5 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -464,8 +464,8 @@ void GPUbenchmark::run() // Test calls go here: readingInit(); // - Reading whole memory - readingSequential(SplitLevel::Threads); readingSequential(SplitLevel::Blocks); + readingSequential(SplitLevel::Threads); // - Reading memory partitions readingConcurrent(SplitLevel::Blocks); From 434c9af4fceca965ddae2dd8b02a7d2d08b3f305 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 22:36:50 +0200 Subject: [PATCH 34/42] Fix result dump on file --- GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e4784da0bb5d5..9cba1e713f852 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -434,7 +434,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) capacity, mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { - auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); mStreamer.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; From 9daeab5b442849df500e26995f3bd9dfec9c465e Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 9 Jul 2021 19:00:30 +0200 Subject: [PATCH 35/42] add reading kernel --- GPU/GPUbenchmark/cuda/Kernels.cu | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 9cba1e713f852..5149245789e61 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -105,6 +105,36 @@ GPUg() void readChunkMBKernel( } } } + +// Writing +template +GPUg() void writeChunkSBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } + } +} + +template +GPUg() void writeChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } +} + } // namespace gpu void printDeviceProp(int deviceId) From e309004c388f114fba0e3f1acfbbaaf706e0e835 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sat, 10 Jul 2021 10:51:08 +0200 Subject: [PATCH 36/42] Add write tests --- GPU/GPUbenchmark/CMakeLists.txt | 2 +- GPU/GPUbenchmark/Shared/Kernels.h | 16 ++- GPU/GPUbenchmark/Shared/Utils.h | 8 +- GPU/GPUbenchmark/cuda/Kernels.cu | 183 ++++++++++++++++++++++++++---- 4 files changed, 178 insertions(+), 31 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 9ed33e179cc84..9151acc8bc478 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -33,7 +33,7 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index c741ce31a7f0c..b39dc2f3a836a 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -57,12 +57,18 @@ class GPUbenchmark final void printDevices(); // Dump info // Initializations/Finalizations of tests. 
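The region index stored with each concurrent result comes from getCorrespondingRegionId(iResult, nBlocks, nRegions), whose definition is not part of these hunks; the fix above only changes which count is passed as the second argument. Under the assumption that the helper buckets an index into nRegions equal slices of that count, an equivalent mapping would be:

// Assumed behaviour of getCorrespondingRegionId (not copied from the source tree):
// map index i in [0, nIndexes) onto one of nRegions equally sized regions.
inline int regionIdFor(int i, int nIndexes, int nRegions)
{
  int perRegion = (nIndexes + nRegions - 1) / nRegions; // ceiling division
  return i / perRegion;
}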
Not to be measured, in principle used for report - void readingInit(); - void readingFinalize(); + void readInit(); + void readFinalize(); - // Benchmark kernel callbacks - void readingSequential(SplitLevel sl); - void readingConcurrent(SplitLevel sl, int nRegions = 2); + void writeInit(); + void writeFinalize(); + + // Kernel calling wrappers + void readSequential(SplitLevel sl); + void readConcurrent(SplitLevel sl, int nRegions = 2); + + void writeSequential(SplitLevel sl); + void writeConcurrent(SplitLevel sl, int nRegions = 2); private: gpuState mState; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 931ffbfc1fc75..c8f1679fbbbd8 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -97,8 +97,12 @@ struct gpuState { std::vector> gpuBuffersHost; // Host-based vector-ized data // Test-specific containers - T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU - std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host + T* deviceReadResultsPtr; // Results of the read test (single variable) on GPU + std::vector hostReadResultsVector; // Results of the read test (single variable) on host + + // Test-specific containers + T* deviceWriteResultsPtr; // Results of the write test (single variable) on GPU + std::vector hostWriteResultsVector; // Results of the write test (single variable) on host // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5149245789e61..e79bb1670d31d 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -18,7 +18,7 @@ #endif #include -// Memory partition legend +// Memory partitioning legend // // |----------------------region 0-----------------|----------------------region 1-----------------| regions -> deafult: 2, to test lower and upper RAM // |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (single block pins) @@ -356,15 +356,15 @@ void GPUbenchmark::globalInit(const int deviceId) } template -void GPUbenchmark::readingInit() +void GPUbenchmark::readInit() { std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; - mState.hostReadingResultsVector.resize(mState.getMaxChunks()); - GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); + mState.hostReadResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); } template -void GPUbenchmark::readingSequential(SplitLevel sl) +void GPUbenchmark::readSequential(SplitLevel sl) { switch (sl) { case SplitLevel::Blocks: { @@ -380,11 +380,11 @@ void GPUbenchmark::readingSequential(SplitLevel sl) nBlocks, nThreads, iChunk, - mState.deviceReadingResultsPtr, + mState.deviceReadResultsPtr, mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iChunk), getType(), result); + mStreamer.get()->storeBenchmarkEntry("seq_R_SB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -404,11 +404,11 @@ void GPUbenchmark::readingSequential(SplitLevel sl) nBlocks, nThreads, iChunk, - mState.deviceReadingResultsPtr, + mState.deviceReadResultsPtr, mState.scratchPtr, 
capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iChunk), getType(), result); + mStreamer.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -418,7 +418,7 @@ void GPUbenchmark::readingSequential(SplitLevel sl) } template -void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) +void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) { switch (sl) { case SplitLevel::Blocks: { @@ -434,7 +434,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) mState.getNKernelLaunches(), nBlocks, nThreads, - mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) mState.scratchPtr, capacity, mState.chunkReservedGB); @@ -459,7 +459,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) mState.getNKernelLaunches(), nBlocks, nThreads, - mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) mState.scratchPtr, capacity, mState.chunkReservedGB); @@ -475,10 +475,137 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) } template -void GPUbenchmark::readingFinalize() +void GPUbenchmark::readFinalize() { - GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); - GPUCHECK(cudaFree(mState.deviceReadingResultsPtr)); + GPUCHECK(cudaMemcpy(mState.hostReadResultsVector.data(), mState.deviceReadResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceReadResultsPtr)); +} + +/// Write +template +void GPUbenchmark::writeInit() +{ + std::cout << ">>> Initializing write benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostWriteResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceWriteResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::writeSequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for 
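The two SplitLevel flavours differ only in how the launch grid is mapped onto a chunk: the *SB kernels let a single block (the one whose blockIdx.x equals chunkId) stride over the chunk with its threads, while the *MB kernels walk one chunk with a grid-stride loop over all blocks. Stripped of the scratch-pointer arithmetic, the two access patterns reduce to the following sketch (chunk pointer passed directly, which the real kernels derive from getPartPtrOnScratch):

// Condensed illustration of the two SplitLevel access patterns.
template <class chunk_type>
__global__ void touchChunkSingleBlock(int chunkId, chunk_type* chunk, size_t chunkSize)
{
  if (chunkId == blockIdx.x) { // only the block assigned to this chunk does any work
    for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) {
      chunk[i] = 1;
    }
  }
}

template <class chunk_type>
__global__ void touchChunkMultiBlock(chunk_type* chunk, size_t chunkSize)
{
  for (size_t i = size_t(blockIdx.x) * blockDim.x + threadIdx.x; i < chunkSize;
       i += size_t(blockDim.x) * gridDim.x) { // classic grid-stride loop over one chunk
    chunk[i] = 1;
  }
}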
(auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential write, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << ">>> Concurrent write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << ">>> Concurrent write, split on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); + mStreamer.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostWriteResultsVector.data(), mState.deviceWriteResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceWriteResultsPtr)); } template @@ -492,15 +619,25 @@ void GPUbenchmark::run() { globalInit(0); // Test calls go here: - readingInit(); + readInit(); // - Reading whole memory - readingSequential(SplitLevel::Blocks); - readingSequential(SplitLevel::Threads); - - // - Reading memory partitions - readingConcurrent(SplitLevel::Blocks); - 
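Every sequential case delegates the timing to benchmarkSync, whose body is outside these hunks; conceptually it wraps nLaunches consecutive launches of one kernel on the default stream in a single CUDA event pair and returns the elapsed milliseconds. A sketch under that assumption, with error checking omitted:

#include <cuda_runtime.h>

// Illustrative stand-in for benchmarkSync: one event pair around nLaunches launches.
template <typename... T>
float timeSync(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args)
{
  cudaEvent_t start, stop;
  float elapsedMs = 0.f;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  for (int iLaunch = 0; iLaunch < nLaunches; ++iLaunch) {
    (*kernel)<<<blocks, threads>>>(args...);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsedMs, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return elapsedMs;
}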
readingConcurrent(SplitLevel::Threads); - readingFinalize(); + readSequential(SplitLevel::Blocks); + readSequential(SplitLevel::Threads); + + // - Reading memory regions + readConcurrent(SplitLevel::Blocks); + readConcurrent(SplitLevel::Threads); + readFinalize(); + + writeInit(); + // - Write on whole memory + writeSequential(SplitLevel::Blocks); + writeSequential(SplitLevel::Threads); + + // - Write memory regions + writeConcurrent(SplitLevel::Blocks); + writeConcurrent(SplitLevel::Threads); + writeFinalize(); GPUbenchmark::globalFinalize(); } From 901892248bb6a2d8c384a1845388054937233014 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sat, 10 Jul 2021 13:40:24 +0200 Subject: [PATCH 37/42] Add copy benchmark --- GPU/GPUbenchmark/Shared/Kernels.h | 6 + GPU/GPUbenchmark/Shared/Utils.h | 14 +- GPU/GPUbenchmark/cuda/Kernels.cu | 211 +++++++++++++++++++++++++++--- 3 files changed, 205 insertions(+), 26 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index b39dc2f3a836a..89a5086bc5bb3 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -63,6 +63,9 @@ class GPUbenchmark final void writeInit(); void writeFinalize(); + void copyInit(); + void copyFinalize(); + // Kernel calling wrappers void readSequential(SplitLevel sl); void readConcurrent(SplitLevel sl, int nRegions = 2); @@ -70,6 +73,9 @@ class GPUbenchmark final void writeSequential(SplitLevel sl); void writeConcurrent(SplitLevel sl, int nRegions = 2); + void copySequential(SplitLevel sl); + void copyConcurrent(SplitLevel sl, int nRegions = 2); + private: gpuState mState; std::shared_ptr mStreamer; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index c8f1679fbbbd8..991e078e63888 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -95,14 +95,12 @@ struct gpuState { size_t scratchSize; // Size of scratch area (B) std::vector partAddrOnHost; // Pointers to scratch partitions on host vector std::vector> gpuBuffersHost; // Host-based vector-ized data - - // Test-specific containers - T* deviceReadResultsPtr; // Results of the read test (single variable) on GPU - std::vector hostReadResultsVector; // Results of the read test (single variable) on host - - // Test-specific containers - T* deviceWriteResultsPtr; // Results of the write test (single variable) on GPU - std::vector hostWriteResultsVector; // Results of the write test (single variable) on host + T* deviceReadResultsPtr; // Results of the read test (single variable) on GPU + std::vector hostReadResultsVector; // Results of the read test (single variable) on host + T* deviceWriteResultsPtr; // Results of the write test (single variable) on GPU + std::vector hostWriteResultsVector; // Results of the write test (single variable) on host + T* deviceCopyInputsPtr; // Inputs of the copy test (single variable) on GPU + std::vector hostCopyInputsVector; // Inputs of the copy test (single variable) on host // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e79bb1670d31d..05f17eadc9c0a 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -21,7 +21,7 @@ // Memory partitioning legend // // |----------------------region 0-----------------|----------------------region 1-----------------| regions -> deafult: 2, to test lower and upper RAM -// |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (single block pins) 
+// |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (sing block pins) // |__________________________________________scratch______________________________________________| scratch -> default size: 95% free GPU RAM #define GPUCHECK(error) \ @@ -74,6 +74,7 @@ GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReser ////////////////// // Kernels go here +// Reading template GPUg() void readChunkSBKernel( int chunkId, @@ -83,10 +84,13 @@ GPUg() void readChunkSBKernel( float chunkReservedGB = 1.f) { if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + chunk_type sink{0}; + chunk_type* ptr = getPartPtrOnScratch(scratch, chunkReservedGB, chunkId); for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { - if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here - results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync - } + sink += ptr[i]; + } + if (sink == static_cast(1)) { + results[chunkId] = sink; } } } @@ -135,6 +139,35 @@ GPUg() void writeChunkMBKernel( } } +// Copying +template +GPUg() void copyChunkSBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } + } +} + +template +GPUg() void copyChunkMBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } +} + } // namespace gpu void printDeviceProp(int deviceId) @@ -355,6 +388,7 @@ void GPUbenchmark::globalInit(const int deviceId) << std::endl; } +/// Read template void GPUbenchmark::readInit() { @@ -373,7 +407,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkSBKernel, mState.getNKernelLaunches(), @@ -397,7 +431,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkMBKernel, mState.getNKernelLaunches(), @@ -428,7 +462,7 
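The rewritten readChunkSBKernel accumulates into a register ("sink") and writes it back only under a data-dependent condition, instead of conditionally updating results[chunkId] once per element as before: the loads still cannot be optimised away, but the measured time is no longer polluted by per-element global stores. The essence of the pattern, with the chunk pointer taken as given:

// Read-bandwidth pattern: accumulate into a register, write back only on a
// condition that is practically never true, so the loads survive optimisation.
template <class chunk_type>
__global__ void readChunkSketch(chunk_type* chunkPtr, chunk_type* result, size_t chunkSize)
{
  chunk_type sink{0};
  for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) {
    sink += chunkPtr[i];
  }
  if (sink == static_cast<chunk_type>(1)) { // data-dependent guard
    *result = sink;
  }
}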
@@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkSBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -453,7 +487,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent read, split on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkMBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -479,6 +513,7 @@ void GPUbenchmark::readFinalize() { GPUCHECK(cudaMemcpy(mState.hostReadResultsVector.data(), mState.deviceReadResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); GPUCHECK(cudaFree(mState.deviceReadResultsPtr)); + std::cout << " └ done." << std::endl; } /// Write @@ -500,7 +535,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::writeChunkSBKernel, mState.getNKernelLaunches(), @@ -524,7 +559,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential write, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::writeChunkMBKernel, mState.getNKernelLaunches(), @@ -555,7 +590,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::writeChunkSBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -580,7 +615,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent write, split on threads (" 
<< measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::writeChunkMBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -606,6 +641,136 @@ void GPUbenchmark::writeFinalize() { GPUCHECK(cudaMemcpy(mState.hostWriteResultsVector.data(), mState.deviceWriteResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); GPUCHECK(cudaFree(mState.deviceWriteResultsPtr)); + std::cout << " └ done." << std::endl; +} + +/// Copy +template +void GPUbenchmark::copyInit() +{ + std::cout << ">>> Initializing copy benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostCopyInputsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceCopyInputsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); + GPUCHECK(cudaMemset(mState.deviceCopyInputsPtr, 1, mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::copySequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, sing block 
(" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); + mStreamer.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostCopyInputsVector.data(), mState.deviceCopyInputsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceCopyInputsPtr)); + std::cout << " └ done." 
<< std::endl; } template @@ -618,34 +783,44 @@ template void GPUbenchmark::run() { globalInit(0); - // Test calls go here: + readInit(); - // - Reading whole memory + // Reading in whole memory readSequential(SplitLevel::Blocks); readSequential(SplitLevel::Threads); - // - Reading memory regions + // Reading in memory regions readConcurrent(SplitLevel::Blocks); readConcurrent(SplitLevel::Threads); readFinalize(); writeInit(); - // - Write on whole memory + // Write on whole memory writeSequential(SplitLevel::Blocks); writeSequential(SplitLevel::Threads); - // - Write memory regions + // Write on memory regions writeConcurrent(SplitLevel::Blocks); writeConcurrent(SplitLevel::Threads); writeFinalize(); + copyInit(); + // Copy from input buffer (size = nChunks) on whole memory + copySequential(SplitLevel::Blocks); + copySequential(SplitLevel::Threads); + + // Copy from input buffer (size = nChunks) on memory regions + copyConcurrent(SplitLevel::Blocks); + copyConcurrent(SplitLevel::Threads); + copyFinalize(); + GPUbenchmark::globalFinalize(); } template class GPUbenchmark; -// template class GPUbenchmark; template class GPUbenchmark; template class GPUbenchmark; +// template class GPUbenchmark; } // namespace benchmark } // namespace o2 \ No newline at end of file From f31f7741b60242340be1500a3df207a02b6b3deb Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 12 Jul 2021 10:24:01 +0200 Subject: [PATCH 38/42] Remove CommonUtils dependency --- GPU/GPUbenchmark/CMakeLists.txt | 2 +- GPU/GPUbenchmark/Shared/Utils.h | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 9151acc8bc478..df0b4e4e47263 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -50,7 +50,7 @@ if(HIP_ENABLED) PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host Boost::program_options - O2::CommonUtils + ROOT::Tree TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 991e078e63888..cf46c61bd2c55 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -19,7 +19,7 @@ #include #include #include -#include "CommonUtils/TreeStreamRedirector.h" +#include #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -112,41 +112,41 @@ struct gpuState { class ResultStreamer { public: - explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); + explicit ResultStreamer(const std::string resultsTreeFilename = "benchmark_results.root"); ~ResultStreamer(); void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); private: - std::string mDebugTreeFileName = "benchmark_results.root"; // output filename - o2::utils::TreeStreamRedirector* mTreeStream; // observer + std::string mResultsTreeFilename = "benchmark_results.root"; // output filename + TTree* mTree; // observer }; -inline ResultStreamer::ResultStreamer(const std::string debugTreeFileName) +inline ResultStreamer::ResultStreamer(const std::string resultsTreeFilename) { - mDebugTreeFileName = debugTreeFileName; - mTreeStream = new o2::utils::TreeStreamRedirector(debugTreeFileName.data(), "recreate"); + mResultsTreeFilename = resultsTreeFilename; + mTree = new TTree(resultsTreeFilename.data(), resultsTreeFilename.data()); } inline ResultStreamer::~ResultStreamer() { - 
delete mTreeStream; + delete mTree; } inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) { - (*mTreeStream) - << (benchmarkName + "_" + type + "_" + chunk).data() - << "elapsed=" << entry - << "\n"; + // (*mTree) + // << (benchmarkName + "_" + type + "_" + chunk).data() + // << "elapsed=" << entry + // << "\n"; } inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) { - (*mTreeStream) - << (benchmarkName + "_" + type + "_region_" + region).data() - << "elapsed=" << entry - << "\n"; + // (*mTree) + // << (benchmarkName + "_" + type + "_region_" + region).data() + // << "elapsed=" << entry + // << "\n"; } } // namespace benchmark From e330a7ebb7c2d1b3080410cb9ae8f17258440492 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 12 Jul 2021 12:35:06 +0200 Subject: [PATCH 39/42] Remove GPUCommon dependency --- GPU/GPUbenchmark/CMakeLists.txt | 8 +++----- GPU/GPUbenchmark/Shared/Kernels.h | 1 - GPU/GPUbenchmark/cuda/Kernels.cu | 14 +++++++------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index df0b4e4e47263..4e9f9be10618b 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -16,9 +16,8 @@ if(CUDA_ENABLED) o2_add_executable(gpu-memory-benchmark-cuda SOURCES benchmark.cxx cuda/Kernels.cu - PUBLIC_LINK_LIBRARIES O2::GPUCommon - Boost::program_options - O2::CommonUtils + PUBLIC_LINK_LIBRARIES Boost::program_options + ROOT::Tree TARGETVARNAME targetName) endif() @@ -47,8 +46,7 @@ if(HIP_ENABLED) o2_add_executable(gpu-memory-benchmark-hip SOURCES benchmark.cxx hip/Kernels.hip.cxx - PUBLIC_LINK_LIBRARIES O2::GPUCommon - hip::host + PUBLIC_LINK_LIBRARIES hip::host Boost::program_options ROOT::Tree TARGETVARNAME targetName) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 89a5086bc5bb3..6218a18015b0d 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,7 +15,6 @@ #ifndef GPU_BENCHMARK_KERNELS_H #define GPU_BENCHMARK_KERNELS_H -#include "GPUCommonDef.h" #include "Utils.h" #include #include diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 05f17eadc9c0a..12437dc0f5383 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -67,7 +67,7 @@ namespace gpu /////////////////////////// // Device functions go here template -GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber) +__host__ __device__ inline chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber) { return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); } @@ -76,7 +76,7 @@ GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReser // Kernels go here // Reading template -GPUg() void readChunkSBKernel( +__global__ void readChunkSBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -96,7 +96,7 @@ GPUg() void readChunkSBKernel( } template -GPUg() void readChunkMBKernel( +__global__ void readChunkMBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -112,7 +112,7 @@ GPUg() void readChunkMBKernel( // Writing template -GPUg() void writeChunkSBKernel( +__global__ void writeChunkSBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -127,7 +127,7 
@@ GPUg() void writeChunkSBKernel( } template -GPUg() void writeChunkMBKernel( +__global__ void writeChunkMBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -141,7 +141,7 @@ GPUg() void writeChunkMBKernel( // Copying template -GPUg() void copyChunkSBKernel( +__global__ void copyChunkSBKernel( int chunkId, chunk_type* inputs, chunk_type* scratch, @@ -156,7 +156,7 @@ GPUg() void copyChunkSBKernel( } template -GPUg() void copyChunkMBKernel( +__global__ void copyChunkMBKernel( int chunkId, chunk_type* inputs, chunk_type* scratch, From 1480959537ffdd822c0af4f05ae65c744b934bad Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 12 Jul 2021 19:59:25 +0200 Subject: [PATCH 40/42] Fix fullCI errors --- GPU/GPUbenchmark/CMakeLists.txt | 5 +++-- GPU/GPUbenchmark/Shared/Utils.h | 7 +------ GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 4e9f9be10618b..8a93dcec41101 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -32,8 +32,9 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") - elseif() + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}") +# sed '1{/\#include \"hip\/hip_runtime.h\"/d}' + elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index cf46c61bd2c55..319ac6a156539 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -113,7 +113,7 @@ class ResultStreamer { public: explicit ResultStreamer(const std::string resultsTreeFilename = "benchmark_results.root"); - ~ResultStreamer(); + ~ResultStreamer() = default; void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); @@ -128,11 +128,6 @@ inline ResultStreamer::ResultStreamer(const std::string resultsTreeFilename) mTree = new TTree(resultsTreeFilename.data(), resultsTreeFilename.data()); } -inline ResultStreamer::~ResultStreamer() -{ - delete mTree; -} - inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) { // (*mTree) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 12437dc0f5383..e43b9dc1ab792 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -16,7 +16,7 @@ #if defined(__HIPCC__) #include "hip/hip_runtime.h" #endif -#include +#include // Memory partitioning legend // From 5383afc00e0ad1308b7c4fc419160d8dc3574771 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 13 Jul 2021 19:56:14 +0200 Subject: [PATCH 41/42] Revise result saving --- GPU/GPUbenchmark/Shared/Kernels.h | 4 +- GPU/GPUbenchmark/Shared/Utils.h | 51 +++++++++++++++------ GPU/GPUbenchmark/benchmark.cxx | 12 ++--- GPU/GPUbenchmark/cuda/Kernels.cu | 75 ++++++++++++++++--------------- 4 files changed, 84 insertions(+), 58 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 
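getPartPtrOnScratch keeps its pointer arithmetic across the qualifier change: it offsets the scratch base pointer by partNumber whole chunks of chunkReservedGB gigabytes each. Spelled out with explicit cast targets (the cast targets and the value of GB are assumptions here, following the conventions of Utils.h), the helper reads roughly as:

constexpr double GB = 1024. * 1024. * 1024.; // assumed value of the GB constant from Utils.h

template <class chunk_type>
__host__ __device__ inline chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber)
{
  // Jump partNumber chunks of chunkReservedGB gigabytes each into the scratch buffer.
  return reinterpret_cast<chunk_type*>(reinterpret_cast<char*>(scratchPtr) +
                                       static_cast<size_t>(GB * chunkReservedGB) * partNumber);
}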
6218a18015b0d..a4e7f71440347 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -32,7 +32,7 @@ class GPUbenchmark final { public: GPUbenchmark() = delete; // need for a configuration - GPUbenchmark(benchmarkOpts& opts, std::shared_ptr streamer) : mStreamer{streamer}, mOptions{opts} + GPUbenchmark(benchmarkOpts& opts, std::shared_ptr rWriter) : mResultWriter{rWriter}, mOptions{opts} { } virtual ~GPUbenchmark() = default; @@ -77,7 +77,7 @@ class GPUbenchmark final private: gpuState mState; - std::shared_ptr mStreamer; + std::shared_ptr mResultWriter; benchmarkOpts mOptions; }; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 319ac6a156539..e5ae595883c74 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -109,34 +111,56 @@ struct gpuState { }; // Interface class to stream results to root file -class ResultStreamer +class ResultWriter { public: - explicit ResultStreamer(const std::string resultsTreeFilename = "benchmark_results.root"); - ~ResultStreamer() = default; - void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); + explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root"); + ~ResultWriter() = default; + void storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry); void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); + void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks); + void snapshotBenchmark(const std::string bName, const std::string type); + void saveToFile(std::string filename = "benchmark_results.root"); private: std::string mResultsTreeFilename = "benchmark_results.root"; // output filename - TTree* mTree; // observer + // std::unordered_map> mBenchmarksChunk; + // std::unordered_map> mBenchmarksRegions; + std::vector mBenchmarkResults; + TBranch* mTmpBranch; + TTree* mTree; }; -inline ResultStreamer::ResultStreamer(const std::string resultsTreeFilename) +inline ResultWriter::ResultWriter(const std::string resultsTreeFilename) { mResultsTreeFilename = resultsTreeFilename; - mTree = new TTree(resultsTreeFilename.data(), resultsTreeFilename.data()); + mTree = new TTree("GPUbenchmarks", "GPUbenchmarks"); } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) +inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks) { - // (*mTree) - // << (benchmarkName + "_" + type + "_" + chunk).data() - // << "elapsed=" << entry - // << "\n"; + mTmpBranch = mTree->Branch((bName + "_" + type).data(), &mBenchmarkResults); + mBenchmarkResults.resize(nChunks); } -inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) +inline void ResultWriter::storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry) +{ + mBenchmarkResults[chunk] = entry; +} + +inline void ResultWriter::snapshotBenchmark(const std::string bName, const std::string type) +{ + mTree->Fill(); +} + +inline void ResultWriter::saveToFile(std::string filename) +{ + auto file = TFile::Open(filename.data(), "recreate"); + mTree->Write(); + file->Close(); +} + +inline void 
ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) { // (*mTree) // << (benchmarkName + "_" + type + "_region_" + region).data() @@ -153,5 +177,4 @@ inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std:: printf("\n"); \ printf("error: TEST FAILED\n%s", KNRM); \ exit(EXIT_FAILURE); - #endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 6b1d5338db5c1..83acae586d735 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -51,7 +51,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) return true; } -using o2::benchmark::ResultStreamer; +using o2::benchmark::ResultWriter; int main(int argc, const char* argv[]) { @@ -62,12 +62,12 @@ int main(int argc, const char* argv[]) return -1; } - std::shared_ptr streamer = std::make_shared(); + std::shared_ptr streamer = std::make_shared(); - o2::benchmark::GPUbenchmark bm_char{opts, streamer}; - bm_char.run(); - o2::benchmark::GPUbenchmark bm_int{opts, streamer}; - bm_int.run(); + // o2::benchmark::GPUbenchmark bm_char{opts, streamer}; + // bm_char.run(); + // o2::benchmark::GPUbenchmark bm_int{opts, streamer}; + // bm_int.run(); o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; bm_size_t.run(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e43b9dc1ab792..516e3d5327d4f 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -46,7 +46,7 @@ std::string getType() return std::string{"char"}; } if (typeid(T).name() == typeid(size_t).name()) { - return std::string{"unsigned long"}; + return std::string{"unsigned_long"}; } if (typeid(T).name() == typeid(int).name()) { return std::string{"int"}; @@ -402,6 +402,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) { switch (sl) { case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_R_SB", getType(), mState.getMaxChunks()); auto nBlocks{mState.nMultiprocessors}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; @@ -418,10 +419,12 @@ void GPUbenchmark::readSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_R_SB", std::to_string(iChunk), getType(), result); + mResultWriter.get()->storeBenchmarkEntry("seq_R_SB", getType(), iChunk, result); } + mResultWriter.get()->snapshotBenchmark("seq_R_SB", getType()); std::cout << "\033[1;32m complete\033[0m" << std::endl; } + mResultWriter->saveToFile(); break; } @@ -442,7 +445,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -474,7 +477,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); - mStreamer.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m 
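ResultWriter keeps one std::vector<float> branch per benchmark/type pair and one tree entry per repetition of the measurement; the intended call sequence from a benchmark body, mirroring the seq_R_SB case that is already wired up, is roughly as follows (include path and dummy values are illustrative):

#include "Utils.h" // Shared/Utils.h, where ResultWriter is defined
#include <memory>

// Sketch of the ResultWriter call sequence for one benchmark and one data type.
void writeDummyResults(int nChunks, int nTests)
{
  auto writer = std::make_shared<o2::benchmark::ResultWriter>("benchmark_results.root");
  writer->addBenchmarkEntry("seq_R_SB", "char", nChunks);           // one branch, one float slot per chunk
  for (int measurement = 0; measurement < nTests; ++measurement) {
    for (int iChunk = 0; iChunk < nChunks; ++iChunk) {
      writer->storeBenchmarkEntry("seq_R_SB", "char", iChunk, 1.f); // would be the measured elapsed ms
    }
    writer->snapshotBenchmark("seq_R_SB", "char");                  // TTree::Fill(): one entry per repetition
  }
  writer->saveToFile("benchmark_results.root");                     // TFile "recreate" + TTree::Write()
}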
complete\033[0m" << std::endl; } @@ -499,7 +502,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); - mStreamer.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -546,7 +549,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -570,7 +573,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -602,7 +605,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); - mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -627,7 +630,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); - mStreamer.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -675,7 +678,7 @@ void GPUbenchmark::copySequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -699,7 +702,7 @@ void GPUbenchmark::copySequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -731,7 +734,7 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); - mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_W_SB", 
std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -756,7 +759,7 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); - mStreamer.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -787,32 +790,32 @@ void GPUbenchmark::run() readInit(); // Reading in whole memory readSequential(SplitLevel::Blocks); - readSequential(SplitLevel::Threads); + // readSequential(SplitLevel::Threads); - // Reading in memory regions - readConcurrent(SplitLevel::Blocks); - readConcurrent(SplitLevel::Threads); + // // Reading in memory regions + // readConcurrent(SplitLevel::Blocks); + // readConcurrent(SplitLevel::Threads); readFinalize(); - writeInit(); - // Write on whole memory - writeSequential(SplitLevel::Blocks); - writeSequential(SplitLevel::Threads); - - // Write on memory regions - writeConcurrent(SplitLevel::Blocks); - writeConcurrent(SplitLevel::Threads); - writeFinalize(); - - copyInit(); - // Copy from input buffer (size = nChunks) on whole memory - copySequential(SplitLevel::Blocks); - copySequential(SplitLevel::Threads); - - // Copy from input buffer (size = nChunks) on memory regions - copyConcurrent(SplitLevel::Blocks); - copyConcurrent(SplitLevel::Threads); - copyFinalize(); + // writeInit(); + // // Write on whole memory + // writeSequential(SplitLevel::Blocks); + // writeSequential(SplitLevel::Threads); + + // // Write on memory regions + // writeConcurrent(SplitLevel::Blocks); + // writeConcurrent(SplitLevel::Threads); + // writeFinalize(); + + // copyInit(); + // // Copy from input buffer (size = nChunks) on whole memory + // copySequential(SplitLevel::Blocks); + // copySequential(SplitLevel::Threads); + + // // Copy from input buffer (size = nChunks) on memory regions + // copyConcurrent(SplitLevel::Blocks); + // copyConcurrent(SplitLevel::Threads); + // copyFinalize(); GPUbenchmark::globalFinalize(); } From cbef6f8da07b607e8c29890d34ad3566bace5044 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 14 Jul 2021 00:35:58 +0200 Subject: [PATCH 42/42] Ready to test on EPN --- GPU/GPUbenchmark/CMakeLists.txt | 1 - GPU/GPUbenchmark/Shared/Utils.h | 36 +++++------ GPU/GPUbenchmark/benchmark.cxx | 15 +++-- GPU/GPUbenchmark/cuda/Kernels.cu | 102 ++++++++++++++++++------------- 4 files changed, 86 insertions(+), 68 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 8a93dcec41101..e008ab4cc0f41 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -33,7 +33,6 @@ if(HIP_ENABLED) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}") -# sed '1{/\#include \"hip\/hip_runtime.h\"/d}' elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index e5ae595883c74..6d3400aa9a6ec 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ 
@@ -116,48 +116,48 @@ class ResultWriter
  public:
   explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root");
   ~ResultWriter() = default;
-  void storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry);
+  void storeBenchmarkEntry(int chunk, float entry);
   void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry);
   void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks);
-  void snapshotBenchmark(const std::string bName, const std::string type);
-  void saveToFile(std::string filename = "benchmark_results.root");
+  void snapshotBenchmark();
+  void saveToFile();
 
  private:
-  std::string mResultsTreeFilename = "benchmark_results.root"; // output filename
-  // std::unordered_map> mBenchmarksChunk;
-  // std::unordered_map> mBenchmarksRegions;
   std::vector<float> mBenchmarkResults;
-  TBranch* mTmpBranch;
-  TTree* mTree;
+  std::vector<TTree*> mBenchmarkTrees;
+  TFile* mOutfile;
 };
 
 inline ResultWriter::ResultWriter(const std::string resultsTreeFilename)
 {
-  mResultsTreeFilename = resultsTreeFilename;
-  mTree = new TTree("GPUbenchmarks", "GPUbenchmarks");
+  mOutfile = TFile::Open(resultsTreeFilename.data(), "recreate");
 }
 
 inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks)
 {
-  mTmpBranch = mTree->Branch((bName + "_" + type).data(), &mBenchmarkResults);
+  mBenchmarkTrees.emplace_back(new TTree((bName + "_" + type).data(), (bName + "_" + type).data()));
+  mBenchmarkResults.clear();
   mBenchmarkResults.resize(nChunks);
+  mBenchmarkTrees.back()->Branch("elapsed", &mBenchmarkResults);
 }
 
-inline void ResultWriter::storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry)
+inline void ResultWriter::storeBenchmarkEntry(int chunk, float entry)
 {
   mBenchmarkResults[chunk] = entry;
 }
 
-inline void ResultWriter::snapshotBenchmark(const std::string bName, const std::string type)
+inline void ResultWriter::snapshotBenchmark()
 {
-  mTree->Fill();
+  mBenchmarkTrees.back()->Fill();
 }
 
-inline void ResultWriter::saveToFile(std::string filename)
+inline void ResultWriter::saveToFile()
 {
-  auto file = TFile::Open(filename.data(), "recreate");
-  mTree->Write();
-  file->Close();
+  mOutfile->cd();
+  for (auto t : mBenchmarkTrees) {
+    t->Write();
+  }
+  mOutfile->Close();
 }
 
 inline void ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry)
diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx
index 83acae586d735..7ee638594f9e3 100644
--- a/GPU/GPUbenchmark/benchmark.cxx
+++ b/GPU/GPUbenchmark/benchmark.cxx
@@ -62,14 +62,17 @@ int main(int argc, const char* argv[])
     return -1;
   }
 
-  std::shared_ptr<ResultWriter> streamer = std::make_shared<ResultWriter>();
+  std::shared_ptr<ResultWriter> writer = std::make_shared<ResultWriter>();
 
-  o2::benchmark::GPUbenchmark<char> bm_char{opts, streamer};
-  bm_char.run();
-  o2::benchmark::GPUbenchmark<int> bm_int{opts, streamer};
-  bm_int.run();
-  o2::benchmark::GPUbenchmark<size_t> bm_size_t{opts, streamer};
+  o2::benchmark::GPUbenchmark<char> bm_char{opts, writer};
+  bm_char.run();
+  o2::benchmark::GPUbenchmark<int> bm_int{opts, writer};
+  bm_int.run();
+  o2::benchmark::GPUbenchmark<size_t> bm_size_t{opts, writer};
   bm_size_t.run();
 
+  // save results
+  writer.get()->saveToFile();
+
   return 0;
 }
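The Utils.h and benchmark.cxx hunks above define the whole contract of the refactored ResultWriter: one TTree per benchmark (named "<benchmark>_<type>"), a single "elapsed" branch holding the per-chunk vector, one Fill() per snapshot, and a single saveToFile() at the end of the run. The following is a minimal standalone sketch of that call sequence, not part of the patch; the include path, the "seq_read_SB" name and the fakeMeasurement() helper are assumptions for illustration, standing in for the timed kernel launches done in Kernels.cu below.

```cpp
#include <memory>
#include "Shared/Utils.h" // provides o2::benchmark::ResultWriter (include path assumed)

// Hypothetical stand-in for one timed kernel launch on a given chunk.
float fakeMeasurement(int chunk) { return 0.1f * chunk; }

int main()
{
  // Same ownership pattern as benchmark.cxx: one shared writer for all benchmarks.
  auto writer = std::make_shared<o2::benchmark::ResultWriter>("benchmark_results.root");

  const int nChunks = 4;    // e.g. mState.getMaxChunks()
  const int nLaunches = 10; // assumed number of repeated launches per benchmark

  // Creates the tree "seq_read_SB_int" with a vector<float> branch "elapsed".
  writer->addBenchmarkEntry("seq_read_SB", "int", nChunks);

  for (int iLaunch = 0; iLaunch < nLaunches; ++iLaunch) {
    for (int iChunk = 0; iChunk < nChunks; ++iChunk) {
      writer->storeBenchmarkEntry(iChunk, fakeMeasurement(iChunk)); // fill the per-chunk slot
    }
    writer->snapshotBenchmark(); // one tree entry per launch
  }

  writer->saveToFile(); // writes every tree and closes the output file
  return 0;
}
```

Building the sketch requires ROOT (TFile/TTree) on the include and link paths, exactly as the GPUbenchmark target already does.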
diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 516e3d5327d4f..8af91423c12e5 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -402,7 +402,7 @@ void GPUbenchmark::readSequential(SplitLevel sl)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
-      mResultWriter.get()->addBenchmarkEntry("seq_R_SB", getType(), mState.getMaxChunks());
+      mResultWriter.get()->addBenchmarkEntry("seq_read_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto capacity{mState.getPartitionCapacity()};
@@ -419,16 +419,16 @@ void GPUbenchmark::readSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        mResultWriter.get()->storeBenchmarkEntry("seq_R_SB", getType(), iChunk, result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
-      mResultWriter.get()->snapshotBenchmark("seq_R_SB", getType());
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
-    mResultWriter->saveToFile();
 
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("seq_read_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto capacity{mState.getPartitionCapacity()};
@@ -445,8 +445,9 @@ void GPUbenchmark::readSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -459,6 +460,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("conc_read_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto chunks{mState.getMaxChunks()};
@@ -476,14 +478,15 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("conc_read_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto chunks{mState.getMaxChunks()};
@@ -501,9 +504,9 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -533,6 +536,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("seq_write_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto capacity{mState.getPartitionCapacity()};
@@ -549,14 +553,16 @@ void GPUbenchmark::writeSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("seq_write_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto capacity{mState.getPartitionCapacity()};
@@ -573,8 +579,9 @@ void GPUbenchmark::writeSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -587,6 +594,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("conc_write_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto chunks{mState.getMaxChunks()};
@@ -604,14 +612,15 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("conc_write_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto chunks{mState.getMaxChunks()};
@@ -629,9 +638,9 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -662,6 +671,7 @@ void GPUbenchmark::copySequential(SplitLevel sl)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("seq_copy_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto capacity{mState.getPartitionCapacity()};
@@ -678,14 +688,16 @@ void GPUbenchmark::copySequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("seq_copy_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto capacity{mState.getPartitionCapacity()};
@@ -702,8 +714,9 @@ void GPUbenchmark::copySequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -716,6 +729,7 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("conc_copy_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto chunks{mState.getMaxChunks()};
@@ -733,14 +747,15 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("conc_copy_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto chunks{mState.getMaxChunks()};
@@ -759,8 +774,9 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions)
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
         auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -790,32 +806,32 @@ void GPUbenchmark::run()
   readInit();
   // Reading in whole memory
   readSequential(SplitLevel::Blocks);
-  // readSequential(SplitLevel::Threads);
+  readSequential(SplitLevel::Threads);
 
-  // // Reading in memory regions
-  // readConcurrent(SplitLevel::Blocks);
-  // readConcurrent(SplitLevel::Threads);
+  // Reading in memory regions
+  readConcurrent(SplitLevel::Blocks);
+  readConcurrent(SplitLevel::Threads);
   readFinalize();
 
-  // writeInit();
-  // // Write on whole memory
-  // writeSequential(SplitLevel::Blocks);
-  // writeSequential(SplitLevel::Threads);
-
-  // // Write on memory regions
-  // writeConcurrent(SplitLevel::Blocks);
-  // writeConcurrent(SplitLevel::Threads);
-  // writeFinalize();
-
-  // copyInit();
-  // // Copy from input buffer (size = nChunks) on whole memory
-  // copySequential(SplitLevel::Blocks);
-  // copySequential(SplitLevel::Threads);
-
-  // // Copy from input buffer (size = nChunks) on memory regions
-  // copyConcurrent(SplitLevel::Blocks);
-  // copyConcurrent(SplitLevel::Threads);
-  // copyFinalize();
+  writeInit();
+  // Write on whole memory
+  writeSequential(SplitLevel::Blocks);
+  writeSequential(SplitLevel::Threads);
+
+  // Write on memory regions
+  writeConcurrent(SplitLevel::Blocks);
+  writeConcurrent(SplitLevel::Threads);
+  writeFinalize();
+
+  copyInit();
+  // Copy from input buffer (size = nChunks) on whole memory
+  copySequential(SplitLevel::Blocks);
+  copySequential(SplitLevel::Threads);
+
+  // Copy from input buffer (size = nChunks) on memory regions
+  copyConcurrent(SplitLevel::Blocks);
+  copyConcurrent(SplitLevel::Threads);
+  copyFinalize();
 
   GPUbenchmark::globalFinalize();
 }
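After this last patch every benchmark variant writes one tree into benchmark_results.root, each with a single entry per repeated launch. A small read-back sketch such as the one below can be used to inspect the output; it is not part of the patch, and the file name, tree names (e.g. "seq_read_SB_int") and the "elapsed" branch are assumptions taken from the ResultWriter shown above.

```cpp
#include <TFile.h>
#include <TList.h>
#include <TTree.h>
#include <iostream>
#include <memory>
#include <vector>

int main()
{
  // Open the file produced by ResultWriter::saveToFile().
  std::unique_ptr<TFile> file{TFile::Open("benchmark_results.root", "read")};
  if (!file || file->IsZombie()) {
    std::cerr << "could not open benchmark_results.root\n";
    return 1;
  }

  // One TTree per benchmark, named "<benchmark>_<type>", e.g. "seq_read_SB_int".
  TIter nextKey(file->GetListOfKeys());
  while (auto key = nextKey()) {
    auto tree = dynamic_cast<TTree*>(file->Get(key->GetName()));
    if (!tree) {
      continue;
    }
    std::vector<float>* elapsed = nullptr;
    tree->SetBranchAddress("elapsed", &elapsed);
    // One entry per snapshotBenchmark() call, i.e. per repeated launch.
    for (Long64_t i = 0; i < tree->GetEntries(); ++i) {
      tree->GetEntry(i);
      std::cout << key->GetName() << " launch " << i << ":";
      for (auto value : *elapsed) { // per-chunk (or per-region) timing values
        std::cout << " " << value;
      }
      std::cout << "\n";
    }
  }
  return 0;
}
```

The same inspection can also be done interactively from the ROOT prompt; only the tree and branch layout created by addBenchmarkEntry() is relied upon here.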