From 244287b024ccf6b3d1a99a3390c7867b028d78ad Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 19 May 2021 23:34:32 +0200 Subject: [PATCH 01/42] Add CUDA backbone --- GPU/CMakeLists.txt | 1 + GPU/GPUBenchmark/CMakeLists.txt | 55 ++++++++++++++++++++++++ GPU/GPUBenchmark/GPUbenchmark.cu | 34 +++++++++++++++ GPU/GPUBenchmark/GPUbenchmark.h | 37 ++++++++++++++++ GPU/GPUBenchmark/macro/CMakeLists.txt | 4 ++ GPU/GPUBenchmark/macro/runGPUbenchmark.C | 9 ++++ 6 files changed, 140 insertions(+) create mode 100644 GPU/GPUBenchmark/CMakeLists.txt create mode 100644 GPU/GPUBenchmark/GPUbenchmark.cu create mode 100644 GPU/GPUBenchmark/GPUbenchmark.h create mode 100644 GPU/GPUBenchmark/macro/CMakeLists.txt create mode 100644 GPU/GPUBenchmark/macro/runGPUbenchmark.C diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index f8f1931f35547..74c3cbd6da6bc 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory(Common) add_subdirectory(Utils) add_subdirectory(TPCFastTransformation) add_subdirectory(GPUTracking) +add_subdirectory(GPUBenchmark) if(ALIGPU_BUILD_TYPE STREQUAL "O2") add_subdirectory(Workflow) endif() diff --git a/GPU/GPUBenchmark/CMakeLists.txt b/GPU/GPUBenchmark/CMakeLists.txt new file mode 100644 index 0000000000000..1274cf00407e0 --- /dev/null +++ b/GPU/GPUBenchmark/CMakeLists.txt @@ -0,0 +1,55 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +add_subdirectory(macro) +set(HDRS_INSTALL GPUbenchmark.h) + +o2_add_library(GPUbenchmarkCUDA + SOURCES GPUbenchmark.cu + PUBLIC_INCLUDE_DIRECTORIES . + PUBLIC_LINK_LIBRARIES O2::GPUCommon + TARGETVARNAME targetName) + + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) + +target_compile_definitions( + ${targetName} PRIVATE $) + +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) + +# set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) +# set(CMAKE_CXX_EXTENSIONS OFF) +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") +# # Hipify-perl +# set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") +# set(HIP_KERNEL "GPUbenchmark.hip.cxx") +# set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) +# set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/${HIP_KERNEL}") + +# if(EXISTS ${HIPIFY_EXECUTABLE}) +# # generate on-the-fly the HIP kernel +# execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") +# o2_add_library(GPUbenchmarkHIP +# SOURCES GPUbenchmark.hip.cxx +# PUBLIC_INCLUDE_DIRECTORIES . 
+# PUBLIC_LINK_LIBRARIES O2::GPUCommon +# hip::host +# hip::device +# TARGETVARNAME targetName) +# target_compile_definitions( +# ${targetName} PRIVATE $) + +# if(HIP_AMDGPUTARGET) +# # Need to add gpu target also to link flags due to gpu-rdc option +# target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) +# endif() +# elseif() +# message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") +# endif() \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.cu b/GPU/GPUBenchmark/GPUbenchmark.cu new file mode 100644 index 0000000000000..99c49558630f4 --- /dev/null +++ b/GPU/GPUBenchmark/GPUbenchmark.cu @@ -0,0 +1,34 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +/// +/// \file GPUbenchmark.cu +/// \author: mconcas@cern.ch + +#include +#include + +namespace o2 +{ +namespace benchmark +{ +namespace gpu +{ +GPUg() void helloKernel() +{ + printf("Hello World from GPU!!\n"); +} +} // namespace gpu + +void hello_util() +{ + gpu::helloKernel<<<1, 1>>>(); +} +} // namespace benchmark +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.h b/GPU/GPUBenchmark/GPUbenchmark.h new file mode 100644 index 0000000000000..fca4a05144c13 --- /dev/null +++ b/GPU/GPUBenchmark/GPUbenchmark.h @@ -0,0 +1,37 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file GPUbenchmark.h +/// \author: mconcas@cern.ch + +#include "GPUCommonDef.h" + +namespace o2 +{ +namespace benchmark +{ +void hello_util(); + +class GPUbenchmark final +{ + public: + GPUbenchmark() = default; + virtual ~GPUbenchmark() = default; + + void hello(); +}; + +// Steers +void GPUbenchmark::hello() +{ + hello_util(); +} +} // namespace benchmark +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUBenchmark/macro/CMakeLists.txt b/GPU/GPUBenchmark/macro/CMakeLists.txt new file mode 100644 index 0000000000000..dd74a1e43db7e --- /dev/null +++ b/GPU/GPUBenchmark/macro/CMakeLists.txt @@ -0,0 +1,4 @@ +o2_add_test_root_macro(runGPUbenchmark.C + PUBLIC_LINK_LIBRARIES O2::GPUbenchmarkCUDA + O2::GPUCommon + LABELS gpu COMPILE_ONLY) \ No newline at end of file diff --git a/GPU/GPUBenchmark/macro/runGPUbenchmark.C b/GPU/GPUBenchmark/macro/runGPUbenchmark.C new file mode 100644 index 0000000000000..b7e6c4138d260 --- /dev/null +++ b/GPU/GPUBenchmark/macro/runGPUbenchmark.C @@ -0,0 +1,9 @@ +#if !defined(__CLING__) || defined(__ROOTCLING__) +#include +#endif + +void runGPUbenchmark() +{ + o2::benchmark::GPUbenchmark bm{}; + bm.hello(); +} \ No newline at end of file From 4071b1156b705f22467d31f7b6fe856a2ff5fec1 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 20 May 2021 00:03:25 +0200 Subject: [PATCH 02/42] HIP breaks --- GPU/GPUBenchmark/CMakeLists.txt | 70 +++++++++++++-------------- GPU/GPUBenchmark/GPUbenchmark.hip.cxx | 35 ++++++++++++++ 2 files changed, 68 insertions(+), 37 deletions(-) create mode 100644 GPU/GPUBenchmark/GPUbenchmark.hip.cxx diff --git a/GPU/GPUBenchmark/CMakeLists.txt b/GPU/GPUBenchmark/CMakeLists.txt index 1274cf00407e0..75ca30fcf97f8 100644 --- a/GPU/GPUBenchmark/CMakeLists.txt +++ b/GPU/GPUBenchmark/CMakeLists.txt @@ -11,45 +11,41 @@ add_subdirectory(macro) set(HDRS_INSTALL GPUbenchmark.h) -o2_add_library(GPUbenchmarkCUDA - SOURCES GPUbenchmark.cu - PUBLIC_INCLUDE_DIRECTORIES . - PUBLIC_LINK_LIBRARIES O2::GPUCommon - TARGETVARNAME targetName) - - set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) +# o2_add_library(GPUbenchmarkCUDA +# SOURCES GPUbenchmark.cu +# PUBLIC_INCLUDE_DIRECTORIES . +# PUBLIC_LINK_LIBRARIES O2::GPUCommon +# TARGETVARNAME targetName) -target_compile_definitions( - ${targetName} PRIVATE $) +# set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) -# set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) -# set(CMAKE_CXX_EXTENSIONS OFF) -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") -# # Hipify-perl -# set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") -# set(HIP_KERNEL "GPUbenchmark.hip.cxx") -# set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) -# set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/${HIP_KERNEL}") +set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") +# Hipify-perl +set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") +set(HIP_KERNEL "GPUbenchmark.hip.cxx") +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) +set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") -# if(EXISTS ${HIPIFY_EXECUTABLE}) -# # generate on-the-fly the HIP kernel -# execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") -# o2_add_library(GPUbenchmarkHIP -# SOURCES GPUbenchmark.hip.cxx -# PUBLIC_INCLUDE_DIRECTORIES . 
-# PUBLIC_LINK_LIBRARIES O2::GPUCommon -# hip::host -# hip::device -# TARGETVARNAME targetName) -# target_compile_definitions( -# ${targetName} PRIVATE $) - -# if(HIP_AMDGPUTARGET) -# # Need to add gpu target also to link flags due to gpu-rdc option -# target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) -# endif() -# elseif() -# message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -# endif() \ No newline at end of file +if(EXISTS ${HIPIFY_EXECUTABLE}) +# generate on-the-fly the HIP kernel +execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") +o2_add_library(GPUbenchmarkHIP + SOURCES GPUbenchmark.hip.cxx + PUBLIC_INCLUDE_DIRECTORIES . + PUBLIC_LINK_LIBRARIES O2::GPUCommon + hip::host + hip::device + TARGETVARNAME targetName) + + if(HIP_AMDGPUTARGET) + message(FATAL_ERROR ${HIP_AMDGPUTARGET}) + # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) + endif() +elseif() + message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") +endif() \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.hip.cxx b/GPU/GPUBenchmark/GPUbenchmark.hip.cxx new file mode 100644 index 0000000000000..786bb72d5ae4e --- /dev/null +++ b/GPU/GPUBenchmark/GPUbenchmark.hip.cxx @@ -0,0 +1,35 @@ +#include "hip/hip_runtime.h" +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file GPUbenchmark.cu +/// \author: mconcas@cern.ch + +#include +#include + +namespace o2 +{ +namespace benchmark +{ +namespace gpu +{ +GPUg() void helloKernel() +{ + printf("Hello World from GPU!!\n"); +} +} // namespace gpu + +void hello_util() +{ + hipLaunchKernelGGL(gpu::helloKernel, dim3(1), dim3(1), 0, 0); +} +} // namespace benchmark +} // namespace o2 \ No newline at end of file From 04ca94bfcf5536688548144077a9cd5a10219df7 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 20 May 2021 22:52:57 +0200 Subject: [PATCH 03/42] Make two separate libraries --- GPU/CMakeLists.txt | 2 +- GPU/GPUBenchmark/GPUbenchmark.hip.cxx | 35 ------------------- GPU/GPUbenchmark/CMakeLists.txt | 13 +++++++ .../GPUbenchmark.h | 0 GPU/GPUbenchmark/cuda/CMakeLists.txt | 21 +++++++++++ .../cuda}/GPUbenchmark.cu | 0 .../hip}/CMakeLists.txt | 17 ++------- GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx | 0 .../macro/CMakeLists.txt | 0 .../macro/runGPUbenchmark.C | 0 10 files changed, 38 insertions(+), 50 deletions(-) delete mode 100644 GPU/GPUBenchmark/GPUbenchmark.hip.cxx create mode 100644 GPU/GPUbenchmark/CMakeLists.txt rename GPU/{GPUBenchmark => GPUbenchmark}/GPUbenchmark.h (100%) create mode 100644 GPU/GPUbenchmark/cuda/CMakeLists.txt rename GPU/{GPUBenchmark => GPUbenchmark/cuda}/GPUbenchmark.cu (100%) rename GPU/{GPUBenchmark => GPUbenchmark/hip}/CMakeLists.txt (74%) create mode 100644 GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx rename GPU/{GPUBenchmark => GPUbenchmark}/macro/CMakeLists.txt (100%) rename GPU/{GPUBenchmark => GPUbenchmark}/macro/runGPUbenchmark.C (100%) diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 74c3cbd6da6bc..7019f951b25fb 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -22,7 +22,7 @@ add_subdirectory(Common) add_subdirectory(Utils) add_subdirectory(TPCFastTransformation) add_subdirectory(GPUTracking) -add_subdirectory(GPUBenchmark) +add_subdirectory(GPUbenchmark) if(ALIGPU_BUILD_TYPE STREQUAL "O2") add_subdirectory(Workflow) endif() diff --git a/GPU/GPUBenchmark/GPUbenchmark.hip.cxx b/GPU/GPUBenchmark/GPUbenchmark.hip.cxx deleted file mode 100644 index 786bb72d5ae4e..0000000000000 --- a/GPU/GPUBenchmark/GPUbenchmark.hip.cxx +++ /dev/null @@ -1,35 +0,0 @@ -#include "hip/hip_runtime.h" -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". -// -// See http://alice-o2.web.cern.ch/license for full licensing information. -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. -/// -/// \file GPUbenchmark.cu -/// \author: mconcas@cern.ch - -#include -#include - -namespace o2 -{ -namespace benchmark -{ -namespace gpu -{ -GPUg() void helloKernel() -{ - printf("Hello World from GPU!!\n"); -} -} // namespace gpu - -void hello_util() -{ - hipLaunchKernelGGL(gpu::helloKernel, dim3(1), dim3(1), 0, 0); -} -} // namespace benchmark -} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt new file mode 100644 index 0000000000000..b41feca915d3d --- /dev/null +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -0,0 +1,13 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". 
+# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +add_subdirectory(macro) +add_subdirectory(cuda) +add_subdirectory(hip) \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.h b/GPU/GPUbenchmark/GPUbenchmark.h similarity index 100% rename from GPU/GPUBenchmark/GPUbenchmark.h rename to GPU/GPUbenchmark/GPUbenchmark.h diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..2eab170cde6a5 --- /dev/null +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +set(HDRS_INSTALL ../GPUbenchmark.h) + +o2_add_library(GPUbenchmarkCUDA + SOURCES GPUbenchmark.cu + PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_LINK_LIBRARIES O2::GPUCommon + TARGETVARNAME targetName) + + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) + +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) \ No newline at end of file diff --git a/GPU/GPUBenchmark/GPUbenchmark.cu b/GPU/GPUbenchmark/cuda/GPUbenchmark.cu similarity index 100% rename from GPU/GPUBenchmark/GPUbenchmark.cu rename to GPU/GPUbenchmark/cuda/GPUbenchmark.cu diff --git a/GPU/GPUBenchmark/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt similarity index 74% rename from GPU/GPUBenchmark/CMakeLists.txt rename to GPU/GPUbenchmark/hip/CMakeLists.txt index 75ca30fcf97f8..f90267d09b4e9 100644 --- a/GPU/GPUBenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -8,18 +8,7 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -add_subdirectory(macro) -set(HDRS_INSTALL GPUbenchmark.h) - -# o2_add_library(GPUbenchmarkCUDA -# SOURCES GPUbenchmark.cu -# PUBLIC_INCLUDE_DIRECTORIES . -# PUBLIC_LINK_LIBRARIES O2::GPUCommon -# TARGETVARNAME targetName) - -# set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) - -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) +set(HDRS_INSTALL ../GPUbenchmark.h) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) @@ -27,7 +16,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") set(HIP_KERNEL "GPUbenchmark.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/GPUbenchmark.cu) +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/GPUbenchmark.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) @@ -35,7 +24,7 @@ if(EXISTS ${HIPIFY_EXECUTABLE}) execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") o2_add_library(GPUbenchmarkHIP SOURCES GPUbenchmark.hip.cxx - PUBLIC_INCLUDE_DIRECTORIES . 
+ PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host hip::device diff --git a/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx b/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/GPU/GPUBenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt similarity index 100% rename from GPU/GPUBenchmark/macro/CMakeLists.txt rename to GPU/GPUbenchmark/macro/CMakeLists.txt diff --git a/GPU/GPUBenchmark/macro/runGPUbenchmark.C b/GPU/GPUbenchmark/macro/runGPUbenchmark.C similarity index 100% rename from GPU/GPUBenchmark/macro/runGPUbenchmark.C rename to GPU/GPUbenchmark/macro/runGPUbenchmark.C From aae7f42a2b56718a23319142d2f5bda0a744bdb6 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sun, 23 May 2021 21:42:12 +0200 Subject: [PATCH 04/42] Re-arrange directories --- GPU/GPUbenchmark/CMakeLists.txt | 3 +-- GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx | 15 +++++++++++++++ GPU/GPUbenchmark/Steer/BenchmarkSteer.h | 17 +++++++++++++++++ GPU/GPUbenchmark/Steer/CMakeLists.txt | 17 +++++++++++++++++ GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 12 ++++++++++++ .../{GPUbenchmark.h => Steer/Kernels/Kernels.h} | 6 ++++-- .../{ => Steer/Kernels}/cuda/CMakeLists.txt | 8 ++++---- .../Kernels/cuda/Kernels.cu} | 6 +++--- GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore | 1 + .../{ => Steer/Kernels}/hip/CMakeLists.txt | 12 +++++++----- GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx | 0 GPU/GPUbenchmark/macro/CMakeLists.txt | 2 +- GPU/GPUbenchmark/macro/runGPUbenchmark.C | 2 +- 13 files changed, 83 insertions(+), 18 deletions(-) create mode 100644 GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx create mode 100644 GPU/GPUbenchmark/Steer/BenchmarkSteer.h create mode 100644 GPU/GPUbenchmark/Steer/CMakeLists.txt create mode 100644 GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt rename GPU/GPUbenchmark/{GPUbenchmark.h => Steer/Kernels/Kernels.h} (92%) rename GPU/GPUbenchmark/{ => Steer/Kernels}/cuda/CMakeLists.txt (87%) rename GPU/GPUbenchmark/{cuda/GPUbenchmark.cu => Steer/Kernels/cuda/Kernels.cu} (88%) create mode 100644 GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore rename GPU/GPUbenchmark/{ => Steer/Kernels}/hip/CMakeLists.txt (85%) delete mode 100644 GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index b41feca915d3d..28a562412c927 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -9,5 +9,4 @@ # submit itself to any jurisdiction. add_subdirectory(macro) -add_subdirectory(cuda) -add_subdirectory(hip) \ No newline at end of file +add_subdirectory(Steer) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx b/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx new file mode 100644 index 0000000000000..fd0fc5db989cf --- /dev/null +++ b/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx @@ -0,0 +1,15 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file BenchmarkSteer.cxx +/// \author: mconcas@cern.ch + +#include + diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.h b/GPU/GPUbenchmark/Steer/BenchmarkSteer.h new file mode 100644 index 0000000000000..c246b649d834a --- /dev/null +++ b/GPU/GPUbenchmark/Steer/BenchmarkSteer.h @@ -0,0 +1,17 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. +/// +/// \file BenchmarkSteer.h +/// \author: mconcas@cern.ch + +#ifndef BENCHAMARKSTEER_H +#define BENCHAMARKSTEER_H + +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt new file mode 100644 index 0000000000000..ab0ee700120e1 --- /dev/null +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -0,0 +1,17 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. + +add_subdirectory(Kernels) + +o2_add_library(GPUBenchmark + SOURCES BenchmarkSteer.cxx + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + O2::CUDAbenchmark + TARGETVARNAME targetName) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt new file mode 100644 index 0000000000000..8f53feffba52f --- /dev/null +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -0,0 +1,12 @@ +# Copyright CERN and copyright holders of ALICE O2. This software is distributed +# under the terms of the GNU General Public License v3 (GPL Version 3), copied +# verbatim in the file "COPYING". +# +# See http://alice-o2.web.cern.ch/license for full licensing information. +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization or +# submit itself to any jurisdiction. 
+ +add_subdirectory(cuda) +add_subdirectory(hip) \ No newline at end of file diff --git a/GPU/GPUbenchmark/GPUbenchmark.h b/GPU/GPUbenchmark/Steer/Kernels/Kernels.h similarity index 92% rename from GPU/GPUbenchmark/GPUbenchmark.h rename to GPU/GPUbenchmark/Steer/Kernels/Kernels.h index fca4a05144c13..0390c93e33040 100644 --- a/GPU/GPUbenchmark/GPUbenchmark.h +++ b/GPU/GPUbenchmark/Steer/Kernels/Kernels.h @@ -10,6 +10,8 @@ /// /// \file GPUbenchmark.h /// \author: mconcas@cern.ch +#ifndef GPUBENCHMARK_H +#define GPUBENCHMARK_H #include "GPUCommonDef.h" @@ -24,7 +26,6 @@ class GPUbenchmark final public: GPUbenchmark() = default; virtual ~GPUbenchmark() = default; - void hello(); }; @@ -34,4 +35,5 @@ void GPUbenchmark::hello() hello_util(); } } // namespace benchmark -} // namespace o2 \ No newline at end of file +} // namespace o2 +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt similarity index 87% rename from GPU/GPUbenchmark/cuda/CMakeLists.txt rename to GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt index 2eab170cde6a5..f9866db53131f 100644 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt @@ -8,14 +8,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -set(HDRS_INSTALL ../GPUbenchmark.h) +set(HDRS_INSTALL ../Kernels.h) -o2_add_library(GPUbenchmarkCUDA - SOURCES GPUbenchmark.cu +o2_add_library(CUDAbenchmark + SOURCES Kernels.cu PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark) \ No newline at end of file +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/GPUbenchmark.cu b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu similarity index 88% rename from GPU/GPUbenchmark/cuda/GPUbenchmark.cu rename to GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu index 99c49558630f4..b023eafcadfe2 100644 --- a/GPU/GPUbenchmark/cuda/GPUbenchmark.cu +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu @@ -8,10 +8,10 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. /// -/// \file GPUbenchmark.cu +/// \file Kernels.cu /// \author: mconcas@cern.ch -#include +#include #include namespace o2 @@ -22,7 +22,7 @@ namespace gpu { GPUg() void helloKernel() { - printf("Hello World from GPU!!\n"); + printf("Hello World from GPU!\n"); } } // namespace gpu diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore b/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore new file mode 100644 index 0000000000000..14f27f00c53c2 --- /dev/null +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore @@ -0,0 +1 @@ +*.hip.cxx \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt similarity index 85% rename from GPU/GPUbenchmark/hip/CMakeLists.txt rename to GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index f90267d09b4e9..a2aeae011e7aa 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -8,22 +8,24 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
-set(HDRS_INSTALL ../GPUbenchmark.h) +set(HDRS_INSTALL ../Kernels.h) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") + # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") -set(HIP_KERNEL "GPUbenchmark.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/GPUbenchmark.cu) +set(HIP_KERNEL "Kernels.hip.cxx") +message(FATAL_ERROR "${CMAKE_CURRENT_SOURCE_DIR}") +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) # generate on-the-fly the HIP kernel execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") -o2_add_library(GPUbenchmarkHIP - SOURCES GPUbenchmark.hip.cxx +o2_add_library(HIPbenchmark + SOURCES Kernels.hip.cxx PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host diff --git a/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx b/GPU/GPUbenchmark/hip/GPUbenchmark.hip.cxx deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/GPU/GPUbenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt index dd74a1e43db7e..ff210986e7205 100644 --- a/GPU/GPUbenchmark/macro/CMakeLists.txt +++ b/GPU/GPUbenchmark/macro/CMakeLists.txt @@ -1,4 +1,4 @@ o2_add_test_root_macro(runGPUbenchmark.C - PUBLIC_LINK_LIBRARIES O2::GPUbenchmarkCUDA + PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark O2::GPUCommon LABELS gpu COMPILE_ONLY) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/runGPUbenchmark.C b/GPU/GPUbenchmark/macro/runGPUbenchmark.C index b7e6c4138d260..0d3bf53899984 100644 --- a/GPU/GPUbenchmark/macro/runGPUbenchmark.C +++ b/GPU/GPUbenchmark/macro/runGPUbenchmark.C @@ -1,5 +1,5 @@ #if !defined(__CLING__) || defined(__ROOTCLING__) -#include +#include #endif void runGPUbenchmark() From 3cc969580d3384c60979db200a34cc18faa402fb Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sun, 23 May 2021 21:44:22 +0200 Subject: [PATCH 05/42] Add missing header --- GPU/GPUbenchmark/Steer/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt index ab0ee700120e1..83841550410b4 100644 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(Kernels) o2_add_library(GPUBenchmark SOURCES BenchmarkSteer.cxx + PUBLIC_INCLUDE_DIRECTORIES . PUBLIC_LINK_LIBRARIES O2::HIPbenchmark O2::CUDAbenchmark TARGETVARNAME targetName) \ No newline at end of file From 1f317913e6f31efd9d8a549005f6de719e14ea17 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sun, 23 May 2021 22:17:21 +0200 Subject: [PATCH 06/42] Meta library does not compile --- GPU/GPUbenchmark/Steer/CMakeLists.txt | 13 ++++++------- GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt | 3 +-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt index 83841550410b4..e0afe118e576f 100644 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -8,11 +8,10 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -add_subdirectory(Kernels) +# o2_add_library(GPUBenchmark +# SOURCES BenchmarkSteer.cxx +# PUBLIC_INCLUDE_DIRECTORIES . 
+# PUBLIC_LINK_LIBRARIES O2::HIPbenchmark +# O2::CUDAbenchmark) -o2_add_library(GPUBenchmark - SOURCES BenchmarkSteer.cxx - PUBLIC_INCLUDE_DIRECTORIES . - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - O2::CUDAbenchmark - TARGETVARNAME targetName) \ No newline at end of file +add_subdirectory(Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index a2aeae011e7aa..52eef2dd8b420 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -17,7 +17,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") set(HIP_KERNEL "Kernels.hip.cxx") -message(FATAL_ERROR "${CMAKE_CURRENT_SOURCE_DIR}") + set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") @@ -33,7 +33,6 @@ o2_add_library(HIPbenchmark TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) - message(FATAL_ERROR ${HIP_AMDGPUTARGET}) # Need to add gpu target also to link flags due to gpu-rdc option target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) endif() From 175be3f455d082d8f6d94ec951044bc45bbdc959 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 24 May 2021 18:50:29 +0200 Subject: [PATCH 07/42] Port hipInfo example to test gpu specs --- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 12 +- .../Steer/Kernels/cuda/Kernels.cu | 175 +++++++++++++++++- .../Steer/Kernels/hip/CMakeLists.txt | 13 +- GPU/GPUbenchmark/macro/CMakeLists.txt | 2 +- 4 files changed, 193 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt index 8f53feffba52f..43e9caa230d88 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -8,5 +8,13 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -add_subdirectory(cuda) -add_subdirectory(hip) \ No newline at end of file +# if(CUDA_ENABLED) + # message("Building CUDA benchmark library") + # add_subdirectory(cuda) + # target_compile_definitions(${targetName} PRIVATE CUDA_ENABLED) +# endif() +if(HIP_ENABLED) + message("Building HIP benchmark library") + add_subdirectory(hip) + # target_compile_definitions(${targetName} PRIVATE HIP_ENABLED) +endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu index b023eafcadfe2..ae8651f916f7a 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu @@ -12,7 +12,48 @@ /// \author: mconcas@cern.ch #include -#include +#include +#include + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define failed(...) 
\ + printf("%serror: ", KRED); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + printf("error: TEST FAILED\n%s", KNRM); \ + exit(EXIT_FAILURE); + +#define GPUCHECK(error) \ + if (error != cudaSuccess) { \ + printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ + __LINE__, KNRM); \ + failed("API returned error code."); \ + } + +void printCompilerInfo() +{ +#ifdef __NVCC__ + printf("compiler: nvcc\n"); +#endif +} + +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } + +#define printLimit(w1, limit, units) \ + { \ + size_t val; \ + cudaDeviceGetLimit(&val, limit); \ + std::cout << setw(w1) << #limit ": " << val << " " << units << std::endl; \ + } namespace o2 { @@ -25,10 +66,140 @@ GPUg() void helloKernel() printf("Hello World from GPU!\n"); } } // namespace gpu +void printDeviceProp(int deviceId) +{ + using namespace std; + const int w1 = 34; + cout << left; + cout << setw(w1) + << "--------------------------------------------------------------------------------" + << endl; + cout << setw(w1) << "device#" << deviceId << endl; + + cudaDeviceProp props; + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + + cout << setw(w1) << "Name: " << props.name << endl; + cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; + cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; + cout << setw(w1) << "pciDomainID: " << props.pciDomainID << endl; + cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor + << endl; + cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; + cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" + << endl; + cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; + cout << setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 + << " Mhz" << endl; + cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) + << bytesToGB(props.totalGlobalMem) << " GB" << endl; +#if !defined(__CUDACC__) + cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) + << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << endl; +#endif +#if defined(__HIPCC__) + cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl; +#endif + cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; + cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" + << endl; + cout << setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << endl; + cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize: " << props.warpSize << endl; + cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl; + cout << setw(w1) << "computeMode: " << props.computeMode << endl; + cout << setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << endl; + cout << setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << endl; + cout 
<< setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << endl; + cout << setw(w1) << "major: " << props.major << endl; + cout << setw(w1) << "minor: " << props.minor << endl; + cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl; + cout << setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << endl; + cout << setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << endl; +#if defined(__HIPCC__) + cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl; + cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch + << endl; + cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl; + cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch + << endl; + cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl; + cout << setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl; + cout << setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl; + cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl; + cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl; + cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl; + cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl; + cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl; + cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl; + cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; + cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; + cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; + cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; + cout << setw(w1) << "gcnArchName: " << props.gcnArchName << endl; +#endif + cout << setw(w1) << "isIntegrated: " << props.integrated << endl; + cout << setw(w1) << "maxTexture1D: " << props.maxTexture1D << endl; + cout << setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << endl; + cout << setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << endl; + cout << setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << endl; + cout << setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << endl; + cout << setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << endl; +#if defined(__HIPCC__) + cout << setw(w1) << "isLargeBar: " << props.isLargeBar << endl; + cout << setw(w1) << "asicRevision: " << props.asicRevision << endl; +#endif + + int deviceCnt; + cudaGetDeviceCount(&deviceCnt); + cout << setw(w1) << "peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + cudaDeviceCanAccessPeer(&isPeer, i, deviceId); + if (isPeer) { + cout << "device#" << i << " "; + } + } + cout << endl; + cout << setw(w1) << "non-peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + cudaDeviceCanAccessPeer(&isPeer, i, deviceId); + if (!isPeer) { + cout << "device#" << i << " "; + } + } + cout << endl; + + size_t free, total; + cudaMemGetInfo(&free, &total); + + cout << fixed << setprecision(2); + cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl; + cout << setw(w1) << "memInfo.free: " << 
bytesToGB(free) << " GB (" << setprecision(0) + << (float)free / total * 100.0 << "%)" << endl; +} void hello_util() { - gpu::helloKernel<<<1, 1>>>(); + int deviceCnt; + + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + + for (int i = 0; i < deviceCnt; i++) { + cudaSetDevice(i); + printDeviceProp(i); + } + + // gpu::helloKernel<<<1, 1>>>(); + // displayCard(); } } // namespace benchmark } // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index 52eef2dd8b420..a40cc6a77204a 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -18,12 +18,14 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") set(HIP_KERNEL "Kernels.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) +set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) -# generate on-the-fly the HIP kernel -execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") + +# Generate on-the-fly the HIP kernel +message("Generating HIP kernel code on the fly...") +execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx PUBLIC_INCLUDE_DIRECTORIES ../ @@ -34,8 +36,11 @@ o2_add_library(HIPbenchmark if(HIP_AMDGPUTARGET) # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) endif() elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -endif() \ No newline at end of file +endif() + +install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt index ff210986e7205..556948e819a65 100644 --- a/GPU/GPUbenchmark/macro/CMakeLists.txt +++ b/GPU/GPUbenchmark/macro/CMakeLists.txt @@ -1,4 +1,4 @@ o2_add_test_root_macro(runGPUbenchmark.C - PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark O2::GPUCommon LABELS gpu COMPILE_ONLY) \ No newline at end of file From ed9ade90d8f903c95a3ad2b3ae9385bbb65c2702 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 25 May 2021 16:35:56 +0200 Subject: [PATCH 08/42] Fix compilation to test build on EPN --- GPU/GPUbenchmark/Steer/CMakeLists.txt | 6 +++--- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt index e0afe118e576f..aeca995eb458f 100644 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/CMakeLists.txt @@ -8,10 +8,10 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. +add_subdirectory(Kernels) + # o2_add_library(GPUBenchmark # SOURCES BenchmarkSteer.cxx # PUBLIC_INCLUDE_DIRECTORIES . 
# PUBLIC_LINK_LIBRARIES O2::HIPbenchmark -# O2::CUDAbenchmark) - -add_subdirectory(Kernels) \ No newline at end of file +# O2::CUDAbenchmark) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt index 43e9caa230d88..3626924ac5cc2 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -8,11 +8,11 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -# if(CUDA_ENABLED) - # message("Building CUDA benchmark library") - # add_subdirectory(cuda) +if(CUDA_ENABLED) + message("Building CUDA benchmark library") + add_subdirectory(cuda) # target_compile_definitions(${targetName} PRIVATE CUDA_ENABLED) -# endif() +endif() if(HIP_ENABLED) message("Building HIP benchmark library") add_subdirectory(hip) From d74f0b437412082235195c7972688bfa0f795fb4 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 27 May 2021 11:41:08 +0200 Subject: [PATCH 09/42] Produce two separate executables --- GPU/GPUbenchmark/Steer/BenchmarkSteer.h | 17 -------------- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 17 ++++++++++++-- .../benchmark.cxx} | 12 ++++++++-- .../Steer/Kernels/cuda/Kernels.cu | 22 ++++++++++++++++++- .../Steer/Kernels/hip/CMakeLists.txt | 8 +++---- 5 files changed, 50 insertions(+), 26 deletions(-) delete mode 100644 GPU/GPUbenchmark/Steer/BenchmarkSteer.h rename GPU/GPUbenchmark/Steer/{BenchmarkSteer.cxx => Kernels/benchmark.cxx} (73%) diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.h b/GPU/GPUbenchmark/Steer/BenchmarkSteer.h deleted file mode 100644 index c246b649d834a..0000000000000 --- a/GPU/GPUbenchmark/Steer/BenchmarkSteer.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". -// -// See http://alice-o2.web.cern.ch/license for full licensing information. -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
-/// -/// \file BenchmarkSteer.h -/// \author: mconcas@cern.ch - -#ifndef BENCHAMARKSTEER_H -#define BENCHAMARKSTEER_H - -#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt index 3626924ac5cc2..911d567cda350 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt @@ -11,10 +11,23 @@ if(CUDA_ENABLED) message("Building CUDA benchmark library") add_subdirectory(cuda) - # target_compile_definitions(${targetName} PRIVATE CUDA_ENABLED) + o2_add_executable(memory-benchmark-cuda + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + TARGETVARNAME targetName) endif() + if(HIP_ENABLED) message("Building HIP benchmark library") add_subdirectory(hip) - # target_compile_definitions(${targetName} PRIVATE HIP_ENABLED) + set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") + + o2_add_executable(memory-benchmark-hip + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + hip::host + hip::device + TARGETVARNAME targetName) endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx b/GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx similarity index 73% rename from GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx rename to GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx index fd0fc5db989cf..19cd67d354942 100644 --- a/GPU/GPUbenchmark/Steer/BenchmarkSteer.cxx +++ b/GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx @@ -8,8 +8,16 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. /// -/// \file BenchmarkSteer.cxx +/// \file benchmark.cxx /// \author: mconcas@cern.ch -#include +#include +#include +int main() +{ + std::cout << "HELLO WORLD" << std::endl; + o2::benchmark::GPUbenchmark bm{}; + bm.hello(); + return 0; +} diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu index ae8651f916f7a..d9aa3ac88359c 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu @@ -65,6 +65,7 @@ GPUg() void helloKernel() { printf("Hello World from GPU!\n"); } + } // namespace gpu void printDeviceProp(int deviceId) { @@ -202,4 +203,23 @@ void hello_util() // displayCard(); } } // namespace benchmark -} // namespace o2 \ No newline at end of file +} // namespace o2 + +/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. +I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. + +Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. +For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. +We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. +We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. + +For the tests I want to run in the segments, I think these should be: +- Linear read in a multithreaded way: i.e. 
the standard GPU for loop: +for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; +In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. +- Then I'd do the same with some stride, i.e.: +foo += array[i * stride]; +- I'd try a random access with some simple linear congruence RNG per thread to determine the address. +- Then I'd do the same with writing memory, and with copying memory. +- Finally the data type should be flexible, going from char to uint4. +That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt index a40cc6a77204a..93192c2c5a3d0 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt @@ -34,11 +34,11 @@ o2_add_library(HIPbenchmark hip::device TARGETVARNAME targetName) - if(HIP_AMDGPUTARGET) - # Need to add gpu target also to link flags due to gpu-rdc option + # if(HIP_AMDGPUTARGET) + # # Need to add gpu target also to link flags due to gpu-rdc option - target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) - endif() + # target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) + # endif() elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() From 9454b545ee5893e45afe84a7eafd66df6b46ce3f Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 27 May 2021 14:47:12 +0200 Subject: [PATCH 10/42] Flatten dir tree a bit --- GPU/GPUbenchmark/CMakeLists.txt | 25 ++++++++++++-- .../{Steer/Kernels => Shared}/Kernels.h | 0 GPU/GPUbenchmark/Steer/CMakeLists.txt | 17 ---------- GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt | 33 ------------------- .../{Steer/Kernels => }/benchmark.cxx | 0 .../{Steer/Kernels => }/cuda/CMakeLists.txt | 6 ++-- .../{Steer/Kernels => }/cuda/Kernels.cu | 0 .../{Steer/Kernels => }/hip/.gitignore | 0 .../{Steer/Kernels => }/hip/CMakeLists.txt | 11 ++----- GPU/GPUbenchmark/macro/CMakeLists.txt | 4 --- GPU/GPUbenchmark/macro/runGPUbenchmark.C | 9 ----- 11 files changed, 29 insertions(+), 76 deletions(-) rename GPU/GPUbenchmark/{Steer/Kernels => Shared}/Kernels.h (100%) delete mode 100644 GPU/GPUbenchmark/Steer/CMakeLists.txt delete mode 100644 GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt rename GPU/GPUbenchmark/{Steer/Kernels => }/benchmark.cxx (100%) rename GPU/GPUbenchmark/{Steer/Kernels => }/cuda/CMakeLists.txt (80%) rename GPU/GPUbenchmark/{Steer/Kernels => }/cuda/Kernels.cu (100%) rename GPU/GPUbenchmark/{Steer/Kernels => }/hip/.gitignore (100%) rename GPU/GPUbenchmark/{Steer/Kernels => }/hip/CMakeLists.txt (80%) delete mode 100644 GPU/GPUbenchmark/macro/CMakeLists.txt delete mode 100644 GPU/GPUbenchmark/macro/runGPUbenchmark.C diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 28a562412c927..911d567cda350 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -8,5 +8,26 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
-add_subdirectory(macro) -add_subdirectory(Steer) \ No newline at end of file +if(CUDA_ENABLED) + message("Building CUDA benchmark library") + add_subdirectory(cuda) + o2_add_executable(memory-benchmark-cuda + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + TARGETVARNAME targetName) +endif() + +if(HIP_ENABLED) + message("Building HIP benchmark library") + add_subdirectory(hip) + set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") + + o2_add_executable(memory-benchmark-hip + SOURCES benchmark.cxx + PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + hip::host + hip::device + TARGETVARNAME targetName) +endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/Kernels.h rename to GPU/GPUbenchmark/Shared/Kernels.h diff --git a/GPU/GPUbenchmark/Steer/CMakeLists.txt b/GPU/GPUbenchmark/Steer/CMakeLists.txt deleted file mode 100644 index aeca995eb458f..0000000000000 --- a/GPU/GPUbenchmark/Steer/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. - -add_subdirectory(Kernels) - -# o2_add_library(GPUBenchmark -# SOURCES BenchmarkSteer.cxx -# PUBLIC_INCLUDE_DIRECTORIES . -# PUBLIC_LINK_LIBRARIES O2::HIPbenchmark -# O2::CUDAbenchmark) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt b/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt deleted file mode 100644 index 911d567cda350..0000000000000 --- a/GPU/GPUbenchmark/Steer/Kernels/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. 
- -if(CUDA_ENABLED) - message("Building CUDA benchmark library") - add_subdirectory(cuda) - o2_add_executable(memory-benchmark-cuda - SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark - TARGETVARNAME targetName) -endif() - -if(HIP_ENABLED) - message("Building HIP benchmark library") - add_subdirectory(hip) - set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) - set(CMAKE_CXX_EXTENSIONS OFF) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") - - o2_add_executable(memory-benchmark-hip - SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - hip::host - hip::device - TARGETVARNAME targetName) -endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/benchmark.cxx rename to GPU/GPUbenchmark/benchmark.cxx diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt similarity index 80% rename from GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt rename to GPU/GPUbenchmark/cuda/CMakeLists.txt index f9866db53131f..3ce3990534ece 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -8,14 +8,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. -set(HDRS_INSTALL ../Kernels.h) +set(HDRS_INSTALL ../Shared/Kernels.h) o2_add_library(CUDAbenchmark SOURCES Kernels.cu - PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file +# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/cuda/Kernels.cu rename to GPU/GPUbenchmark/cuda/Kernels.cu diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore b/GPU/GPUbenchmark/hip/.gitignore similarity index 100% rename from GPU/GPUbenchmark/Steer/Kernels/hip/.gitignore rename to GPU/GPUbenchmark/hip/.gitignore diff --git a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt similarity index 80% rename from GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt rename to GPU/GPUbenchmark/hip/CMakeLists.txt index 93192c2c5a3d0..27e8a15efdc20 100644 --- a/GPU/GPUbenchmark/Steer/Kernels/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -8,7 +8,7 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
-set(HDRS_INSTALL ../Kernels.h) +set(HDRS_INSTALL ../Shared/Kernels.h) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) @@ -28,19 +28,14 @@ message("Generating HIP kernel code on the fly...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx - PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host hip::device TARGETVARNAME targetName) - # if(HIP_AMDGPUTARGET) - # # Need to add gpu target also to link flags due to gpu-rdc option - - # target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) - # endif() elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() -install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file +# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/CMakeLists.txt b/GPU/GPUbenchmark/macro/CMakeLists.txt deleted file mode 100644 index 556948e819a65..0000000000000 --- a/GPU/GPUbenchmark/macro/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -o2_add_test_root_macro(runGPUbenchmark.C - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - O2::GPUCommon - LABELS gpu COMPILE_ONLY) \ No newline at end of file diff --git a/GPU/GPUbenchmark/macro/runGPUbenchmark.C b/GPU/GPUbenchmark/macro/runGPUbenchmark.C deleted file mode 100644 index 0d3bf53899984..0000000000000 --- a/GPU/GPUbenchmark/macro/runGPUbenchmark.C +++ /dev/null @@ -1,9 +0,0 @@ -#if !defined(__CLING__) || defined(__ROOTCLING__) -#include -#endif - -void runGPUbenchmark() -{ - o2::benchmark::GPUbenchmark bm{}; - bm.hello(); -} \ No newline at end of file From 84f8534610d10f112ec17c60afae8955ad12f6d6 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 27 May 2021 15:30:25 +0200 Subject: [PATCH 11/42] Cleanup --- GPU/GPUbenchmark/Shared/Common.h | 44 +++++ GPU/GPUbenchmark/Shared/Kernels.h | 37 ++++- GPU/GPUbenchmark/benchmark.cxx | 3 +- GPU/GPUbenchmark/cuda/CMakeLists.txt | 4 +- GPU/GPUbenchmark/cuda/Kernels.cu | 238 +++++++++++---------------- GPU/GPUbenchmark/hip/CMakeLists.txt | 6 +- 6 files changed, 170 insertions(+), 162 deletions(-) create mode 100644 GPU/GPUbenchmark/Shared/Common.h diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h new file mode 100644 index 0000000000000..b98d01923dc61 --- /dev/null +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -0,0 +1,44 @@ +// Copyright CERN and copyright holders of ALICE O2. This software is +// distributed under the terms of the GNU General Public License v3 (GPL +// Version 3), copied verbatim in the file "COPYING". +// +// See http://alice-o2.web.cern.ch/license for full licensing information. +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+/// +/// \file Common.h +/// \author: mconcas@cern.ch + +#ifndef GPUBENCHMARK_COMMON_H +#define GPUBENCHMARK_COMMON_H +#if defined (__HIPCC__) +#include "hip/hip_runtime.h" +#endif +#if defined (__HIPCC__) +#define AUTO_DISCARD "auto discard =" +#else +#define AUTO_DISCARD +#endif + +#include +#include +#include "GPUCommonDef.h" + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define failed(...) \ + printf("%serror: ", KRED); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + printf("error: TEST FAILED\n%s", KNRM); \ + exit(EXIT_FAILURE); +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 0390c93e33040..514eede070e73 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -8,10 +8,11 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. /// -/// \file GPUbenchmark.h +/// \file Kernels.h /// \author: mconcas@cern.ch -#ifndef GPUBENCHMARK_H -#define GPUBENCHMARK_H + +#ifndef GPU_BENCHMARK_KERNELS_H +#define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" @@ -19,21 +20,41 @@ namespace o2 { namespace benchmark { -void hello_util(); +void printDevices(); class GPUbenchmark final { public: GPUbenchmark() = default; virtual ~GPUbenchmark() = default; - void hello(); + void run(); }; // Steers -void GPUbenchmark::hello() +void GPUbenchmark::run() { - hello_util(); + printDevices(); } } // namespace benchmark } // namespace o2 -#endif \ No newline at end of file +#endif + + +/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. +I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. + +Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. +For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. +We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. +We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. + +For the tests I want to run in the segments, I think these should be: +- Linear read in a multithreaded way: i.e. the standard GPU for loop: +for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; +In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. +- Then I'd do the same with some stride, i.e.: +foo += array[i * stride]; +- I'd try a random access with some simple linear congruence RNG per thread to determine the address. +- Then I'd do the same with writing memory, and with copying memory. +- Finally the data type should be flexible, going from char to uint4. 
+That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 19cd67d354942..aaaa0ffbbe390 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -16,8 +16,7 @@ int main() { - std::cout << "HELLO WORLD" << std::endl; o2::benchmark::GPUbenchmark bm{}; - bm.hello(); + bm.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt index 3ce3990534ece..0e8415cef1262 100644 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -16,6 +16,4 @@ o2_add_library(CUDAbenchmark PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) - set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) - -# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index d9aa3ac88359c..7d2070d7065d8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -12,24 +12,7 @@ /// \author: mconcas@cern.ch #include -#include -#include - -#define KNRM "\x1B[0m" -#define KRED "\x1B[31m" -#define KGRN "\x1B[32m" -#define KYEL "\x1B[33m" -#define KBLU "\x1B[34m" -#define KMAG "\x1B[35m" -#define KCYN "\x1B[36m" -#define KWHT "\x1B[37m" - -#define failed(...) \ - printf("%serror: ", KRED); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - printf("error: TEST FAILED\n%s", KNRM); \ - exit(EXIT_FAILURE); +#include #define GPUCHECK(error) \ if (error != cudaSuccess) { \ @@ -38,13 +21,6 @@ failed("API returned error code."); \ } -void printCompilerInfo() -{ -#ifdef __NVCC__ - printf("compiler: nvcc\n"); -#endif -} - double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } @@ -61,165 +37,137 @@ namespace benchmark { namespace gpu { -GPUg() void helloKernel() -{ - printf("Hello World from GPU!\n"); -} - +// Kernels here } // namespace gpu void printDeviceProp(int deviceId) { - using namespace std; const int w1 = 34; - cout << left; - cout << setw(w1) - << "--------------------------------------------------------------------------------" - << endl; - cout << setw(w1) << "device#" << deviceId << endl; + std::cout << std::left; + std::cout << std::setw(w1) + << "--------------------------------------------------------------------------------" + << std::endl; + std::cout << std::setw(w1) << "device#" << deviceId << std::endl; cudaDeviceProp props; GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); - cout << setw(w1) << "Name: " << props.name << endl; - cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; - cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; - cout << setw(w1) << "pciDomainID: " << props.pciDomainID << endl; - cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; - cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor - << endl; - cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; - cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; - cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" - << endl; - cout << setw(w1) << 
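
The design note added to Kernels.h above lists the access patterns the benchmark is meant to cover: a grid-stride linear read, a strided read, and a random read driven by a simple per-thread linear congruential generator, each accumulating into a value that is finally written out so the compiler cannot drop the loads. Below is a minimal CUDA sketch of what such kernels could look like; the kernel names, the sink buffer (one element per launched thread) and the 64-bit LCG constants are illustrative assumptions, not code taken from these patches.

  #include <cstddef>
  #include <cstdint>

  // Grid-stride linear read over one segment; sink holds one element per launched thread.
  template <class T>
  __global__ void readLinear(const T* segment, size_t nElements, T* sink)
  {
    T foo{0};
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nElements; i += blockDim.x * gridDim.x) {
      foo += segment[i];
    }
    sink[blockIdx.x * blockDim.x + threadIdx.x] = foo; // write the result so the reads are not optimized away
  }

  // Same loop with a configurable stride (stride >= 1, in elements).
  template <class T>
  __global__ void readStrided(const T* segment, size_t nElements, size_t stride, T* sink)
  {
    T foo{0};
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i * stride < nElements; i += blockDim.x * gridDim.x) {
      foo += segment[i * stride];
    }
    sink[blockIdx.x * blockDim.x + threadIdx.x] = foo;
  }

  // Random access: a per-thread 64-bit LCG picks the next index to read.
  template <class T>
  __global__ void readRandom(const T* segment, size_t nElements, size_t nReads, T* sink)
  {
    uint64_t state = blockIdx.x * blockDim.x + threadIdx.x + 1;
    T foo{0};
    for (size_t r = 0; r < nReads; ++r) {
      state = state * 6364136223846793005ull + 1442695040888963407ull; // commonly used 64-bit LCG constants
      foo += segment[state % nElements];
    }
    sink[blockIdx.x * blockDim.x + threadIdx.x] = foo;
  }

Per the note, each kernel would be launched with one block per multiprocessor (for example readLinear<<<props.multiProcessorCount, 256>>>(segment, nElements, sink)) so that blocks map one to one onto the SMs, and run long enough that the initial scheduling becomes irrelevant.
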
"memoryBusWidth: " << props.memoryBusWidth << endl; - cout << setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 - << " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) - << bytesToGB(props.totalGlobalMem) << " GB" << endl; + std::cout << std::setw(w1) << "Name: " << props.name << std::endl; + std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl; + std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl; + std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl; + std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor + << std::endl; + std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl; + std::cout << std::setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl; + std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" + << std::endl; + std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl; + std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 + << " Mhz" << std::endl; + std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2) + << bytesToGB(props.totalGlobalMem) << " GB" << std::endl; #if !defined(__CUDACC__) - cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) - << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << endl; + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << std::endl; #endif #if defined(__HIPCC__) - cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) - << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << endl; + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << std::endl; #endif - cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; - cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" - << endl; - cout << setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << endl; - cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl; - cout << setw(w1) << "warpSize: " << props.warpSize << endl; - cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl; - cout << setw(w1) << "computeMode: " << props.computeMode << endl; - cout << setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << endl; - cout << setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << endl; - cout << setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << endl; - cout << setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << endl; - cout << setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << endl; - cout << setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << endl; - cout << setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << endl; - cout << setw(w1) << "major: " << props.major << endl; - cout << setw(w1) << "minor: " << props.minor << endl; - cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl; - cout << setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << endl; - cout 
<< setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << endl; + std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl; + std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" + << std::endl; + std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl; + std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl; + std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl; + std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl; + std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl; + std::cout << std::setw(w1) << "major: " << props.major << std::endl; + std::cout << std::setw(w1) << "minor: " << props.minor << std::endl; + std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl; + std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl; + std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl; #if defined(__HIPCC__) - cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl; - cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch - << endl; - cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl; - cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch - << endl; - cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl; - cout << setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl; - cout << setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl; - cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl; - cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl; - cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl; - cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl; - cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl; - cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl; - cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; - cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; - cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; - cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; - cout << setw(w1) << "gcnArchName: " << props.gcnArchName << endl; + std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << 
std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl; + std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl; + std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl; + std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl; + std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl; + std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl; + std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl; + std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl; #endif - cout << setw(w1) << "isIntegrated: " << props.integrated << endl; - cout << setw(w1) << "maxTexture1D: " << props.maxTexture1D << endl; - cout << setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << endl; - cout << setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << endl; - cout << setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << endl; - cout << setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << endl; - cout << setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << endl; + std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl; + std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl; #if defined(__HIPCC__) - cout << setw(w1) << "isLargeBar: " << props.isLargeBar << endl; - cout << setw(w1) << "asicRevision: " << props.asicRevision << endl; + std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl; + std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl; #endif int deviceCnt; - cudaGetDeviceCount(&deviceCnt); - cout << setw(w1) << "peers: "; + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + std::cout << std::setw(w1) << "peers: "; for (int i = 0; i < deviceCnt; i++) { int isPeer; - cudaDeviceCanAccessPeer(&isPeer, i, 
deviceId); + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); if (isPeer) { - cout << "device#" << i << " "; + std::cout << "device#" << i << " "; } } - cout << endl; - cout << setw(w1) << "non-peers: "; + std::cout << std::endl; + std::cout << std::setw(w1) << "non-peers: "; for (int i = 0; i < deviceCnt; i++) { int isPeer; - cudaDeviceCanAccessPeer(&isPeer, i, deviceId); + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); if (!isPeer) { - cout << "device#" << i << " "; + std::cout << "device#" << i << " "; } } - cout << endl; + std::cout << std::endl; size_t free, total; - cudaMemGetInfo(&free, &total); + GPUCHECK(cudaMemGetInfo(&free, &total)); - cout << fixed << setprecision(2); - cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl; - cout << setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << setprecision(0) - << (float)free / total * 100.0 << "%)" << endl; + std::cout << std::fixed << std::setprecision(2); + std::cout << std::setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << std::endl; + std::cout << std::setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << std::setprecision(0) + << (float)free / total * 100.0 << "%)" << std::endl; } -void hello_util() +void printDevices() { int deviceCnt; - GPUCHECK(cudaGetDeviceCount(&deviceCnt)); for (int i = 0; i < deviceCnt; i++) { - cudaSetDevice(i); + GPUCHECK(cudaSetDevice(i)); printDeviceProp(i); } - - // gpu::helloKernel<<<1, 1>>>(); - // displayCard(); } } // namespace benchmark -} // namespace o2 - -/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. -I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. - -Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. -For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. -We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. -We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. - -For the tests I want to run in the segments, I think these should be: -- Linear read in a multithreaded way: i.e. the standard GPU for loop: -for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; -In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. -- Then I'd do the same with some stride, i.e.: -foo += array[i * stride]; -- I'd try a random access with some simple linear congruence RNG per thread to determine the address. -- Then I'd do the same with writing memory, and with copying memory. -- Finally the data type should be flexible, going from char to uint4. 
-That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index 27e8a15efdc20..1447441c9da1a 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -24,7 +24,7 @@ set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) # Generate on-the-fly the HIP kernel -message("Generating HIP kernel code on the fly...") +message("Generating HIP kernel code ....") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx @@ -36,6 +36,4 @@ o2_add_library(HIPbenchmark elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -endif() - -# install(FILES ${HDRS_INSTALL} DESTINATION include/GPU/GPUBenchmark/Steer/Kernels) \ No newline at end of file +endif() \ No newline at end of file From 7e2ef6ae0e3589569070b0d58aa5cb4f3f37d3fc Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 28 May 2021 17:27:38 +0200 Subject: [PATCH 12/42] Add CMake forced re-configuration --- GPU/GPUbenchmark/Shared/Common.h | 7 +------ GPU/GPUbenchmark/Shared/Kernels.h | 23 ++++++++++++++++++++++- GPU/GPUbenchmark/cuda/Kernels.cu | 10 ++++++++++ GPU/GPUbenchmark/hip/CMakeLists.txt | 3 ++- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index b98d01923dc61..38f34cf4c1902 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -13,14 +13,9 @@ #ifndef GPUBENCHMARK_COMMON_H #define GPUBENCHMARK_COMMON_H -#if defined (__HIPCC__) +#if defined(__HIPCC__) #include "hip/hip_runtime.h" #endif -#if defined (__HIPCC__) -#define AUTO_DISCARD "auto discard =" -#else -#define AUTO_DISCARD -#endif #include #include diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 514eede070e73..5640f07a78e57 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,12 +15,32 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" +#include +#include +#include namespace o2 { namespace benchmark { void printDevices(); +void init(); +template +float measure(void (*task)(T...), const char* taskName, T&&... 
args) +{ + float diff{0.f}; + + auto start = std::chrono::high_resolution_clock::now(); + (*task)(std::forward(args)...); + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration diff_t{end - start}; + diff = diff_t.count(); + + std::cout << std::setw(2) << " - " << taskName << " completed in: " << diff << " ms" << std::endl; + + return diff; +} class GPUbenchmark final { @@ -33,7 +53,8 @@ class GPUbenchmark final // Steers void GPUbenchmark::run() { - printDevices(); + // printDevices(); + measure(&init, "Init"); } } // namespace benchmark } // namespace o2 diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 7d2070d7065d8..e555385c600f9 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -169,5 +169,15 @@ void printDevices() printDeviceProp(i); } } + +void init() +{ + size_t free, total; + GPUCHECK(cudaMemGetInfo(&free, &total)); + + void* devicePtr; + GPUCHECK(cudaMalloc(&devicePtr, total)); +} + } // namespace benchmark } // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index 1447441c9da1a..a785a74ec40c5 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -22,9 +22,10 @@ set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) +set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) # Generate on-the-fly the HIP kernel -message("Generating HIP kernel code ....") +message("Generating HIP kernel code ...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx From 664759fe46879ad435ce2bc2c595c06758d87852 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 31 May 2021 19:13:27 +0200 Subject: [PATCH 13/42] HIP can't find symbols --- GPU/GPUbenchmark/Shared/Common.h | 1 - GPU/GPUbenchmark/Shared/Kernels.h | 66 +++++++++++++------- GPU/GPUbenchmark/benchmark.cxx | 2 +- GPU/GPUbenchmark/cuda/Kernels.cu | 94 +++++++++++++++++++++++------ GPU/GPUbenchmark/hip/CMakeLists.txt | 7 +++ 5 files changed, 130 insertions(+), 40 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 38f34cf4c1902..c831ac46882d8 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -19,7 +19,6 @@ #include #include -#include "GPUCommonDef.h" #define KNRM "\x1B[0m" #define KRED "\x1B[31m" diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 5640f07a78e57..f8983139718b9 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,52 +15,76 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" +#include #include #include #include +#define PARTITION_SIZE_GB 1 +#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.99f +#define GB 1073741824 + +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } + namespace o2 { namespace benchmark { -void printDevices(); -void init(); -template -float measure(void (*task)(T...), const char* taskName, T&&... 
args) -{ - float diff{0.f}; - auto start = std::chrono::high_resolution_clock::now(); - (*task)(std::forward(args)...); - auto end = std::chrono::high_resolution_clock::now(); +template +struct gpuState { + int getMaxSegments() + { + return bytesToGB(allocatedMemory); + } - std::chrono::duration diff_t{end - start}; - diff = diff_t.count(); + void computeBufferPointers() + { + addresses.resize(getMaxSegments()); + for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { + addresses[iBuffAddress] = scratchPtr + GB * PARTITION_SIZE_GB * iBuffAddress; + } + } - std::cout << std::setw(2) << " - " << taskName << " completed in: " << diff << " ms" << std::endl; + std::vector getBuffersPointers() + { + return addresses; + } - return diff; -} + std::vector addresses; + size_t allocatedMemory; + T* scratchPtr; + + //Static info + size_t totalMemory; + size_t nMultiprocessors; + size_t nMaxThreadsPerBlock; +}; +template class GPUbenchmark final { public: GPUbenchmark() = default; virtual ~GPUbenchmark() = default; + template + float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); + + void init(const int deviceId); void run(); + void finalize(); + void readingBenchmark(); + void printDevices(); + + private: + gpuState mState; }; -// Steers -void GPUbenchmark::run() -{ - // printDevices(); - measure(&init, "Init"); -} } // namespace benchmark } // namespace o2 #endif - /*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index aaaa0ffbbe390..610028a8e16f6 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -16,7 +16,7 @@ int main() { - o2::benchmark::GPUbenchmark bm{}; + o2::benchmark::GPUbenchmark bm{}; bm.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e555385c600f9..3bc8566ef3191 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -21,24 +21,27 @@ failed("API returned error code."); \ } -double bytesToKB(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } - -#define printLimit(w1, limit, units) \ - { \ - size_t val; \ - cudaDeviceGetLimit(&val, limit); \ - std::cout << setw(w1) << #limit ": " << val << " " << units << std::endl; \ - } - namespace o2 { namespace benchmark { namespace gpu { -// Kernels here +// Kernels go here +template +GPUg() void readerKernel( + // buffer_type* buffer, + // size_t bufferSize) +) +{ + printf("ciao"); + // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { + // if (i == 0) { + // } + // } +} } // namespace gpu + void printDeviceProp(int deviceId) { const int w1 = 34; @@ -159,7 +162,22 @@ void printDeviceProp(int deviceId) << (float)free / total * 100.0 << "%)" << std::endl; } -void printDevices() +template +template +float GPUbenchmark::measure(void (GPUbenchmark::*task)(T...), const char* taskName, T&&... 
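
One detail of computeBufferPointers() above is worth spelling out: scratchPtr is a T*, so scratchPtr + GB * PARTITION_SIZE_GB * iBuffAddress advances in units of sizeof(T) rather than bytes; a later patch in this series switches to a char* cast to obtain byte offsets. The following is a small host-side sketch of that byte-based partitioning, with illustrative names (partitionScratch, GiB) that do not appear in the patches.

  #include <cstddef>
  #include <vector>

  constexpr size_t GiB = 1024ull * 1024ull * 1024ull;

  // Split a scratch allocation into 1-GiB partitions; offsets are computed in bytes,
  // so the partition boundaries do not depend on sizeof(T).
  template <class T>
  std::vector<T*> partitionScratch(T* scratchPtr, size_t scratchSizeBytes)
  {
    std::vector<T*> partitions;
    for (size_t offset = 0; offset + GiB <= scratchSizeBytes; offset += GiB) {
      partitions.push_back(reinterpret_cast<T*>(reinterpret_cast<char*>(scratchPtr) + offset));
    }
    return partitions;
  }

Each partition then holds GiB / sizeof(T) elements of the benchmarked type, which is what getArrayLength() reports later in the series.
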
args) +{ + float diff{0.f}; + auto start = std::chrono::high_resolution_clock::now(); + (this->*task)(std::forward(args)...); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff_t{end - start}; + diff = diff_t.count(); + std::cout << std::setw(2) << ">>> " << taskName << " completed in: " << diff << " ms" << std::endl; + return diff; +} + +template +void GPUbenchmark::printDevices() { int deviceCnt; GPUCHECK(cudaGetDeviceCount(&deviceCnt)); @@ -170,14 +188,56 @@ void printDevices() } } -void init() +template +void GPUbenchmark::init(const int deviceId) { - size_t free, total; - GPUCHECK(cudaMemGetInfo(&free, &total)); + cudaDeviceProp props; + size_t free; + + // Fetch and store traits + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); - void* devicePtr; - GPUCHECK(cudaMalloc(&devicePtr, total)); + mState.nMultiprocessors = props.multiProcessorCount; + mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; + mState.allocatedMemory = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); + + // Setup + GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.allocatedMemory)); +} + +template +void GPUbenchmark::readingBenchmark() +{ + dim3 nBlocks(mState.nMultiprocessors); + dim3 nThreads(mState.nMaxThreadsPerBlock); + gpu::readerKernel<<<1, 1>>>(); +} + +template +void GPUbenchmark::finalize() +{ + GPUCHECK(cudaFree(mState.scratchPtr)); +} + +template +void GPUbenchmark::run() +{ + printDevices(); + measure(&GPUbenchmark::init, "Init", 0); + std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory + << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; + std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; + mState.computeBufferPointers(); + + // for (auto& addr : mState.getBuffersPointers()) { + // std::cout << (void*)addr << std::endl; + // } + measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + GPUbenchmark::finalize(); } +template class GPUbenchmark; + } // namespace benchmark } // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index a785a74ec40c5..6d41c72148374 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -35,6 +35,13 @@ o2_add_library(HIPbenchmark hip::device TARGETVARNAME targetName) +target_compile_definitions(${targetName} PRIVATE $) + +if(HIP_AMDGPUTARGET) + # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) +endif() + elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() \ No newline at end of file From 1d352ab03ab27f6c011de64d698a1ea85c53d3b9 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Jun 2021 11:54:08 +0200 Subject: [PATCH 14/42] Checkpoint before radical change --- GPU/GPUbenchmark/CMakeLists.txt | 12 ++- GPU/GPUbenchmark/cuda/CMakeLists.txt | 4 +- GPU/GPUbenchmark/cuda/Kernels.cu | 107 ++++++++++++++++++++++----- GPU/GPUbenchmark/hip/CMakeLists.txt | 14 ++-- 4 files changed, 102 insertions(+), 35 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 911d567cda350..30969425203a9 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -9,7 +9,6 @@ # submit itself to any jurisdiction. 
if(CUDA_ENABLED) - message("Building CUDA benchmark library") add_subdirectory(cuda) o2_add_executable(memory-benchmark-cuda SOURCES benchmark.cxx @@ -18,16 +17,21 @@ if(CUDA_ENABLED) endif() if(HIP_ENABLED) - message("Building HIP benchmark library") add_subdirectory(hip) set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") o2_add_executable(memory-benchmark-hip SOURCES benchmark.cxx PUBLIC_LINK_LIBRARIES O2::HIPbenchmark - hip::host - hip::device + hip::host TARGETVARNAME targetName) + +if(HIP_AMDGPUTARGET) +# Need to add gpu target also to link flags due to gpu-rdc option +target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) +endif() endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt index 0e8415cef1262..89a88969c2a3f 100644 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ b/GPU/GPUbenchmark/cuda/CMakeLists.txt @@ -14,6 +14,4 @@ o2_add_library(CUDAbenchmark SOURCES Kernels.cu PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon - TARGETVARNAME targetName) - - set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) \ No newline at end of file + TARGETVARNAME targetName) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 3bc8566ef3191..20e5187ba69f1 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -13,6 +13,7 @@ #include #include +#include #define GPUCHECK(error) \ if (error != cudaSuccess) { \ @@ -21,6 +22,15 @@ failed("API returned error code."); \ } +#define CHECK(cmd) \ + { \ + cudaError_t error = cmd; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } + namespace o2 { namespace benchmark @@ -28,18 +38,33 @@ namespace benchmark namespace gpu { // Kernels go here -template -GPUg() void readerKernel( - // buffer_type* buffer, - // size_t bufferSize) -) +/* + * Square each element in the array A and write to array C. 
+ */ +template +__global__ void + vector_square(T* C_d, T* A_d, size_t N) { - printf("ciao"); - // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { - // if (i == 0) { - // } - // } + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = offset; i < N; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } } + +// template +// GPUg() void readerKernel( +// // buffer_type* buffer, +// // size_t bufferSize) +// ) +// { +// printf("ciao"); +// // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { +// // if (i == 0) { +// // } +// // } +// } } // namespace gpu void printDeviceProp(int deviceId) @@ -209,9 +234,51 @@ void GPUbenchmark::init(const int deviceId) template void GPUbenchmark::readingBenchmark() { - dim3 nBlocks(mState.nMultiprocessors); - dim3 nThreads(mState.nMaxThreadsPerBlock); - gpu::readerKernel<<<1, 1>>>(); + // dim3 nBlocks(mState.nMultiprocessors); + // dim3 nThreads(mState.nMaxThreadsPerBlock); + // gpu::readerKernel<<<1, 1>>>(); + float *A_d, *C_d; + float *A_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + + cudaDeviceProp props; + CHECK(cudaGetDeviceProperties(&props, 0 /*deviceID*/)); + printf("info: running on device %s\n", props.name); + + printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); + A_h = (float*)malloc(Nbytes); + CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess); + C_h = (float*)malloc(Nbytes); + CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess); + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + } + + printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); + CHECK(cudaMalloc(&A_d, Nbytes)); + CHECK(cudaMalloc(&C_d, Nbytes)); + + printf("info: copy Host2Device\n"); + CHECK(cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice)); + + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + + printf("info: launch 'vector_square' kernel\n"); + gpu::vector_square<<>>(C_d, A_d, N); + + printf("info: copy Device2Host\n"); + CHECK(cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost)); + + printf("info: check result\n"); + for (size_t i = 0; i < N; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + CHECK(cudaErrorUnknown); + } + } + printf("PASSED!\n"); } template @@ -223,18 +290,18 @@ void GPUbenchmark::finalize() template void GPUbenchmark::run() { - printDevices(); - measure(&GPUbenchmark::init, "Init", 0); - std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory - << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; - std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; - mState.computeBufferPointers(); + // printDevices(); + // measure(&GPUbenchmark::init, "Init", 0); + // std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory + // << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; + // std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; + // mState.computeBufferPointers(); // for (auto& addr : mState.getBuffersPointers()) { // std::cout << (void*)addr << std::endl; // } measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); - GPUbenchmark::finalize(); + // GPUbenchmark::finalize(); } template class GPUbenchmark; diff --git 
a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt index 6d41c72148374..b599db0de6cc6 100644 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ b/GPU/GPUbenchmark/hip/CMakeLists.txt @@ -10,33 +10,31 @@ set(HDRS_INSTALL ../Shared/Kernels.h) -set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") # Hipify-perl set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") -set(HIP_KERNEL "Kernels.hip.cxx") +set(HIP_KERNEL "Kernels.hip.cxx") set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) - -# Generate on-the-fly the HIP kernel message("Generating HIP kernel code ...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") o2_add_library(HIPbenchmark SOURCES Kernels.hip.cxx PUBLIC_INCLUDE_DIRECTORIES ../Shared PUBLIC_LINK_LIBRARIES O2::GPUCommon - hip::host - hip::device + # hip::host + # hip::device TARGETVARNAME targetName) -target_compile_definitions(${targetName} PRIVATE $) - if(HIP_AMDGPUTARGET) # Need to add gpu target also to link flags due to gpu-rdc option target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) From 12c5d394f61e0ec0d2375ca260668d52741ad25e Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Jun 2021 12:39:12 +0200 Subject: [PATCH 15/42] Create single executable --- GPU/GPUbenchmark/CMakeLists.txt | 36 ++++++++++++++++------ GPU/GPUbenchmark/Shared/Common.h | 5 ++-- GPU/GPUbenchmark/Shared/Kernels.h | 5 +--- GPU/GPUbenchmark/benchmark.cxx | 2 +- GPU/GPUbenchmark/cuda/CMakeLists.txt | 17 ----------- GPU/GPUbenchmark/cuda/Kernels.cu | 7 +++-- GPU/GPUbenchmark/hip/CMakeLists.txt | 45 ---------------------------- 7 files changed, 37 insertions(+), 80 deletions(-) delete mode 100644 GPU/GPUbenchmark/cuda/CMakeLists.txt delete mode 100644 GPU/GPUbenchmark/hip/CMakeLists.txt diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 30969425203a9..0829637fee964 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -8,17 +8,34 @@ # granted to it by virtue of its status as an Intergovernmental Organization or # submit itself to any jurisdiction. 
+set(HDRS_INSTALL ../Shared/Kernels.h) + if(CUDA_ENABLED) - add_subdirectory(cuda) + # add_subdirectory(cuda) o2_add_executable(memory-benchmark-cuda SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::CUDAbenchmark + cuda/Kernels.cu + PUBLIC_LINK_LIBRARIES O2::GPUCommon TARGETVARNAME targetName) endif() if(HIP_ENABLED) - add_subdirectory(hip) - set(CMAKE_CXX_COMPILER ${hip_HIPCC_EXECUTABLE}) + # Hipify-perl + set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") + + set(HIP_KERNEL "Kernels.hip.cxx") + set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu) + set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/hip/${HIP_KERNEL}") + + if(EXISTS ${HIPIFY_EXECUTABLE}) + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) + message("Generating HIP kernel code ...") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") + elseif() + message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") + endif() + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) set(CMAKE_CXX_EXTENSIONS OFF) @@ -26,12 +43,13 @@ if(HIP_ENABLED) o2_add_executable(memory-benchmark-hip SOURCES benchmark.cxx - PUBLIC_LINK_LIBRARIES O2::HIPbenchmark + hip/Kernels.hip.cxx + PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host TARGETVARNAME targetName) -if(HIP_AMDGPUTARGET) -# Need to add gpu target also to link flags due to gpu-rdc option -target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) -endif() + if(HIP_AMDGPUTARGET) + # Need to add gpu target also to link flags due to gpu-rdc option + target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) + endif() endif() \ No newline at end of file diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index c831ac46882d8..99db8e114aee6 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -11,8 +11,8 @@ /// \file Common.h /// \author: mconcas@cern.ch -#ifndef GPUBENCHMARK_COMMON_H -#define GPUBENCHMARK_COMMON_H +#ifndef GPU_BENCHMARK_COMMON_H +#define GPU_BENCHMARK_COMMON_H #if defined(__HIPCC__) #include "hip/hip_runtime.h" #endif @@ -35,4 +35,5 @@ printf("\n"); \ printf("error: TEST FAILED\n%s", KNRM); \ exit(EXIT_FAILURE); + #endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index f8983139718b9..32d93159ac958 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -24,9 +24,6 @@ #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.99f #define GB 1073741824 -double bytesToKB(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } - namespace o2 { namespace benchmark @@ -36,7 +33,7 @@ template struct gpuState { int getMaxSegments() { - return bytesToGB(allocatedMemory); + return (double)allocatedMemory / (1024.0 * 1024.0 * 1024.0); } void computeBufferPointers() diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 610028a8e16f6..89c2b3d79be76 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -12,7 +12,7 @@ /// \author: mconcas@cern.ch #include -#include +#include "Shared/Kernels.h" int main() { diff --git a/GPU/GPUbenchmark/cuda/CMakeLists.txt b/GPU/GPUbenchmark/cuda/CMakeLists.txt deleted file mode 100644 index 89a88969c2a3f..0000000000000 --- a/GPU/GPUbenchmark/cuda/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright CERN and copyright 
holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. - -set(HDRS_INSTALL ../Shared/Kernels.h) - -o2_add_library(CUDAbenchmark - SOURCES Kernels.cu - PUBLIC_INCLUDE_DIRECTORIES ../Shared - PUBLIC_LINK_LIBRARIES O2::GPUCommon - TARGETVARNAME targetName) \ No newline at end of file diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 20e5187ba69f1..a2953c5b48d9f 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -11,10 +11,13 @@ /// \file Kernels.cu /// \author: mconcas@cern.ch -#include -#include +#include "../Shared/Kernels.h" +#include "../Shared/Common.h" #include +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } + #define GPUCHECK(error) \ if (error != cudaSuccess) { \ printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ diff --git a/GPU/GPUbenchmark/hip/CMakeLists.txt b/GPU/GPUbenchmark/hip/CMakeLists.txt deleted file mode 100644 index b599db0de6cc6..0000000000000 --- a/GPU/GPUbenchmark/hip/CMakeLists.txt +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". -# -# See http://alice-o2.web.cern.ch/license for full licensing information. -# -# In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. 
- -set(HDRS_INSTALL ../Shared/Kernels.h) - -set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) -set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") - -# Hipify-perl -set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl") - -set(HIP_KERNEL "Kernels.hip.cxx") -set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/../cuda/Kernels.cu) -set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${HIP_KERNEL}") - -if(EXISTS ${HIPIFY_EXECUTABLE}) -set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) -message("Generating HIP kernel code ...") -execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") -o2_add_library(HIPbenchmark - SOURCES Kernels.hip.cxx - PUBLIC_INCLUDE_DIRECTORIES ../Shared - PUBLIC_LINK_LIBRARIES O2::GPUCommon - # hip::host - # hip::device - TARGETVARNAME targetName) - -if(HIP_AMDGPUTARGET) - # Need to add gpu target also to link flags due to gpu-rdc option - target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET}) -endif() - -elseif() - message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") -endif() \ No newline at end of file From 0a7ae2ec7dfc1c4d536e942b904b914115c1ab46 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 1 Jun 2021 17:27:27 +0200 Subject: [PATCH 16/42] Update --- GPU/GPUbenchmark/CMakeLists.txt | 4 +- GPU/GPUbenchmark/Shared/Kernels.h | 5 ++ GPU/GPUbenchmark/benchmark.cxx | 7 +- GPU/GPUbenchmark/cuda/Kernels.cu | 130 +++++++++++------------------- 4 files changed, 58 insertions(+), 88 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 0829637fee964..71ba08edeb975 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -12,7 +12,7 @@ set(HDRS_INSTALL ../Shared/Kernels.h) if(CUDA_ENABLED) # add_subdirectory(cuda) - o2_add_executable(memory-benchmark-cuda + o2_add_executable(gpu-memory-benchmark-cuda SOURCES benchmark.cxx cuda/Kernels.cu PUBLIC_LINK_LIBRARIES O2::GPUCommon @@ -41,7 +41,7 @@ if(HIP_ENABLED) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc") - o2_add_executable(memory-benchmark-hip + o2_add_executable(gpu-memory-benchmark-hip SOURCES benchmark.cxx hip/Kernels.hip.cxx PUBLIC_LINK_LIBRARIES O2::GPUCommon diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 32d93159ac958..a27938ad320ab 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -44,6 +44,11 @@ struct gpuState { } } + size_t getArrayLength() + { + return static_cast(GB * PARTITION_SIZE_GB / sizeof(T)); + } + std::vector getBuffersPointers() { return addresses; diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 89c2b3d79be76..7a83888bd57ed 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -16,7 +16,10 @@ int main() { - o2::benchmark::GPUbenchmark bm{}; - bm.run(); + o2::benchmark::GPUbenchmark bm_char{}; + bm_char.run(); + o2::benchmark::GPUbenchmark bm_int{}; + bm_int.run(); + return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index a2953c5b48d9f..ffb0af087c147 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -25,14 +25,14 @@ double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } failed("API returned error code."); \ } -#define CHECK(cmd) 
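
getArrayLength() above converts the fixed 1 GiB partition size from bytes into elements of the benchmarked type. A compile-time check of that arithmetic, assuming the usual 64-bit sizes (sizeof(int) == 4, sizeof(size_t) == 8); the helper name arrayLength is illustrative:

  #include <cstddef>

  constexpr size_t GB = 1073741824;       // same constant as in Kernels.h
  constexpr size_t PARTITION_SIZE_GB = 1;

  template <class T>
  constexpr size_t arrayLength() { return GB * PARTITION_SIZE_GB / sizeof(T); }

  static_assert(arrayLength<char>() == 1073741824, "1 GiB of 1-byte elements");
  static_assert(arrayLength<int>() == 268435456, "1 GiB of 4-byte elements");
  static_assert(arrayLength<size_t>() == 134217728, "1 GiB of 8-byte elements");
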
\ - { \ - cudaError_t error = cmd; \ - if (error != cudaSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } +// #define CHECK(cmd) \ +// { \ +// cudaError_t error = cmd; \ +// if (error != cudaSuccess) { \ +// fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ +// exit(EXIT_FAILURE); \ +// } \ +// } namespace o2 { @@ -44,30 +44,26 @@ namespace gpu /* * Square each element in the array A and write to array C. */ -template -__global__ void - vector_square(T* C_d, T* A_d, size_t N) -{ - size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x; +// template +// __global__ void +// vector_square(T* C_d, T* A_d, size_t N) +// { +// size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); +// size_t stride = blockDim.x * gridDim.x; - for (size_t i = offset; i < N; i += stride) { - C_d[i] = A_d[i] * A_d[i]; +// for (size_t i = offset; i < N; i += stride) { +// C_d[i] = A_d[i] * A_d[i]; +// } +// } + +template +GPUg() void readerKernel( + buffer_type* buffer, + size_t bufferSize) +{ + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { } } - -// template -// GPUg() void readerKernel( -// // buffer_type* buffer, -// // size_t bufferSize) -// ) -// { -// printf("ciao"); -// // for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { -// // if (i == 0) { -// // } -// // } -// } } // namespace gpu void printDeviceProp(int deviceId) @@ -232,56 +228,22 @@ void GPUbenchmark::init(const int deviceId) // Setup GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.allocatedMemory)); -} -template -void GPUbenchmark::readingBenchmark() -{ - // dim3 nBlocks(mState.nMultiprocessors); - // dim3 nThreads(mState.nMaxThreadsPerBlock); - // gpu::readerKernel<<<1, 1>>>(); - float *A_d, *C_d; - float *A_h, *C_h; - size_t N = 1000000; - size_t Nbytes = N * sizeof(float); + mState.computeBufferPointers(); - cudaDeviceProp props; - CHECK(cudaGetDeviceProperties(&props, 0 /*deviceID*/)); - printf("info: running on device %s\n", props.name); - - printf("info: allocate host mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); - A_h = (float*)malloc(Nbytes); - CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess); - C_h = (float*)malloc(Nbytes); - CHECK(C_h == 0 ? 
cudaErrorMemoryAllocation : cudaSuccess); - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - } - - printf("info: allocate device mem (%6.2f MB)\n", 2 * Nbytes / 1024.0 / 1024.0); - CHECK(cudaMalloc(&A_d, Nbytes)); - CHECK(cudaMalloc(&C_d, Nbytes)); - - printf("info: copy Host2Device\n"); - CHECK(cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice)); + for (size_t iAddr{0}; iAddr < mState.getBuffersPointers().size(); ++iAddr) { - const unsigned blocks = 512; - const unsigned threadsPerBlock = 256; + } - printf("info: launch 'vector_square' kernel\n"); - gpu::vector_square<<>>(C_d, A_d, N); - printf("info: copy Device2Host\n"); - CHECK(cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost)); +} - printf("info: check result\n"); - for (size_t i = 0; i < N; i++) { - if (C_h[i] != A_h[i] * A_h[i]) { - CHECK(cudaErrorUnknown); - } - } - printf("PASSED!\n"); +template +void GPUbenchmark::readingBenchmark() +{ + dim3 nBlocks(mState.nMultiprocessors); + dim3 nThreads(mState.nMaxThreadsPerBlock); + // gpu::readerKernel<<>>(); } template @@ -294,20 +256,20 @@ template void GPUbenchmark::run() { // printDevices(); - // measure(&GPUbenchmark::init, "Init", 0); - // std::cout << " ├ Allocated " << mState.allocatedMemory << "/" << mState.totalMemory - // << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; - // std::cout << " └ Can do " << mState.getMaxSegments() << " of 1GB memory segments\n"; - // mState.computeBufferPointers(); - - // for (auto& addr : mState.getBuffersPointers()) { - // std::cout << (void*)addr << std::endl; - // } - measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); - // GPUbenchmark::finalize(); + measure(&GPUbenchmark::init, "Init", 0); + std::cout << " ├ Allocated: " << mState.allocatedMemory << "/" << mState.totalMemory + << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; + std::cout << " ├ Can do: " << mState.getMaxSegments() << " segments of " << PARTITION_SIZE_GB << "GB each\n"; + std::cout << " └ Length of arrays in segments: " << mState.getArrayLength() << std::endl; + + + // measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + GPUbenchmark::finalize(); } template class GPUbenchmark; +template class GPUbenchmark; +template class GPUbenchmark; } // namespace benchmark } // namespace o2 \ No newline at end of file From 6c352b15ce004b36df534f5c7babfef678f591ac Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 21 Jun 2021 17:28:26 +0200 Subject: [PATCH 17/42] Add first dummy benchmark --- GPU/GPUbenchmark/Shared/Common.h | 6 +- GPU/GPUbenchmark/Shared/Kernels.h | 70 +++++++++---------- GPU/GPUbenchmark/benchmark.cxx | 11 +-- GPU/GPUbenchmark/cuda/Kernels.cu | 109 +++++++++++++++++------------- 4 files changed, 106 insertions(+), 90 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 99db8e114aee6..b9af3be3ed966 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -13,9 +13,9 @@ #ifndef GPU_BENCHMARK_COMMON_H #define GPU_BENCHMARK_COMMON_H -#if defined(__HIPCC__) -#include "hip/hip_runtime.h" -#endif +// #if defined(__HIPCC__) +// #include "hip/hip_runtime.h" +// #endif #include #include diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index a27938ad320ab..3c030cf9cae01 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -21,7 +21,7 @@ #include 
#define PARTITION_SIZE_GB 1 -#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.99f +#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f #define GB 1073741824 namespace o2 @@ -33,32 +33,46 @@ template struct gpuState { int getMaxSegments() { - return (double)allocatedMemory / (1024.0 * 1024.0 * 1024.0); + return (double)scratchSize / (1024.0 * 1024.0 * 1024.0); } - void computeBufferPointers() + void computeScratchPtrs() { addresses.resize(getMaxSegments()); for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { - addresses[iBuffAddress] = scratchPtr + GB * PARTITION_SIZE_GB * iBuffAddress; + addresses[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + GB * iBuffAddress); } } - size_t getArrayLength() + static constexpr size_t getArraySize() { return static_cast(GB * PARTITION_SIZE_GB / sizeof(T)); } - std::vector getBuffersPointers() + std::vector getScratchPtrs() { return addresses; } - std::vector addresses; - size_t allocatedMemory; - T* scratchPtr; + std::vector>& getHostBuffers() + { + return gpuBuffersHost; + } + + // General containers and state + T* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector addresses; // Pointers to scratch partitions + std::vector> gpuBuffersHost; // Host-based vector-ized data + + // Test-specific containers + std::vector deviceReadingResultsPtrs; // Results of the reading test (single variable) on GPU + std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host - //Static info + // Configuration + size_t nMaxThreadsPerDimension; + + // Static info size_t totalMemory; size_t nMultiprocessors; size_t nMaxThreadsPerBlock; @@ -73,11 +87,18 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); - void init(const int deviceId); - void run(); - void finalize(); + // Main interface + void generalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters + void run(); // Execute all specified callbacks + void generalFinalize(); // Cleanup + void printDevices(); // Dump info + + // Initializations/Finalizations of tests. Not to be measured, in principle used for report + void readingInit(); + void readingFinalize(); + + // Benchmark kernel callbacks void readingBenchmark(); - void printDevices(); private: gpuState mState; @@ -85,23 +106,4 @@ class GPUbenchmark final } // namespace benchmark } // namespace o2 -#endif - -/*In particular: I'd allocate one single large buffer filling almost the whole GPU memory, and then assume that it is more or less linear, at least if the GPU memory was free before. -I.e., at least the lower ~ 14 GB of the buffer should be in the lower 16 GB memory, and the higher ~14 GB in the upper 16 GP. - -Then we partition this buffer in say 1GB segments, and run benchmarks in the segments individually, or in multiple segments in parallel. -For running on multiple segments in parallel, it would be interesting to split on the block level and on the thread level. -We should always start as many blocks as there are multiprocessors on the GPU, such that we have a 1 to 1 mapping without scheduling blocks. -We should make sure that the test runs long enough, say >5 seconds, then the initial scheduling should become irrelevant. - -For the tests I want to run in the segments, I think these should be: -- Linear read in a multithreaded way: i.e. 
the standard GPU for loop: -for (int i = threadIdx.x; i < segmentSIze; i += blockDim.x) foo += array[i]; -In the end we have to write foo to some output address to make sure the compiler cannot optimize anything. -- Then I'd do the same with some stride, i.e.: -foo += array[i * stride]; -- I'd try a random access with some simple linear congruence RNG per thread to determine the address. -- Then I'd do the same with writing memory, and with copying memory. -- Finally the data type should be flexible, going from char to uint4. -That should cover most cases, but if you have more ideas, feel free to add something.*/ \ No newline at end of file +#endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 7a83888bd57ed..80d3d04595f3d 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -11,15 +11,16 @@ /// \file benchmark.cxx /// \author: mconcas@cern.ch -#include #include "Shared/Kernels.h" int main() { - o2::benchmark::GPUbenchmark bm_char{}; - bm_char.run(); - o2::benchmark::GPUbenchmark bm_int{}; - bm_int.run(); + // o2::benchmark::GPUbenchmark bm_char{}; + // bm_char.run(); + o2::benchmark::GPUbenchmark bm_size_t{}; + bm_size_t.run(); + // o2::benchmark::GPUbenchmark bm_int{}; + // bm_int.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index ffb0af087c147..b18d926e42b4e 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -25,45 +25,30 @@ double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } failed("API returned error code."); \ } -// #define CHECK(cmd) \ -// { \ -// cudaError_t error = cmd; \ -// if (error != cudaSuccess) { \ -// fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error, __FILE__, __LINE__); \ -// exit(EXIT_FAILURE); \ -// } \ -// } - namespace o2 { namespace benchmark { namespace gpu { -// Kernels go here -/* - * Square each element in the array A and write to array C. - */ -// template -// __global__ void -// vector_square(T* C_d, T* A_d, size_t N) -// { -// size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); -// size_t stride = blockDim.x * gridDim.x; -// for (size_t i = offset; i < N; i += stride) { -// C_d[i] = A_d[i] * A_d[i]; -// } -// } +/////////////////// +/// Kernels go here template GPUg() void readerKernel( + size_t Ntimes, + buffer_type* result, buffer_type* buffer, size_t bufferSize) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { + buffer_type result{0}; + result += buffer[i]; } } +/////////////////// + } // namespace gpu void printDeviceProp(int deviceId) @@ -191,12 +176,13 @@ template float GPUbenchmark::measure(void (GPUbenchmark::*task)(T...), const char* taskName, T&&... 
args) { float diff{0.f}; + std::cout << std::setw(2) << ">>> " << taskName; auto start = std::chrono::high_resolution_clock::now(); (this->*task)(std::forward(args)...); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff_t{end - start}; diff = diff_t.count(); - std::cout << std::setw(2) << ">>> " << taskName << " completed in: " << diff << " ms" << std::endl; + std::cout << std::setw(2) << " completed in: \x1B[32m" << diff << " ms\x1B[0m" << std::endl; return diff; } @@ -213,41 +199,71 @@ void GPUbenchmark::printDevices() } template -void GPUbenchmark::init(const int deviceId) +void GPUbenchmark::generalInit(const int deviceId) { cudaDeviceProp props; size_t free; - // Fetch and store traits + // Fetch and store features GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; - mState.allocatedMemory = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); - - // Setup - GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.allocatedMemory)); - - mState.computeBufferPointers(); - - for (size_t iAddr{0}; iAddr < mState.getBuffersPointers().size(); ++iAddr) { + mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; + mState.scratchSize = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); + std::cout << ">>> Running on: " << props.name << std::endl; + // Allocate scratch on GPU + GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); + mState.computeScratchPtrs(); + + // Initialize corresponding buffers on host and copy content on GPU + mState.getHostBuffers().resize(mState.getScratchPtrs().size()); + for (size_t iScratchPart{0}; iScratchPart < mState.getScratchPtrs().size(); ++iScratchPart) { + mState.getHostBuffers()[iScratchPart].resize(gpuState::getArraySize()); + GPUCHECK(cudaMemcpy(mState.getScratchPtrs()[iScratchPart], mState.getHostBuffers()[iScratchPart].data(), gpuState::getArraySize() * sizeof(buffer_type), cudaMemcpyHostToDevice)); + } + std::cout << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) + << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" + << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << PARTITION_SIZE_GB << "GB each\n" + << " ├ Size of arrays in segments: " << gpuState::getArraySize() << " elements" << std::endl + << " └ Memory buffers copied from host to device" + << std::endl; +} +template +void GPUbenchmark::readingInit() +{ + mState.deviceReadingResultsPtrs.resize(mState.getMaxSegments()); + mState.hostReadingResultsVector.resize(mState.getMaxSegments()); + for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtrs[iScratchPart])), sizeof(buffer_type))); } +} +template +void GPUbenchmark::readingBenchmark() +{ + dim3 nBlocks{static_cast(mState.nMultiprocessors / mState.getMaxSegments())}; + dim3 nThreads{static_cast(std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock))}; + for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { + gpu::readerKernel<<>>(1, mState.deviceReadingResultsPtrs[iScratchPart], mState.getScratchPtrs()[iScratchPart], gpuState::getArraySize()); + } + GPUCHECK(cudaDeviceSynchronize()); } 
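The reading kernel above implements the first access pattern sketched in the design notes dropped from Kernels.h: a plain multithreaded linear read whose accumulator must be written out so the compiler cannot discard the loads. For reference, a minimal, self-contained CUDA sketch of that pattern, together with the LCG-driven random-access variant the notes also call for, could look as follows; the kernel names, the results layout and the LCG constants are illustrative assumptions and not part of these patches.

// Hypothetical sketch of the linear and random read patterns from the design notes.
// Not part of the patch series; names and constants are illustrative only.
#include <cstddef>
#include <cstdint>

template <class buffer_type>
__global__ void linearReadSketch(buffer_type* results, const buffer_type* partition, size_t partitionSize)
{
  buffer_type sum{0};
  // Standard grid-stride read over one scratch partition.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < partitionSize; i += blockDim.x * gridDim.x) {
    sum += partition[i];
  }
  // Write the accumulator to global memory so the reads cannot be optimized away.
  results[blockIdx.x * blockDim.x + threadIdx.x] = sum;
}

template <class buffer_type>
__global__ void randomReadSketch(buffer_type* results, const buffer_type* partition, size_t partitionSize, size_t nAccesses)
{
  // Simple per-thread linear congruential generator to pick the next address.
  uint32_t lcg = 1u + blockIdx.x * blockDim.x + threadIdx.x;
  buffer_type sum{0};
  for (size_t j = 0; j < nAccesses; ++j) {
    lcg = 1664525u * lcg + 1013904223u; // common LCG constants, assumed here
    sum += partition[lcg % partitionSize];
  }
  results[blockIdx.x * blockDim.x + threadIdx.x] = sum;
}

Both sketches expect results to hold one element per launched thread; the strided variant from the notes would simply index partition[i * stride] instead of partition[i].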
template -void GPUbenchmark::readingBenchmark() +void GPUbenchmark::readingFinalize() { - dim3 nBlocks(mState.nMultiprocessors); - dim3 nThreads(mState.nMaxThreadsPerBlock); - // gpu::readerKernel<<>>(); + for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { + GPUCHECK(cudaMemcpy(&mState.hostReadingResultsVector[iScratchPart], mState.deviceReadingResultsPtrs[iScratchPart], sizeof(buffer_type), cudaMemcpyDeviceToHost)); + std::cout << "result " << iScratchPart << ": " << mState.hostReadingResultsVector[iScratchPart] << std::endl; + } } template -void GPUbenchmark::finalize() +void GPUbenchmark::generalFinalize() { GPUCHECK(cudaFree(mState.scratchPtr)); } @@ -256,19 +272,16 @@ template void GPUbenchmark::run() { // printDevices(); - measure(&GPUbenchmark::init, "Init", 0); - std::cout << " ├ Allocated: " << mState.allocatedMemory << "/" << mState.totalMemory - << " bytes (" << std::setprecision(3) << (100.f) * (mState.allocatedMemory / (float)mState.totalMemory) << "%)\n"; - std::cout << " ├ Can do: " << mState.getMaxSegments() << " segments of " << PARTITION_SIZE_GB << "GB each\n"; - std::cout << " └ Length of arrays in segments: " << mState.getArrayLength() << std::endl; - + generalInit(0); - // measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); - GPUbenchmark::finalize(); + readingInit(); + measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + GPUbenchmark::generalFinalize(); } template class GPUbenchmark; -template class GPUbenchmark; +// template class GPUbenchmark; +template class GPUbenchmark; template class GPUbenchmark; } // namespace benchmark From 377365589e2ad89cd283dcc811dc9cd8a9c039da Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 23 Jun 2021 14:21:34 +0200 Subject: [PATCH 18/42] Assign a block to each scratch segment --- GPU/GPUbenchmark/CMakeLists.txt | 4 +- GPU/GPUbenchmark/Shared/Common.h | 71 ++++++++++++++++++++++- GPU/GPUbenchmark/Shared/Kernels.h | 62 +++------------------ GPU/GPUbenchmark/benchmark.cxx | 55 +++++++++++++++--- GPU/GPUbenchmark/cuda/Kernels.cu | 93 +++++++++++++++++++------------ 5 files changed, 185 insertions(+), 100 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 71ba08edeb975..77e0e63509936 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -16,6 +16,7 @@ if(CUDA_ENABLED) SOURCES benchmark.cxx cuda/Kernels.cu PUBLIC_LINK_LIBRARIES O2::GPUCommon + Boost::program_options TARGETVARNAME targetName) endif() @@ -30,7 +31,7 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() @@ -46,6 +47,7 @@ if(HIP_ENABLED) hip/Kernels.hip.cxx PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host + Boost::program_options TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index b9af3be3ed966..74e7115b8623a 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -13,12 +13,11 @@ #ifndef GPU_BENCHMARK_COMMON_H #define GPU_BENCHMARK_COMMON_H -// #if defined(__HIPCC__) -// #include "hip/hip_runtime.h" -// #endif #include #include 
+#include +#include #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -29,6 +28,72 @@ #define KCYN "\x1B[36m" #define KWHT "\x1B[37m" +#define GB (1024 * 1024 * 1024) + +namespace o2 +{ +namespace benchmark +{ +struct benchmarkOpts { + benchmarkOpts() = default; + + float partitionSizeGB = 1.f; + float freeMemoryFractionToAllocate = 0.95f; +}; + +template +struct gpuState { + int getMaxSegments() + { + return (double)scratchSize / (partitionSizeGB * GB); + } + + void computeScratchPtrs() + { + partAddrOnHost.resize(getMaxSegments()); + for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { + partAddrOnHost[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partitionSizeGB) * iBuffAddress); + } + } + + size_t getPartitionCapacity() + { + return static_cast(GB * partitionSizeGB / sizeof(T)); + } + + std::vector getScratchPtrs() + { + return partAddrOnHost; + } + + std::vector>& getHostBuffers() + { + return gpuBuffersHost; + } + + // Configuration + size_t nMaxThreadsPerDimension; + float partitionSizeGB; // Size of each partition (GB) + + // General containers and state + T* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector partAddrOnHost; // Pointers to scratch partitions on host vector + std::vector> gpuBuffersHost; // Host-based vector-ized data + + // Test-specific containers + T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU + std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host + + // Static info + size_t totalMemory; + size_t nMultiprocessors; + size_t nMaxThreadsPerBlock; +}; + +} // namespace benchmark +} // namespace o2 + #define failed(...) \ printf("%serror: ", KRED); \ printf(__VA_ARGS__); \ diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 3c030cf9cae01..07c89836eabcc 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,74 +15,29 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" +#include "Common.h" #include #include #include #include -#define PARTITION_SIZE_GB 1 -#define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f -#define GB 1073741824 +// #define PARTITION_SIZE_GB 1 +// #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f + namespace o2 { namespace benchmark { -template -struct gpuState { - int getMaxSegments() - { - return (double)scratchSize / (1024.0 * 1024.0 * 1024.0); - } - - void computeScratchPtrs() - { - addresses.resize(getMaxSegments()); - for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { - addresses[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + GB * iBuffAddress); - } - } - - static constexpr size_t getArraySize() - { - return static_cast(GB * PARTITION_SIZE_GB / sizeof(T)); - } - - std::vector getScratchPtrs() - { - return addresses; - } - - std::vector>& getHostBuffers() - { - return gpuBuffersHost; - } - - // General containers and state - T* scratchPtr; // Pointer to scratch buffer - size_t scratchSize; // Size of scratch area (B) - std::vector addresses; // Pointers to scratch partitions - std::vector> gpuBuffersHost; // Host-based vector-ized data - - // Test-specific containers - std::vector deviceReadingResultsPtrs; // Results of the reading test (single variable) on GPU - std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host - - // Configuration - size_t nMaxThreadsPerDimension; - - // Static info - 
size_t totalMemory; - size_t nMultiprocessors; - size_t nMaxThreadsPerBlock; -}; - template class GPUbenchmark final { public: - GPUbenchmark() = default; + GPUbenchmark() = delete; // need for a configuration + GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} { + + } virtual ~GPUbenchmark() = default; template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); @@ -102,6 +57,7 @@ class GPUbenchmark final private: gpuState mState; + benchmarkOpts mOptions; }; } // namespace benchmark diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 80d3d04595f3d..2824bdc8bd07f 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -9,18 +9,57 @@ // or submit itself to any jurisdiction. /// /// \file benchmark.cxx -/// \author: mconcas@cern.ch - +/// \author mconcas@cern.ch +/// \brief configuration widely inspired/copied by SimConfig #include "Shared/Kernels.h" -int main() +bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) +{ + namespace bpo = boost::program_options; + bpo::variables_map vm; + bpo::options_description options("Benchmark options"); + options.add_options()( + "help,h", "Print help message.")( + "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( + "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f)."); + try { + bpo::store(parse_command_line(argc, argv, options), vm); + if (vm.count("help")) { + std::cout << options << std::endl; + return false; + } + + bpo::notify(vm); + } catch (const bpo::error& e) { + std::cerr << e.what() << "\n\n"; + std::cerr << "Error parsing command line arguments. Available options:\n"; + + std::cerr << options << std::endl; + return false; + } + + conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); + conf.partitionSizeGB = vm["chunkSize"].as(); + + return true; +} + +int main(int argc, const char* argv[]) { - // o2::benchmark::GPUbenchmark bm_char{}; + + o2::benchmark::benchmarkOpts opts; + if (argc > 1) { + if (!parseArgs(opts, argc, argv)) { + return -1; + } + } + + // o2::benchmark::GPUbenchmark bm_char{opts}; // bm_char.run(); - o2::benchmark::GPUbenchmark bm_size_t{}; - bm_size_t.run(); - // o2::benchmark::GPUbenchmark bm_int{}; - // bm_int.run(); + // o2::benchmark::GPUbenchmark bm_size_t{opts}; + // bm_size_t.run(); + o2::benchmark::GPUbenchmark bm_int{opts}; + bm_int.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index b18d926e42b4e..d0c7c59b86495 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -12,12 +12,8 @@ /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" -#include "../Shared/Common.h" #include -double bytesToKB(size_t s) { return (double)s / (1024.0); } -double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } - #define GPUCHECK(error) \ if (error != cudaSuccess) { \ printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ @@ -25,6 +21,9 @@ double bytesToGB(size_t s) { return (double)s / (1024.0 * 1024.0 * 1024.0); } failed("API returned error code."); \ } +double bytesToKB(size_t s) { return (double)s / (1024.0); } +double bytesToGB(size_t s) { return (double)s / GB; } + namespace o2 { namespace benchmark @@ -33,24 +32,52 @@ namespace gpu { /////////////////// -/// Kernels go here +/// Kernels and device functions go here +template +GPUhd() buffer_type* 
getPartPtrOnScratch(buffer_type* scratchPtr, float partSizeGB, size_t partNumber) +{ + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partSizeGB) * partNumber); +} template GPUg() void readerKernel( - size_t Ntimes, - buffer_type* result, - buffer_type* buffer, - size_t bufferSize) + buffer_type* results, + buffer_type* scratch, + size_t iterations, + size_t bufferSize, + float partitionSize = 1.f) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { - buffer_type result{0}; - result += buffer[i]; + for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { + buffer_type tmpResult{0}; + for (size_t j{0}; j < iterations; ++j) { + tmpResult += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; + } + results[blockIdx.x] += tmpResult; // FIXME: do something with data w/o data racing condition (avoid compiler optimizations) + // atomicAdd(reinterpret_cast(&(results[blockIdx.x])), tmpResult); // Does not work in CUDA } } /////////////////// } // namespace gpu +template +char* getType() +{ + if (typeid(T).name() == typeid(char).name()) { + return const_cast("\e[1mchar\e[0m"); + } + if (typeid(T).name() == typeid(size_t).name()) { + return const_cast("\e[1msize_t\e[0m"); + } + if (typeid(T).name() == typeid(int).name()) { + return const_cast("\e[1mint\e[0m"); + } + if (typeid(T).name() == typeid(int4).name()) { + return const_cast("\e[1mint4\e[0m"); + } + return const_cast("\e[1m unknown\e[0m"); +} + void printDeviceProp(int deviceId) { const int w1 = 34; @@ -208,25 +235,24 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); + mState.partitionSizeGB = mOptions.partitionSizeGB; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; - mState.scratchSize = static_cast(FREE_MEMORY_FRACTION_TO_ALLOCATE * free); - std::cout << ">>> Running on: " << props.name << std::endl; + mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); + std::cout << ">>> Running benchmark on : " << props.name << std::endl; + // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); + mState.computeScratchPtrs(); + GPUCHECK(cudaMemset(mState.scratchPtr, 1, mState.scratchSize)) - // Initialize corresponding buffers on host and copy content on GPU - mState.getHostBuffers().resize(mState.getScratchPtrs().size()); - for (size_t iScratchPart{0}; iScratchPart < mState.getScratchPtrs().size(); ++iScratchPart) { - mState.getHostBuffers()[iScratchPart].resize(gpuState::getArraySize()); - GPUCHECK(cudaMemcpy(mState.getScratchPtrs()[iScratchPart], mState.getHostBuffers()[iScratchPart].data(), gpuState::getArraySize() * sizeof(buffer_type), cudaMemcpyHostToDevice)); - } - std::cout << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) + std::cout << " ├ Buffer type: " << getType() << std::endl + << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" - << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << PARTITION_SIZE_GB << "GB each\n" - << " ├ Size 
of arrays in segments: " << gpuState::getArraySize() << " elements" << std::endl + << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" + << " ├ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl << " └ Memory buffers copied from host to device" << std::endl; } @@ -234,31 +260,27 @@ void GPUbenchmark::generalInit(const int deviceId) template void GPUbenchmark::readingInit() { - mState.deviceReadingResultsPtrs.resize(mState.getMaxSegments()); mState.hostReadingResultsVector.resize(mState.getMaxSegments()); - for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { - GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtrs[iScratchPart])), sizeof(buffer_type))); - } + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template void GPUbenchmark::readingBenchmark() { - dim3 nBlocks{static_cast(mState.nMultiprocessors / mState.getMaxSegments())}; - dim3 nThreads{static_cast(std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock))}; + auto nBlocks{mState.getMaxSegments()}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { - gpu::readerKernel<<>>(1, mState.deviceReadingResultsPtrs[iScratchPart], mState.getScratchPtrs()[iScratchPart], gpuState::getArraySize()); - } + gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1000, mState.getPartitionCapacity(), mState.partitionSizeGB); GPUCHECK(cudaDeviceSynchronize()); } template void GPUbenchmark::readingFinalize() { - for (size_t iScratchPart{0}; iScratchPart < mState.getMaxSegments(); ++iScratchPart) { - GPUCHECK(cudaMemcpy(&mState.hostReadingResultsVector[iScratchPart], mState.deviceReadingResultsPtrs[iScratchPart], sizeof(buffer_type), cudaMemcpyDeviceToHost)); - std::cout << "result " << iScratchPart << ": " << mState.hostReadingResultsVector[iScratchPart] << std::endl; + + GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); + for (auto r : mState.hostReadingResultsVector) { + std::cout << "Result " << r << std::endl; } } @@ -276,6 +298,7 @@ void GPUbenchmark::run() readingInit(); measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + readingFinalize(); GPUbenchmark::generalFinalize(); } From 8933ff1a073e90ef42b3fa871652918301be8a8d Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 23 Jun 2021 14:25:19 +0200 Subject: [PATCH 19/42] Fix copyright --- GPU/GPUbenchmark/CMakeLists.txt | 13 +++++++------ GPU/GPUbenchmark/Shared/Common.h | 9 +++++---- GPU/GPUbenchmark/Shared/Kernels.h | 9 +++++---- GPU/GPUbenchmark/benchmark.cxx | 9 +++++---- GPU/GPUbenchmark/cuda/Kernels.cu | 9 +++++---- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 77e0e63509936..5b821a09d4fd7 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -1,12 +1,13 @@ -# Copyright CERN and copyright holders of ALICE O2. This software is distributed -# under the terms of the GNU General Public License v3 (GPL Version 3), copied -# verbatim in the file "COPYING". +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. # -# See http://alice-o2.web.cern.ch/license for full licensing information. +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". # # In applying this license CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization or -# submit itself to any jurisdiction. +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. set(HDRS_INSTALL ../Shared/Kernels.h) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 74e7115b8623a..269847f7b483c 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". // // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 07c89836eabcc..0325e13fd3782 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". // // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 2824bdc8bd07f..6bdbd6c7e1237 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
// // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index d0c7c59b86495..0743bda8ee616 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -1,8 +1,9 @@ -// Copyright CERN and copyright holders of ALICE O2. This software is -// distributed under the terms of the GNU General Public License v3 (GPL -// Version 3), copied verbatim in the file "COPYING". +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. // -// See http://alice-o2.web.cern.ch/license for full licensing information. +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". // // In applying this license CERN does not waive the privileges and immunities // granted to it by virtue of its status as an Intergovernmental Organization From fd8041b6a42421be7ccdd95d87111242a8d1c164 Mon Sep 17 00:00:00 2001 From: ALICE Builder Date: Wed, 23 Jun 2021 14:33:15 +0200 Subject: [PATCH 20/42] Please consider the following formatting changes (#16) --- GPU/GPUbenchmark/Shared/Common.h | 2 +- GPU/GPUbenchmark/Shared/Kernels.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 269847f7b483c..5acc4eca56d62 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -83,7 +83,7 @@ struct gpuState { std::vector> gpuBuffersHost; // Host-based vector-ized data // Test-specific containers - T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU + T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host // Static info diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 0325e13fd3782..54b4a057bc53b 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -25,7 +25,6 @@ // #define PARTITION_SIZE_GB 1 // #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f - namespace o2 { namespace benchmark @@ -36,8 +35,8 @@ class GPUbenchmark final { public: GPUbenchmark() = delete; // need for a configuration - GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} { - + GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} + { } virtual ~GPUbenchmark() = default; template From f80c6843d02731ea212d0fdabbad973b46161f80 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 23 Jun 2021 17:39:23 +0200 Subject: [PATCH 21/42] Set configurable iterations --- GPU/GPUbenchmark/Shared/Common.h | 5 +++++ GPU/GPUbenchmark/Shared/Kernels.h | 2 +- GPU/GPUbenchmark/benchmark.cxx | 12 +++++++----- GPU/GPUbenchmark/cuda/Kernels.cu | 15 +++++++++------ 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Common.h index 5acc4eca56d62..33067d6f4b8c0 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Common.h @@ -40,6 +40,7 @@ struct benchmarkOpts { float partitionSizeGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; + size_t iterations = 1; }; template @@ -72,8 +73,12 @@ struct gpuState { return gpuBuffersHost; } 
+ size_t getNiterations() { return iterations; } + // Configuration size_t nMaxThreadsPerDimension; + size_t iterations; + float partitionSizeGB; // Size of each partition (GB) // General containers and state diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 54b4a057bc53b..8d77dd9feabac 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -53,7 +53,7 @@ class GPUbenchmark final void readingFinalize(); // Benchmark kernel callbacks - void readingBenchmark(); + void readingBenchmark(size_t iterations); private: gpuState mState; diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 6bdbd6c7e1237..8455b1921eb0b 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -22,7 +22,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) options.add_options()( "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( - "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f)."); + "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( + "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -41,6 +42,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); + conf.iterations = vm["iterations"].as(); return true; } @@ -55,10 +57,10 @@ int main(int argc, const char* argv[]) } } - // o2::benchmark::GPUbenchmark bm_char{opts}; - // bm_char.run(); - // o2::benchmark::GPUbenchmark bm_size_t{opts}; - // bm_size_t.run(); + o2::benchmark::GPUbenchmark bm_char{opts}; + bm_char.run(); + o2::benchmark::GPUbenchmark bm_size_t{opts}; + bm_size_t.run(); o2::benchmark::GPUbenchmark bm_int{opts}; bm_int.run(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 0743bda8ee616..326f993999ae8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -237,11 +237,12 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.partitionSizeGB = mOptions.partitionSizeGB; + mState.iterations = mOptions.iterations; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); - std::cout << ">>> Running benchmark on : " << props.name << std::endl; + std::cout << ">>> Running on : " << props.name << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); @@ -261,17 +262,19 @@ void GPUbenchmark::generalInit(const int deviceId) template void GPUbenchmark::readingInit() { + std::cout << ">>> Initializing reading benchmark with \e[1m" << mState.iterations << "\e[0m iterations." 
<< std::endl; mState.hostReadingResultsVector.resize(mState.getMaxSegments()); GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template -void GPUbenchmark::readingBenchmark() +void GPUbenchmark::readingBenchmark(size_t iterations) { auto nBlocks{mState.getMaxSegments()}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - - gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1000, mState.getPartitionCapacity(), mState.partitionSizeGB); + for (auto iteration{iterations}; iteration--;) { + gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1, mState.getPartitionCapacity(), mState.partitionSizeGB); + } GPUCHECK(cudaDeviceSynchronize()); } @@ -281,7 +284,7 @@ void GPUbenchmark::readingFinalize() GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); for (auto r : mState.hostReadingResultsVector) { - std::cout << "Result " << r << std::endl; + // std::cout << "Result " << r << std::endl; } } @@ -298,7 +301,7 @@ void GPUbenchmark::run() generalInit(0); readingInit(); - measure(&GPUbenchmark::readingBenchmark, "Reading benchmark"); + measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); readingFinalize(); GPUbenchmark::generalFinalize(); } From 3896a71aa13394f6d1b61b882d353865f5ff3819 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 24 Jun 2021 18:06:57 +0200 Subject: [PATCH 22/42] Improve busy fucntion + streaming results on file --- GPU/GPUbenchmark/CMakeLists.txt | 2 + GPU/GPUbenchmark/Shared/Kernels.h | 6 +- GPU/GPUbenchmark/Shared/{Common.h => Utils.h} | 38 ++++++++++++- GPU/GPUbenchmark/benchmark.cxx | 14 +++-- GPU/GPUbenchmark/cuda/Kernels.cu | 57 +++++++++---------- 5 files changed, 77 insertions(+), 40 deletions(-) rename GPU/GPUbenchmark/Shared/{Common.h => Utils.h} (74%) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 5b821a09d4fd7..9151acc8bc478 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -18,6 +18,7 @@ if(CUDA_ENABLED) cuda/Kernels.cu PUBLIC_LINK_LIBRARIES O2::GPUCommon Boost::program_options + O2::CommonUtils TARGETVARNAME targetName) endif() @@ -49,6 +50,7 @@ if(HIP_ENABLED) PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host Boost::program_options + O2::CommonUtils TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 8d77dd9feabac..66ae5aa05e176 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -16,10 +16,11 @@ #define GPU_BENCHMARK_KERNELS_H #include "GPUCommonDef.h" -#include "Common.h" +#include "Utils.h" #include #include #include +#include #include // #define PARTITION_SIZE_GB 1 @@ -35,7 +36,7 @@ class GPUbenchmark final { public: GPUbenchmark() = delete; // need for a configuration - GPUbenchmark(benchmarkOpts& opts) : mOptions{opts} + GPUbenchmark(benchmarkOpts& opts, std::shared_ptr streamer) : mStreamer{streamer}, mOptions{opts} { } virtual ~GPUbenchmark() = default; @@ -57,6 +58,7 @@ class GPUbenchmark final private: gpuState mState; + std::shared_ptr mStreamer; benchmarkOpts mOptions; }; diff --git a/GPU/GPUbenchmark/Shared/Common.h b/GPU/GPUbenchmark/Shared/Utils.h similarity index 74% rename from GPU/GPUbenchmark/Shared/Common.h rename to GPU/GPUbenchmark/Shared/Utils.h index 
33067d6f4b8c0..0fc1d9ef24d6f 100644 --- a/GPU/GPUbenchmark/Shared/Common.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -12,13 +12,14 @@ /// \file Common.h /// \author: mconcas@cern.ch -#ifndef GPU_BENCHMARK_COMMON_H -#define GPU_BENCHMARK_COMMON_H +#ifndef GPU_BENCHMARK_UTILS_H +#define GPU_BENCHMARK_UTILS_H #include #include #include #include +#include "CommonUtils/TreeStreamRedirector.h" #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -97,6 +98,39 @@ struct gpuState { size_t nMaxThreadsPerBlock; }; +// Interface class to stream results to root file +class ResultStreamer +{ + public: + explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); + ~ResultStreamer(); + void storeBenchmarkEntry(std::string benchmarkName, float entry); + + private: + std::string mDebugTreeFileName = "benchmark_results.root"; // output filename + o2::utils::TreeStreamRedirector* mTreeStream; // observer +}; + +inline ResultStreamer::ResultStreamer(const std::string debugTreeFileName) +{ + mDebugTreeFileName = debugTreeFileName; + mTreeStream = new o2::utils::TreeStreamRedirector(debugTreeFileName.data(), "recreate"); +} + +inline ResultStreamer::~ResultStreamer() +{ + delete mTreeStream; +} + +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, float entry) +{ + (*mTreeStream) + << "Benchmarks" + << benchmarkName.data() + << "elapsed=" << entry + << "\n"; +} + } // namespace benchmark } // namespace o2 diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 8455b1921eb0b..4e71cb5c1e7d6 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -21,9 +21,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) bpo::options_description options("Benchmark options"); options.add_options()( "help,h", "Print help message.")( - "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( - "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); + "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")("freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")("iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -47,6 +45,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) return true; } +using o2::benchmark::ResultStreamer; + int main(int argc, const char* argv[]) { @@ -57,11 +57,13 @@ int main(int argc, const char* argv[]) } } - o2::benchmark::GPUbenchmark bm_char{opts}; + std::shared_ptr streamer = std::make_shared(); + + o2::benchmark::GPUbenchmark bm_char{opts, streamer}; bm_char.run(); - o2::benchmark::GPUbenchmark bm_size_t{opts}; + o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; bm_size_t.run(); - o2::benchmark::GPUbenchmark bm_int{opts}; + o2::benchmark::GPUbenchmark bm_int{opts, streamer}; bm_int.run(); return 0; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 326f993999ae8..5fce52d6a445e 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -25,6 +25,24 @@ double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / 
GB; } +template +char* getType() +{ + if (typeid(T).name() == typeid(char).name()) { + return const_cast("\e[1mchar\e[0m"); + } + if (typeid(T).name() == typeid(size_t).name()) { + return const_cast("\e[1msize_t\e[0m"); + } + if (typeid(T).name() == typeid(int).name()) { + return const_cast("\e[1mint\e[0m"); + } + if (typeid(T).name() == typeid(int4).name()) { + return const_cast("\e[1mint4\e[0m"); + } + return const_cast("\e[1m unknown\e[0m"); +} + namespace o2 { namespace benchmark @@ -44,41 +62,22 @@ template GPUg() void readerKernel( buffer_type* results, buffer_type* scratch, - size_t iterations, + size_t innerIterations, size_t bufferSize, float partitionSize = 1.f) { for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - buffer_type tmpResult{0}; - for (size_t j{0}; j < iterations; ++j) { - tmpResult += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; + for (size_t j{0}; j < innerIterations; ++j) { + if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen threads and should be always in sync + } } - results[blockIdx.x] += tmpResult; // FIXME: do something with data w/o data racing condition (avoid compiler optimizations) - // atomicAdd(reinterpret_cast(&(results[blockIdx.x])), tmpResult); // Does not work in CUDA } } /////////////////// } // namespace gpu -template -char* getType() -{ - if (typeid(T).name() == typeid(char).name()) { - return const_cast("\e[1mchar\e[0m"); - } - if (typeid(T).name() == typeid(size_t).name()) { - return const_cast("\e[1msize_t\e[0m"); - } - if (typeid(T).name() == typeid(int).name()) { - return const_cast("\e[1mint\e[0m"); - } - if (typeid(T).name() == typeid(int4).name()) { - return const_cast("\e[1mint4\e[0m"); - } - return const_cast("\e[1m unknown\e[0m"); -} - void printDeviceProp(int deviceId) { const int w1 = 34; @@ -242,13 +241,13 @@ void GPUbenchmark::generalInit(const int deviceId) mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); - std::cout << ">>> Running on : " << props.name << std::endl; + std::cout << ">>> Running on: \e[1m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); mState.computeScratchPtrs(); - GPUCHECK(cudaMemset(mState.scratchPtr, 1, mState.scratchSize)) + GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) std::cout << " ├ Buffer type: " << getType() << std::endl << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) @@ -283,9 +282,6 @@ void GPUbenchmark::readingFinalize() { GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); - for (auto r : mState.hostReadingResultsVector) { - // std::cout << "Result " << r << std::endl; - } } template @@ -301,7 +297,8 @@ void GPUbenchmark::run() generalInit(0); readingInit(); - measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); + auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); + mStreamer.get()->storeBenchmarkEntry("readingBenchmark", result); readingFinalize(); GPUbenchmark::generalFinalize(); } 
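At this point measure() returns the wall-clock time of a full set of kernel launches and the ResultStreamer persists it to a ROOT file. A figure one would typically derive from those entries offline is the effective read throughput; a minimal sketch of that post-processing, under the assumption that every kernel launch reads every scratch partition exactly once, could be (the helper name and the assumption are not part of the patches):

// Illustrative helper, not part of the patch series.
// Assumes each of the 'kernelLaunches' launches reads all 'nPartitions' partitions once.
double effectiveReadThroughputGBs(float elapsedMs, float partitionSizeGB, int nPartitions, int kernelLaunches)
{
  const double dataReadGB = static_cast<double>(partitionSizeGB) * nPartitions * kernelLaunches;
  return dataReadGB / (elapsedMs / 1000.0); // GB per second
}

For example, reading 15 partitions of 1 GB each over 50 launches in 3000 ms would correspond to roughly 250 GB/s.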
From cf1276e001a6769afd2b1d02caee6c386429c0ae Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 25 Jun 2021 14:13:40 +0200 Subject: [PATCH 23/42] Fix bug in CLI params --- GPU/GPUbenchmark/Shared/Utils.h | 7 +++---- GPU/GPUbenchmark/benchmark.cxx | 17 +++++++++-------- GPU/GPUbenchmark/cuda/Kernels.cu | 21 +++++++++++---------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 0fc1d9ef24d6f..bd10aedfbf108 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -104,7 +104,7 @@ class ResultStreamer public: explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); - void storeBenchmarkEntry(std::string benchmarkName, float entry); + void storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -122,11 +122,10 @@ inline ResultStreamer::~ResultStreamer() delete mTreeStream; } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, float entry) +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry) { (*mTreeStream) - << "Benchmarks" - << benchmarkName.data() + << (benchmarkName + type).data() << "elapsed=" << entry << "\n"; } diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 4e71cb5c1e7d6..10cd63537282d 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -21,7 +21,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) bpo::options_description options("Benchmark options"); options.add_options()( "help,h", "Print help message.")( - "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")("freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")("iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); + "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( + "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( + "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -40,7 +42,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); - conf.iterations = vm["iterations"].as(); + conf.iterations = vm["iterations"].as(); return true; } @@ -51,20 +53,19 @@ int main(int argc, const char* argv[]) { o2::benchmark::benchmarkOpts opts; - if (argc > 1) { + if (!parseArgs(opts, argc, argv)) { return -1; } - } std::shared_ptr streamer = std::make_shared(); o2::benchmark::GPUbenchmark bm_char{opts, streamer}; bm_char.run(); - o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; - bm_size_t.run(); - o2::benchmark::GPUbenchmark bm_int{opts, streamer}; - bm_int.run(); + // o2::benchmark::GPUbenchmark bm_int{opts, streamer}; + // bm_int.run(); + // o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; + // bm_size_t.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5fce52d6a445e..17093307e2944 100644 --- 
a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -26,21 +26,21 @@ double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } template -char* getType() +std::string getType() { if (typeid(T).name() == typeid(char).name()) { - return const_cast("\e[1mchar\e[0m"); + return std::string{"char"}; } if (typeid(T).name() == typeid(size_t).name()) { - return const_cast("\e[1msize_t\e[0m"); + return std::string{"unsigned long"}; } if (typeid(T).name() == typeid(int).name()) { - return const_cast("\e[1mint\e[0m"); + return std::string{"int"}; } if (typeid(T).name() == typeid(int4).name()) { - return const_cast("\e[1mint4\e[0m"); + return std::string{"int4"}; } - return const_cast("\e[1m unknown\e[0m"); + return std::string{"unknown"}; } namespace o2 @@ -249,7 +249,7 @@ void GPUbenchmark::generalInit(const int deviceId) mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) - std::cout << " ├ Buffer type: " << getType() << std::endl + std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" @@ -293,13 +293,14 @@ void GPUbenchmark::generalFinalize() template void GPUbenchmark::run() { - // printDevices(); generalInit(0); - + // Test calls go here + // - Reading readingInit(); auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); - mStreamer.get()->storeBenchmarkEntry("readingBenchmark", result); + mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); readingFinalize(); + GPUbenchmark::generalFinalize(); } From 2b7e27ec668869d23837894b1eaa1a1098563498 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 25 Jun 2021 17:54:43 +0200 Subject: [PATCH 24/42] Add configurable number of tests --- GPU/GPUbenchmark/Shared/Utils.h | 3 ++- GPU/GPUbenchmark/benchmark.cxx | 7 +++++-- GPU/GPUbenchmark/cuda/Kernels.cu | 26 +++++++++++++------------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index bd10aedfbf108..6ea03f2971052 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -41,7 +41,8 @@ struct benchmarkOpts { float partitionSizeGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; - size_t iterations = 1; + size_t kernelLaunches = 1; + size_t nTests = 1; }; template diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 10cd63537282d..ab108aa839acc 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -23,7 +23,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "iterations,i", bpo::value()->default_value(50), "Number of iterations in reading kernels."); + "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( + "ntests,n", bpo::value()->default_value(1), 
"Number of times each test is run." + ); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -42,7 +44,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); - conf.iterations = vm["iterations"].as(); + conf.kernelLaunches = vm["launches"].as(); + conf.nTests = vm["ntests"].as(); return true; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 17093307e2944..ab175b91fe8e8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -62,14 +62,14 @@ template GPUg() void readerKernel( buffer_type* results, buffer_type* scratch, - size_t innerIterations, size_t bufferSize, - float partitionSize = 1.f) + float partitionSize = 1.f, + size_t innerLoops = 1) { for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - for (size_t j{0}; j < innerIterations; ++j) { + for (size_t j{0}; j < innerLoops; ++j) { if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen threads and should be always in sync + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync } } } @@ -236,7 +236,6 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.partitionSizeGB = mOptions.partitionSizeGB; - mState.iterations = mOptions.iterations; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -253,26 +252,25 @@ void GPUbenchmark::generalInit(const int deviceId) << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" - << " ├ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl - << " └ Memory buffers copied from host to device" + << " └ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl << std::endl; } template void GPUbenchmark::readingInit() { - std::cout << ">>> Initializing reading benchmark with \e[1m" << mState.iterations << "\e[0m iterations." 
<< std::endl; + std::cout << ">>> Initializing reading benchmark with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; mState.hostReadingResultsVector.resize(mState.getMaxSegments()); GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template -void GPUbenchmark::readingBenchmark(size_t iterations) +void GPUbenchmark::readingBenchmark(size_t kernelLaunches) { auto nBlocks{mState.getMaxSegments()}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (auto iteration{iterations}; iteration--;) { - gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, 1, mState.getPartitionCapacity(), mState.partitionSizeGB); + for (auto launch{kernelLaunches}; launch--;) { + gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, mState.getPartitionCapacity(), mState.partitionSizeGB); } GPUCHECK(cudaDeviceSynchronize()); } @@ -297,8 +295,10 @@ void GPUbenchmark::run() // Test calls go here // - Reading readingInit(); - auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); - mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); + for (auto measures{mOptions.nTests}; measures--;) { + auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); + mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); + } readingFinalize(); GPUbenchmark::generalFinalize(); From 134b31db1f796a7a2d00204943aacbf1684e45d5 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 25 Jun 2021 18:29:44 +0200 Subject: [PATCH 25/42] Fix undefined behaviour insetting nLaunches --- GPU/GPUbenchmark/Shared/Kernels.h | 2 +- GPU/GPUbenchmark/Shared/Utils.h | 8 ++++---- GPU/GPUbenchmark/benchmark.cxx | 8 ++++---- GPU/GPUbenchmark/cuda/Kernels.cu | 15 ++++++--------- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 66ae5aa05e176..5b872e521a173 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -54,7 +54,7 @@ class GPUbenchmark final void readingFinalize(); // Benchmark kernel callbacks - void readingBenchmark(size_t iterations); + void readingBenchmark(int iterations); private: gpuState mState; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 6ea03f2971052..8551419efd4ff 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -41,8 +41,8 @@ struct benchmarkOpts { float partitionSizeGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; - size_t kernelLaunches = 1; - size_t nTests = 1; + int kernelLaunches = 1; + int nTests = 1; }; template @@ -75,11 +75,11 @@ struct gpuState { return gpuBuffersHost; } - size_t getNiterations() { return iterations; } + int getNiterations() { return iterations; } // Configuration size_t nMaxThreadsPerDimension; - size_t iterations; + int iterations; float partitionSizeGB; // Size of each partition (GB) diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index ab108aa839acc..ad4a9fe88cddb 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -23,8 +23,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of 
scratch partitions (GB).")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( - "ntests,n", bpo::value()->default_value(1), "Number of times each test is run." + "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( + "ntests,n", bpo::value()->default_value(1), "Number of times each test is run." ); try { bpo::store(parse_command_line(argc, argv, options), vm); @@ -44,8 +44,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.partitionSizeGB = vm["chunkSize"].as(); - conf.kernelLaunches = vm["launches"].as(); - conf.nTests = vm["ntests"].as(); + conf.kernelLaunches = vm["launches"].as(); + conf.nTests = vm["ntests"].as(); return true; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index ab175b91fe8e8..b8f0ac6691e9f 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -63,14 +63,11 @@ GPUg() void readerKernel( buffer_type* results, buffer_type* scratch, size_t bufferSize, - float partitionSize = 1.f, - size_t innerLoops = 1) + float partitionSize = 1.f) { for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - for (size_t j{0}; j < innerLoops; ++j) { - if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync - } + if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync } } } @@ -236,6 +233,7 @@ void GPUbenchmark::generalInit(const int deviceId) GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); mState.partitionSizeGB = mOptions.partitionSizeGB; + mState.iterations = mOptions.kernelLaunches; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -265,11 +263,11 @@ void GPUbenchmark::readingInit() } template -void GPUbenchmark::readingBenchmark(size_t kernelLaunches) +void GPUbenchmark::readingBenchmark(int kernelLaunches) { auto nBlocks{mState.getMaxSegments()}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (auto launch{kernelLaunches}; launch--;) { + for (auto launch{0}; launch < kernelLaunches; ++launch) { gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, mState.getPartitionCapacity(), mState.partitionSizeGB); } GPUCHECK(cudaDeviceSynchronize()); @@ -278,7 +276,6 @@ void GPUbenchmark::readingBenchmark(size_t kernelLaunches) template void GPUbenchmark::readingFinalize() { - GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); } From 3e961257777cd7f1ddadc93cad9a50069536d103 Mon Sep 17 00:00:00 2001 From: ALICE Builder Date: Fri, 25 Jun 2021 18:32:20 +0200 Subject: [PATCH 26/42] Please consider the following formatting changes (#17) --- GPU/GPUbenchmark/benchmark.cxx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/GPU/GPUbenchmark/benchmark.cxx 
b/GPU/GPUbenchmark/benchmark.cxx index ad4a9fe88cddb..0abd0d52be1ba 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -24,8 +24,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( - "ntests,n", bpo::value()->default_value(1), "Number of times each test is run." - ); + "ntests,n", bpo::value()->default_value(1), "Number of times each test is run."); try { bpo::store(parse_command_line(argc, argv, options), vm); if (vm.count("help")) { @@ -57,9 +56,9 @@ int main(int argc, const char* argv[]) o2::benchmark::benchmarkOpts opts; - if (!parseArgs(opts, argc, argv)) { - return -1; - } + if (!parseArgs(opts, argc, argv)) { + return -1; + } std::shared_ptr streamer = std::make_shared(); From d928a4cb7de0db7d81de82b82ed79d511e076ade Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 30 Jun 2021 12:06:49 +0200 Subject: [PATCH 27/42] Streamline ker benchmarking w/ events --- GPU/GPUbenchmark/CMakeLists.txt | 2 +- GPU/GPUbenchmark/Shared/Kernels.h | 17 +++- GPU/GPUbenchmark/Shared/Utils.h | 14 ++- GPU/GPUbenchmark/cuda/Kernels.cu | 159 +++++++++++++++++++++++++----- 4 files changed, 160 insertions(+), 32 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 9151acc8bc478..9ed33e179cc84 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -33,7 +33,7 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 5b872e521a173..09272d37d8ccd 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -43,18 +43,25 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); + template + float benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args); + + template + std::vector benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nSplits, int nLaunches, int blocks, int threads, T&... args); + // Main interface - void generalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters - void run(); // Execute all specified callbacks - void generalFinalize(); // Cleanup - void printDevices(); // Dump info + void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters + void run(); // Execute all specified callbacks + void globalFinalize(); // Cleanup + void printDevices(); // Dump info // Initializations/Finalizations of tests. 
Not to be measured, in principle used for report void readingInit(); void readingFinalize(); // Benchmark kernel callbacks - void readingBenchmark(int iterations); + void readingSequential(SplitLevel sl); + void readingConcurrent(SplitLevel sl); private: gpuState mState; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 8551419efd4ff..9682e69b2e5c3 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -36,6 +36,12 @@ namespace o2 { namespace benchmark { + +enum class SplitLevel { + Blocks, + Threads +}; + struct benchmarkOpts { benchmarkOpts() = default; @@ -75,7 +81,7 @@ struct gpuState { return gpuBuffersHost; } - int getNiterations() { return iterations; } + int getNKernelLaunches() { return iterations; } // Configuration size_t nMaxThreadsPerDimension; @@ -105,7 +111,7 @@ class ResultStreamer public: explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); - void storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry); + void storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -123,10 +129,10 @@ inline ResultStreamer::~ResultStreamer() delete mTreeStream; } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string type, float entry) +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry) { (*mTreeStream) - << (benchmarkName + type).data() + << (benchmarkName + "_" + type + "_" + split).data() << "elapsed=" << entry << "\n"; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index b8f0ac6691e9f..1612437149c4a 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -9,10 +9,13 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
/// -/// \file Kernels.cu +/// \file Kernels.{cu, hip.cxx} /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" +#if defined(__HIPCC__) +#include "hip/hip_runtime.h" +#endif #include #define GPUCHECK(error) \ @@ -58,16 +61,41 @@ GPUhd() buffer_type* getPartPtrOnScratch(buffer_type* scratchPtr, float partSize return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partSizeGB) * partNumber); } +GPUhd() int getCorrespondingSplitId(int blockId, int nPartitions, int nSplits = 1) +{ + return blockId * nSplits / nPartitions; +} + +template +GPUg() void read_single_segment_k( + int segmentId, + buffer_type* results, + buffer_type* scratch, + size_t bufferSize, + float partitionSize = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { + if (getPartPtrOnScratch(scratch, partitionSize, segmentId)[i] == static_cast(1)) { + results[segmentId] += getPartPtrOnScratch(scratch, partitionSize, segmentId)[i]; // should never happen and threads should be always in sync + } + } +} + template -GPUg() void readerKernel( +GPUg() void split_read_k( + int split, // Id of split partition + int nsplits, + int npartitions, buffer_type* results, buffer_type* scratch, size_t bufferSize, float partitionSize = 1.f) { - for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync + if (split == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { + if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { + results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync + } } } } @@ -210,6 +238,55 @@ float GPUbenchmark::measure(void (GPUbenchmark::*task) return diff; } +template +template +float GPUbenchmark::benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args) +{ + cudaEvent_t start, stop; + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + + GPUCHECK(cudaEventRecord(start)); + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { + (*kernel)<<>>(args...); // Stream is 0 by default, so that we don't have to convert cudaStream_t it in HIP header + } + GPUCHECK(cudaEventRecord(stop)); + + GPUCHECK(cudaEventSynchronize(stop)); + float milliseconds{0.f}; + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + + return milliseconds; +} + +template +template +std::vector GPUbenchmark::benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nStreams, int nLaunches, int blocks, int threads, T&... 
args) +{ + std::vector splitStarts(nStreams), splitStops(nStreams); + std::vector streams(nStreams); + std::vector splitResults(nStreams); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); + GPUCHECK(cudaEventCreate(&(splitStarts[iStream]))); + GPUCHECK(cudaEventCreate(&(splitStops[iStream]))); + GPUCHECK(cudaEventRecord(splitStarts[iStream], streams[iStream])); + + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive lanuches on the same stream + (*kernel)<<>>(iStream, nStreams, args...); + } + GPUCHECK(cudaEventRecord(splitStops[iStream], streams[iStream])); + } + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventSynchronize(splitStops[iStream])); + GPUCHECK(cudaEventElapsedTime(&(splitResults.at(iStream)), splitStarts[iStream], splitStops[iStream])); + } + + return splitResults; +} + template void GPUbenchmark::printDevices() { @@ -223,7 +300,7 @@ void GPUbenchmark::printDevices() } template -void GPUbenchmark::generalInit(const int deviceId) +void GPUbenchmark::globalInit(const int deviceId) { cudaDeviceProp props; size_t free; @@ -257,30 +334,69 @@ void GPUbenchmark::generalInit(const int deviceId) template void GPUbenchmark::readingInit() { - std::cout << ">>> Initializing reading benchmark with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; mState.hostReadingResultsVector.resize(mState.getMaxSegments()); GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); } template -void GPUbenchmark::readingBenchmark(int kernelLaunches) +void GPUbenchmark::readingSequential(SplitLevel sl) { - auto nBlocks{mState.getMaxSegments()}; - auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - for (auto launch{0}; launch < kernelLaunches; ++launch) { - gpu::readerKernel<<>>(mState.deviceReadingResultsPtr, mState.scratchPtr, mState.getPartitionCapacity(), mState.partitionSizeGB); + switch (sl) { + case SplitLevel::Blocks: + break; + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads"; + for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately + auto result = benchmarkSynchExecution(&gpu::read_single_segment_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iSegment), getType(), result); + } + } + break; + } + } + std::cout << " completed." 
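
[Editor's sketch] The timing scheme introduced in this patch wraps the whole launch loop between a single pair of CUDA events on the default stream, so the reported number is device time for all nLaunches launches rather than host wall-clock time as in the chrono-based measure() helper. A minimal standalone sketch of that pattern, with a hypothetical dummyKernel standing in for the benchmark kernels (devBuffer is assumed to hold at least blocks*threads ints):

#include <cuda_runtime.h>

__global__ void dummyKernel(int* out)
{
  out[blockIdx.x * blockDim.x + threadIdx.x] = 1;
}

// Time nLaunches back-to-back launches on the default stream with CUDA events.
float timeLaunchesMs(int nLaunches, int blocks, int threads, int* devBuffer)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start); // checkpoint enqueued before the first launch
  for (int i = 0; i < nLaunches; ++i) {
    dummyKernel<<<blocks, threads>>>(devBuffer);
  }
  cudaEventRecord(stop);      // checkpoint enqueued after the last launch
  cudaEventSynchronize(stop); // block the host until everything in between has executed

  float ms{0.f};
  cudaEventElapsedTime(&ms, start, stop); // elapsed device time in milliseconds
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}
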
<< std::endl; +} + +template +void GPUbenchmark::readingConcurrent(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto segments{mState.getMaxSegments()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurements{mOptions.nTests}; measurements--;) { + std::cout << std::setw(2) << ">>> Concurrent read benchmark, splitting on blocks"; + auto results = benchmarkAsynchExecution(&gpu::split_read_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); + } + } + break; + } + case SplitLevel::Threads: + break; } - GPUCHECK(cudaDeviceSynchronize()); + std::cout << " completed." << std::endl; } template void GPUbenchmark::readingFinalize() { GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceReadingResultsPtr)); } template -void GPUbenchmark::generalFinalize() +void GPUbenchmark::globalFinalize() { GPUCHECK(cudaFree(mState.scratchPtr)); } @@ -288,17 +404,16 @@ void GPUbenchmark::generalFinalize() template void GPUbenchmark::run() { - generalInit(0); - // Test calls go here - // - Reading + globalInit(0); + // Test calls go here: readingInit(); - for (auto measures{mOptions.nTests}; measures--;) { - auto result = measure(&GPUbenchmark::readingBenchmark, "Reading benchmark", mState.getNiterations()); - mStreamer.get()->storeBenchmarkEntry("readingBenchmark", getType(), result); - } + // - Reading + readingSequential(SplitLevel::Threads); + // - Split reading + readingConcurrent(SplitLevel::Blocks); readingFinalize(); - GPUbenchmark::generalFinalize(); + GPUbenchmark::globalFinalize(); } template class GPUbenchmark; From bc8e75b7e321c1ddb81578e8155f5db1ee6c36e0 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 1 Jul 2021 15:21:07 +0200 Subject: [PATCH 28/42] Tidy up kernels and improve output --- GPU/GPUbenchmark/cuda/Kernels.cu | 86 +++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 1612437149c4a..5be6250c42825 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -67,22 +67,54 @@ GPUhd() int getCorrespondingSplitId(int blockId, int nPartitions, int nSplits = } template -GPUg() void read_single_segment_k( +GPUd() void read_segment_singleblock(size_t threadId, + buffer_type* scratch, + buffer_type* results, + size_t blockDim, + size_t bufferSize, + float partSizeGB, + size_t segmentId) +{ + for (size_t i = threadId; i < bufferSize; i += blockDim) { + if (getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i] == static_cast(1)) { // actual read operation is performed here + results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync + } + } +} + +template +GPUd() void read_segment_multiblock(size_t threadId, + size_t blockId, + buffer_type* scratch, + buffer_type* results, + size_t blockDim, + size_t gridDim, + size_t bufferSize, + float partSizeGB, + size_t 
segmentId) +{ + for (int i = blockId * blockDim + threadId; i < bufferSize; i += blockDim * gridDim) { + if (getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i] == static_cast(1)) { // actual read operation is performed here + results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync + } + } +} + +template +GPUg() void read_segment_singleblock_k( int segmentId, buffer_type* results, buffer_type* scratch, size_t bufferSize, float partitionSize = 1.f) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < bufferSize; i += blockDim.x * gridDim.x) { - if (getPartPtrOnScratch(scratch, partitionSize, segmentId)[i] == static_cast(1)) { - results[segmentId] += getPartPtrOnScratch(scratch, partitionSize, segmentId)[i]; // should never happen and threads should be always in sync - } + if (segmentId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + read_segment_singleblock(threadIdx.x, scratch, results, blockDim.x, bufferSize, partitionSize, segmentId); } } template -GPUg() void split_read_k( +GPUg() void split_read_singleblock_k( int split, // Id of split partition int nsplits, int npartitions, @@ -99,6 +131,18 @@ GPUg() void split_read_k( } } } + +template +GPUg() void read_single_segment_multiblock_k( + int segmentId, + buffer_type* results, + buffer_type* scratch, + size_t bufferSize, + float partitionSize = 1.f) +{ + read_segment_multiblock(threadIdx.x, blockIdx.x, scratch, results, blockDim.x, gridDim.x, bufferSize, partitionSize, segmentId); +} + /////////////////// } // namespace gpu @@ -315,7 +359,7 @@ void GPUbenchmark::globalInit(const int deviceId) mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); - std::cout << ">>> Running on: \e[1m" << props.name << "\e[0m" << std::endl; + std::cout << ">>> Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); @@ -343,24 +387,38 @@ template void GPUbenchmark::readingSequential(SplitLevel sl) { switch (sl) { - case SplitLevel::Blocks: + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on blocks:"; + for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately + auto result = benchmarkSynchExecution(&gpu::read_segment_singleblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iSegment), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } break; + } + case SplitLevel::Threads: { auto nBlocks{mState.nMultiprocessors}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - 
std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads"; + std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads:"; for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately - auto result = benchmarkSynchExecution(&gpu::read_single_segment_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + auto result = benchmarkSynchExecution(&gpu::read_single_segment_multiblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iSegment), getType(), result); } + std::cout << "\033[1;32m complete\033[0m" << std::endl; } break; } } - std::cout << " completed." << std::endl; } template @@ -375,7 +433,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl) for (auto measurements{mOptions.nTests}; measurements--;) { std::cout << std::setw(2) << ">>> Concurrent read benchmark, splitting on blocks"; - auto results = benchmarkAsynchExecution(&gpu::split_read_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + auto results = benchmarkAsynchExecution(&gpu::split_read_singleblock_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); } @@ -409,8 +467,10 @@ void GPUbenchmark::run() readingInit(); // - Reading readingSequential(SplitLevel::Threads); + readingSequential(SplitLevel::Blocks); + // - Split reading - readingConcurrent(SplitLevel::Blocks); + // readingConcurrent(SplitLevel::Blocks); readingFinalize(); GPUbenchmark::globalFinalize(); From 71408e894498a162c7f9d2738db1d9c9a2dfaf8c Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 7 Jul 2021 09:47:39 +0200 Subject: [PATCH 29/42] Update read test --- GPU/GPUbenchmark/Shared/Kernels.h | 21 +- GPU/GPUbenchmark/Shared/Utils.h | 22 +- GPU/GPUbenchmark/benchmark.cxx | 2 +- GPU/GPUbenchmark/cuda/Kernels.cu | 323 ++++++++++++++++-------------- 4 files changed, 194 insertions(+), 174 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 09272d37d8ccd..2f6f06764d373 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -23,15 +23,12 @@ #include #include -// #define PARTITION_SIZE_GB 1 -// #define FREE_MEMORY_FRACTION_TO_ALLOCATE 0.95f - namespace o2 { namespace benchmark { -template +template class GPUbenchmark final { public: @@ -43,12 +40,20 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); + // Single stream synchronous (sequential kernels) execution template - float benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args); + float benchmarkSync(void (*kernel)(T...), + int nLaunches, int blocks, int threads, T&... 
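
[Editor's sketch] The two read strategies above differ only in how threads stride over a chunk: with SplitLevel::Blocks a single block walks the whole chunk with a blockDim.x stride, while with SplitLevel::Threads the entire grid shares one chunk through a grid-stride loop. The real kernels guard each read with a compare against 1 that never fires because the scratch buffer is memset to zero, which keeps the loads from being optimised away; the condensed sketch below (hypothetical names, int elements) uses an equivalent sink variable for the same purpose:

// One block reads the whole chunk: thread t touches t, t + blockDim.x, t + 2*blockDim.x, ...
__global__ void readBlockStride(const int* chunk, size_t n, int* sink)
{
  int acc = 0;
  for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
    acc += chunk[i];
  }
  if (acc == -1) {
    *sink = acc; // never taken for zero-initialised data; keeps the reads observable
  }
}

// The whole grid reads one chunk: classic grid-stride loop.
__global__ void readGridStride(const int* chunk, size_t n, int* sink)
{
  int acc = 0;
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)blockDim.x * gridDim.x) {
    acc += chunk[i];
  }
  if (acc == -1) {
    *sink = acc;
  }
}
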
args); + // Multi-streams asynchronous executions on whole memory template - std::vector benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nSplits, int nLaunches, int blocks, int threads, T&... args); + std::vector benchmarkAsync(void (*kernel)(int, T...), + int nStreams, int nLaunches, int blocks, int threads, T&... args); + // Per-memory region benchmarking + template + std::vector benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), + int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... args); // Main interface void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks @@ -61,10 +66,10 @@ class GPUbenchmark final // Benchmark kernel callbacks void readingSequential(SplitLevel sl); - void readingConcurrent(SplitLevel sl); + void readingConcurrent(SplitLevel sl, int nRegions = 2); private: - gpuState mState; + gpuState mState; std::shared_ptr mStreamer; benchmarkOpts mOptions; }; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 9682e69b2e5c3..8f43647090a8b 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -45,7 +45,7 @@ enum class SplitLevel { struct benchmarkOpts { benchmarkOpts() = default; - float partitionSizeGB = 1.f; + float chunkReservedGB = 1.f; float freeMemoryFractionToAllocate = 0.95f; int kernelLaunches = 1; int nTests = 1; @@ -53,22 +53,22 @@ struct benchmarkOpts { template struct gpuState { - int getMaxSegments() + int getMaxChunks() { - return (double)scratchSize / (partitionSizeGB * GB); + return (double)scratchSize / (chunkReservedGB * GB); } void computeScratchPtrs() { - partAddrOnHost.resize(getMaxSegments()); - for (size_t iBuffAddress{0}; iBuffAddress < getMaxSegments(); ++iBuffAddress) { - partAddrOnHost[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partitionSizeGB) * iBuffAddress); + partAddrOnHost.resize(getMaxChunks()); + for (size_t iBuffAddress{0}; iBuffAddress < getMaxChunks(); ++iBuffAddress) { + partAddrOnHost[iBuffAddress] = reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * iBuffAddress); } } size_t getPartitionCapacity() { - return static_cast(GB * partitionSizeGB / sizeof(T)); + return static_cast(GB * chunkReservedGB / sizeof(T)); } std::vector getScratchPtrs() @@ -87,7 +87,7 @@ struct gpuState { size_t nMaxThreadsPerDimension; int iterations; - float partitionSizeGB; // Size of each partition (GB) + float chunkReservedGB; // Size of each partition (GB) // General containers and state T* scratchPtr; // Pointer to scratch buffer @@ -111,7 +111,7 @@ class ResultStreamer public: explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); - void storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry); + void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -129,10 +129,10 @@ inline ResultStreamer::~ResultStreamer() delete mTreeStream; } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string split, std::string type, float entry) +inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) { (*mTreeStream) - << (benchmarkName + "_" + type + "_" + split).data() + << (benchmarkName + "_" + 
type + "_" + chunk).data() << "elapsed=" << entry << "\n"; } diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 0abd0d52be1ba..070f43cc94add 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -42,7 +42,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) } conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); - conf.partitionSizeGB = vm["chunkSize"].as(); + conf.chunkReservedGB = vm["chunkSize"].as(); conf.kernelLaunches = vm["launches"].as(); conf.nTests = vm["ntests"].as(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5be6250c42825..78e2cbb4c82ca 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -18,6 +18,12 @@ #endif #include +// Memory partition legend +// +// |----------------------region 0-----------------|----------------------region 1-----------------| regions -> deafult: 2, to test lower and upper RAM +// |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (single block pins) +// |__________________________________________scratch______________________________________________| scratch -> default size: 95% free GPU RAM + #define GPUCHECK(error) \ if (error != cudaSuccess) { \ printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \ @@ -28,6 +34,11 @@ double bytesToKB(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } +int getCorrespondingRegionId(int Id, int nChunks, int nRegions = 1) +{ + return Id * nRegions / nChunks; +} + template std::string getType() { @@ -53,98 +64,47 @@ namespace benchmark namespace gpu { -/////////////////// -/// Kernels and device functions go here -template -GPUhd() buffer_type* getPartPtrOnScratch(buffer_type* scratchPtr, float partSizeGB, size_t partNumber) +/////////////////////////// +// Device functions go here +template +GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber) { - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partSizeGB) * partNumber); -} - -GPUhd() int getCorrespondingSplitId(int blockId, int nPartitions, int nSplits = 1) -{ - return blockId * nSplits / nPartitions; -} - -template -GPUd() void read_segment_singleblock(size_t threadId, - buffer_type* scratch, - buffer_type* results, - size_t blockDim, - size_t bufferSize, - float partSizeGB, - size_t segmentId) -{ - for (size_t i = threadId; i < bufferSize; i += blockDim) { - if (getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i] == static_cast(1)) { // actual read operation is performed here - results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync - } - } + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); } -template -GPUd() void read_segment_multiblock(size_t threadId, - size_t blockId, - buffer_type* scratch, - buffer_type* results, - size_t blockDim, - size_t gridDim, - size_t bufferSize, - float partSizeGB, - size_t segmentId) +////////////////// +// Kernels go here +template +GPUg() void readChunkSBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) { - for (int i = blockId * blockDim + threadId; i < bufferSize; i += blockDim * gridDim) { - if (getPartPtrOnScratch(scratch, partSizeGB, 
segmentId)[i] == static_cast(1)) { // actual read operation is performed here - results[segmentId] += getPartPtrOnScratch(scratch, partSizeGB, segmentId)[i]; // this case should never happen and waves should be always in sync + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here + results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync + } } } } -template -GPUg() void read_segment_singleblock_k( - int segmentId, - buffer_type* results, - buffer_type* scratch, - size_t bufferSize, - float partitionSize = 1.f) +template +GPUg() void readChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) { - if (segmentId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split - read_segment_singleblock(threadIdx.x, scratch, results, blockDim.x, bufferSize, partitionSize, segmentId); - } -} - -template -GPUg() void split_read_singleblock_k( - int split, // Id of split partition - int nsplits, - int npartitions, - buffer_type* results, - buffer_type* scratch, - size_t bufferSize, - float partitionSize = 1.f) -{ - if (split == blockIdx.x) { // runs only if blockIdx.x is allowed in given split - for (size_t i = threadIdx.x; i < bufferSize; i += blockDim.x) { - if (getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i] == static_cast(1)) { - results[blockIdx.x] += getPartPtrOnScratch(scratch, partitionSize, blockIdx.x)[i]; // should never happen and threads should be always in sync - } + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here + results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync } } } - -template -GPUg() void read_single_segment_multiblock_k( - int segmentId, - buffer_type* results, - buffer_type* scratch, - size_t bufferSize, - float partitionSize = 1.f) -{ - read_segment_multiblock(threadIdx.x, blockIdx.x, scratch, results, blockDim.x, gridDim.x, bufferSize, partitionSize, segmentId); -} - -/////////////////// - } // namespace gpu void printDeviceProp(int deviceId) @@ -267,72 +227,98 @@ void printDeviceProp(int deviceId) << (float)free / total * 100.0 << "%)" << std::endl; } -template -template -float GPUbenchmark::measure(void (GPUbenchmark::*task)(T...), const char* taskName, T&&... args) -{ - float diff{0.f}; - std::cout << std::setw(2) << ">>> " << taskName; - auto start = std::chrono::high_resolution_clock::now(); - (this->*task)(std::forward(args)...); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff_t{end - start}; - diff = diff_t.count(); - std::cout << std::setw(2) << " completed in: \x1B[32m" << diff << " ms\x1B[0m" << std::endl; - return diff; -} - -template +template template -float GPUbenchmark::benchmarkSynchExecution(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args) +float GPUbenchmark::benchmarkSync(void (*kernel)(T...), + int nLaunches, int blocks, int threads, T&... 
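
[Editor's sketch] All chunk addressing is plain pointer arithmetic on the single scratch allocation: chunk i starts i * chunkReservedGB gigabytes past the base pointer, and one chunk holds GB * chunkReservedGB / sizeof(chunk_type) elements, which is what getPartPtrOnScratch and getPartitionCapacity compute. A host-side restatement of the same arithmetic with illustrative names (GB taken as 1024^3, in line with bytesToKB/bytesToGB above):

#include <cstddef>

constexpr double GB = 1024. * 1024. * 1024.;

// Pointer to the first element of chunk `chunkId`, with every chunk reserving `chunkReservedGB` GB.
template <class chunk_t>
chunk_t* chunkPtr(chunk_t* scratch, float chunkReservedGB, size_t chunkId)
{
  return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratch) +
                                    static_cast<size_t>(GB * chunkReservedGB) * chunkId);
}

// Number of chunk_t elements that fit into one chunk.
template <class chunk_t>
size_t chunkCapacity(float chunkReservedGB)
{
  return static_cast<size_t>(GB * chunkReservedGB / sizeof(chunk_t));
}
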
args) // run for each chunk (id is passed in variadic args) { cudaEvent_t start, stop; GPUCHECK(cudaEventCreate(&start)); GPUCHECK(cudaEventCreate(&stop)); GPUCHECK(cudaEventRecord(start)); - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { - (*kernel)<<>>(args...); // Stream is 0 by default, so that we don't have to convert cudaStream_t it in HIP header + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(args...); } - GPUCHECK(cudaEventRecord(stop)); + GPUCHECK(cudaEventRecord(stop)); // record checkpoint - GPUCHECK(cudaEventSynchronize(stop)); + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions float milliseconds{0.f}; GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); return milliseconds; } -template +template template -std::vector GPUbenchmark::benchmarkAsynchExecution(void (*kernel)(int, int, T...), int nStreams, int nLaunches, int blocks, int threads, T&... args) +std::vector GPUbenchmark::benchmarkAsync(void (*kernel)(int, T...), + int nStreams, int nLaunches, int blocks, int threads, T&... args) { - std::vector splitStarts(nStreams), splitStops(nStreams); + std::vector starts(nStreams), stops(nStreams); std::vector streams(nStreams); - std::vector splitResults(nStreams); + std::vector results(nStreams); - for (auto iStream{0}; iStream < nStreams; ++iStream) { + for (auto iStream{0}; iStream < nStreams; ++iStream) { // one stream per chunk GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); - GPUCHECK(cudaEventCreate(&(splitStarts[iStream]))); - GPUCHECK(cudaEventCreate(&(splitStops[iStream]))); - GPUCHECK(cudaEventRecord(splitStarts[iStream], streams[iStream])); + GPUCHECK(cudaEventCreate(&(starts[iStream]))); + GPUCHECK(cudaEventCreate(&(stops[iStream]))); + } - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive lanuches on the same stream - (*kernel)<<>>(iStream, nStreams, args...); + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventRecord(starts[iStream], streams[iStream])); + + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream + (*kernel)<<>>(iStream, args...); } - GPUCHECK(cudaEventRecord(splitStops[iStream], streams[iStream])); + GPUCHECK(cudaEventRecord(stops[iStream], streams[iStream])); } for (auto iStream{0}; iStream < nStreams; ++iStream) { - GPUCHECK(cudaEventSynchronize(splitStops[iStream])); - GPUCHECK(cudaEventElapsedTime(&(splitResults.at(iStream)), splitStarts[iStream], splitStops[iStream])); + GPUCHECK(cudaEventSynchronize(stops[iStream])); + GPUCHECK(cudaEventElapsedTime(&(results.at(iStream)), starts[iStream], stops[iStream])); + } + + return results; +} + +template +template +std::vector GPUbenchmark::benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), + int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... 
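
[Editor's sketch] benchmarkAsync gives every chunk its own stream and its own start/stop event pair, enqueues all launches before synchronising anything, and only afterwards collects the per-stream elapsed times, so the chunks are read concurrently. A condensed standalone sketch of that scheduling, with a hypothetical touchChunk kernel and a simplified argument list:

#include <vector>
#include <cuda_runtime.h>

__global__ void touchChunk(int chunkId, char* scratch)
{
  // placeholder body: a real benchmark kernel would stream through chunk #chunkId of scratch here
}

std::vector<float> timePerStreamMs(int nStreams, int nLaunches, int blocks, int threads, char* scratch)
{
  std::vector<cudaStream_t> streams(nStreams);
  std::vector<cudaEvent_t> starts(nStreams), stops(nStreams);
  std::vector<float> ms(nStreams);

  for (int s = 0; s < nStreams; ++s) { // one stream and one event pair per chunk
    cudaStreamCreate(&streams[s]);
    cudaEventCreate(&starts[s]);
    cudaEventCreate(&stops[s]);
  }
  for (int s = 0; s < nStreams; ++s) { // enqueue everything first: work on different streams overlaps
    cudaEventRecord(starts[s], streams[s]);
    for (int l = 0; l < nLaunches; ++l) {
      touchChunk<<<blocks, threads, 0, streams[s]>>>(s, scratch);
    }
    cudaEventRecord(stops[s], streams[s]);
  }
  for (int s = 0; s < nStreams; ++s) { // only now synchronise and read back the timings
    cudaEventSynchronize(stops[s]);
    cudaEventElapsedTime(&ms[s], starts[s], stops[s]);
    cudaStreamDestroy(streams[s]);
    cudaEventDestroy(starts[s]);
    cudaEventDestroy(stops[s]);
  }
  return ms;
}
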
args) +{ + std::vector starts(nRegions), stops(nRegions); // I want one event per region + std::vector streams(nStreams); + std::vector results(nRegions); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); } - return splitResults; + for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { + GPUCHECK(cudaEventCreate(&(starts[iRegion]))); + GPUCHECK(cudaEventCreate(&(stops[iRegion]))); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + if (getCorrespondingRegionId(iStream, nStreams, nRegions) == iRegion) { + std::cout << "DEBUG: stream " << iStream << " " << getCorrespondingRegionId(iStream, nStreams, nRegions) << std::endl; + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream + (*kernel)<<>>(iStream, nStreams, args...); + } + } + GPUCHECK(cudaEventRecord(stops[iRegion], streams[iRegion])); + } + } + + for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { + GPUCHECK(cudaEventSynchronize(stops[iRegion])); + GPUCHECK(cudaEventElapsedTime(&(results.at(iRegion)), starts[iRegion], stops[iRegion])); + } + + return results; } -template -void GPUbenchmark::printDevices() +template +void GPUbenchmark::printDevices() { int deviceCnt; GPUCHECK(cudaGetDeviceCount(&deviceCnt)); @@ -343,8 +329,8 @@ void GPUbenchmark::printDevices() } } -template -void GPUbenchmark::globalInit(const int deviceId) +template +void GPUbenchmark::globalInit(const int deviceId) { cudaDeviceProp props; size_t free; @@ -353,7 +339,7 @@ void GPUbenchmark::globalInit(const int deviceId) GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); - mState.partitionSizeGB = mOptions.partitionSizeGB; + mState.chunkReservedGB = mOptions.chunkReservedGB; mState.iterations = mOptions.kernelLaunches; mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; @@ -367,24 +353,24 @@ void GPUbenchmark::globalInit(const int deviceId) mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) - std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl + std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" - << " ├ Number of scratch partitions: " << mState.getMaxSegments() << " of " << mOptions.partitionSizeGB << "GB each\n" - << " └ Each partition can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl + << " ├ Number of scratch chunks: " << mState.getMaxChunks() << " of " << mOptions.chunkReservedGB << "GB each\n" + << " └ Each chunk can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl << std::endl; } -template -void GPUbenchmark::readingInit() +template +void GPUbenchmark::readingInit() { std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; - mState.hostReadingResultsVector.resize(mState.getMaxSegments()); - GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxSegments() * sizeof(buffer_type))); + mState.hostReadingResultsVector.resize(mState.getMaxChunks()); + 
GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); } -template -void GPUbenchmark::readingSequential(SplitLevel sl) +template +void GPUbenchmark::readingSequential(SplitLevel sl) { switch (sl) { case SplitLevel::Blocks: { @@ -393,10 +379,18 @@ void GPUbenchmark::readingSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on blocks:"; - for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately - auto result = benchmarkSynchExecution(&gpu::read_segment_singleblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iSegment), getType(), result); + std::cout << std::setw(2) << ">>> Sequential read benchmark, one block per chunk:"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadingResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -410,9 +404,17 @@ void GPUbenchmark::readingSequential(SplitLevel sl) for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads:"; - for (auto iSegment{0}; iSegment < mState.getMaxSegments(); ++iSegment) { // loop over single segments separately - auto result = benchmarkSynchExecution(&gpu::read_single_segment_multiblock_k, mState.getNKernelLaunches(), nBlocks, nThreads, iSegment, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iSegment), getType(), result); + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadingResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -421,59 +423,72 @@ void GPUbenchmark::readingSequential(SplitLevel sl) } } -template -void GPUbenchmark::readingConcurrent(SplitLevel sl) +template +void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) { switch (sl) { case SplitLevel::Blocks: { auto nBlocks{mState.nMultiprocessors}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - auto segments{mState.getMaxSegments()}; + auto chunks{mState.getMaxChunks()}; auto capacity{mState.getPartitionCapacity()}; for (auto measurements{mOptions.nTests}; measurements--;) { - std::cout << std::setw(2) << ">>> Concurrent read benchmark, splitting on blocks"; - auto results = 
benchmarkAsynchExecution(&gpu::split_read_singleblock_k, mState.getMaxSegments(), mState.getNKernelLaunches(), nBlocks, nThreads, segments, mState.deviceReadingResultsPtr, mState.scratchPtr, capacity, mState.partitionSizeGB); + std::cout << std::setw(2) << ">>> Concurrent read benchmark, one block per chunk"; + auto results = benchmarkAsync(&gpu::readChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { - mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); + mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); } } break; } - case SplitLevel::Threads: + case SplitLevel::Threads: { + // auto nBlocks{mState.nMultiprocessors}; + // auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + // auto chunks{mState.getMaxChunks()}; + // auto capacity{mState.getPartitionCapacity()}; break; + } } std::cout << " completed." << std::endl; } -template -void GPUbenchmark::readingFinalize() +template +void GPUbenchmark::readingFinalize() { - GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxSegments() * sizeof(buffer_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); GPUCHECK(cudaFree(mState.deviceReadingResultsPtr)); } -template -void GPUbenchmark::globalFinalize() +template +void GPUbenchmark::globalFinalize() { GPUCHECK(cudaFree(mState.scratchPtr)); } -template -void GPUbenchmark::run() +template +void GPUbenchmark::run() { globalInit(0); // Test calls go here: readingInit(); - // - Reading + // - Reading whole memory readingSequential(SplitLevel::Threads); readingSequential(SplitLevel::Blocks); - // - Split reading - // readingConcurrent(SplitLevel::Blocks); + // - Reading memory partitions + readingConcurrent(SplitLevel::Blocks); readingFinalize(); - GPUbenchmark::globalFinalize(); + GPUbenchmark::globalFinalize(); } template class GPUbenchmark; From 05cc7ae4b1b76801b2bff45b2430d9b1d3bad305 Mon Sep 17 00:00:00 2001 From: ALICE Builder Date: Wed, 7 Jul 2021 20:32:53 +0200 Subject: [PATCH 30/42] Please consider the following formatting changes (#18) --- GPU/GPUbenchmark/Shared/Kernels.h | 6 +++--- GPU/GPUbenchmark/cuda/Kernels.cu | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 2f6f06764d373..fc64c414bf391 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -43,17 +43,17 @@ class GPUbenchmark final // Single stream synchronous (sequential kernels) execution template float benchmarkSync(void (*kernel)(T...), - int nLaunches, int blocks, int threads, T&... args); + int nLaunches, int blocks, int threads, T&... args); // Multi-streams asynchronous executions on whole memory template std::vector benchmarkAsync(void (*kernel)(int, T...), - int nStreams, int nLaunches, int blocks, int threads, T&... args); + int nStreams, int nLaunches, int blocks, int threads, T&... 
args); // Per-memory region benchmarking template std::vector benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), - int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... args); + int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... args); // Main interface void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 78e2cbb4c82ca..5d894ca524fa7 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -436,14 +436,14 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) for (auto measurements{mOptions.nTests}; measurements--;) { std::cout << std::setw(2) << ">>> Concurrent read benchmark, one block per chunk"; auto results = benchmarkAsync(&gpu::readChunkSBKernel, - mState.getMaxChunks(), // nStreams - mState.getNKernelLaunches(), - nBlocks, - nThreads, - mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) - mState.scratchPtr, - capacity, - mState.chunkReservedGB); + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); } From 4ae3ae4c766e2aef10aff442bb045e82d7927694 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 19:06:20 +0200 Subject: [PATCH 31/42] CP --- GPU/GPUbenchmark/Shared/Kernels.h | 4 -- GPU/GPUbenchmark/Shared/Utils.h | 10 ++++ GPU/GPUbenchmark/benchmark.cxx | 4 +- GPU/GPUbenchmark/cuda/Kernels.cu | 85 +++++++++++++------------------ 4 files changed, 47 insertions(+), 56 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index fc64c414bf391..c741ce31a7f0c 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -50,10 +50,6 @@ class GPUbenchmark final std::vector benchmarkAsync(void (*kernel)(int, T...), int nStreams, int nLaunches, int blocks, int threads, T&... args); - // Per-memory region benchmarking - template - std::vector benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), - int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... 
args); // Main interface void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 8f43647090a8b..931ffbfc1fc75 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -46,6 +46,7 @@ struct benchmarkOpts { benchmarkOpts() = default; float chunkReservedGB = 1.f; + int nRegions = 2; float freeMemoryFractionToAllocate = 0.95f; int kernelLaunches = 1; int nTests = 1; @@ -112,6 +113,7 @@ class ResultStreamer explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); ~ResultStreamer(); void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); + void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); private: std::string mDebugTreeFileName = "benchmark_results.root"; // output filename @@ -137,6 +139,14 @@ inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std:: << "\n"; } +inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) +{ + (*mTreeStream) + << (benchmarkName + "_" + type + "_region_" + region).data() + << "elapsed=" << entry + << "\n"; +} + } // namespace benchmark } // namespace o2 diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 070f43cc94add..4dfc039c34efe 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -22,8 +22,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) options.add_options()( "help,h", "Print help message.")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( + "regions,r", bpo::value()->default_value(2), "Number of memory regions to partition RAM in.")( "freeMemFraction,f", bpo::value()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")( - "launches,l", bpo::value()->default_value(50), "Number of iterations in reading kernels.")( + "launches,l", bpo::value()->default_value(10), "Number of iterations in reading kernels.")( "ntests,n", bpo::value()->default_value(1), "Number of times each test is run."); try { bpo::store(parse_command_line(argc, argv, options), vm); @@ -43,6 +44,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as(); conf.chunkReservedGB = vm["chunkSize"].as(); + conf.nRegions = vm["regions"].as(); conf.kernelLaunches = vm["launches"].as(); conf.nTests = vm["ntests"].as(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5d894ca524fa7..6510d153c5a9d 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -281,42 +281,6 @@ std::vector GPUbenchmark::benchmarkAsync(void (*kernel)(int, return results; } -template -template -std::vector GPUbenchmark::benchmarkAsyncVsRegion(void (*kernel)(int, int, T...), - int nRegions, int nStreams, int nLaunches, int blocks, int threads, T&... 
args) -{ - std::vector starts(nRegions), stops(nRegions); // I want one event per region - std::vector streams(nStreams); - std::vector results(nRegions); - - for (auto iStream{0}; iStream < nStreams; ++iStream) { - GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); - } - - for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { - GPUCHECK(cudaEventCreate(&(starts[iRegion]))); - GPUCHECK(cudaEventCreate(&(stops[iRegion]))); - - for (auto iStream{0}; iStream < nStreams; ++iStream) { - if (getCorrespondingRegionId(iStream, nStreams, nRegions) == iRegion) { - std::cout << "DEBUG: stream " << iStream << " " << getCorrespondingRegionId(iStream, nStreams, nRegions) << std::endl; - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream - (*kernel)<<>>(iStream, nStreams, args...); - } - } - GPUCHECK(cudaEventRecord(stops[iRegion], streams[iRegion])); - } - } - - for (auto iRegion{nRegions}; iRegion < nRegions; ++iRegion) { - GPUCHECK(cudaEventSynchronize(stops[iRegion])); - GPUCHECK(cudaEventElapsedTime(&(results.at(iRegion)), starts[iRegion], stops[iRegion])); - } - - return results; -} - template void GPUbenchmark::printDevices() { @@ -378,8 +342,8 @@ void GPUbenchmark::readingSequential(SplitLevel sl) auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; - for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read benchmark, one block per chunk:"; + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkSBKernel, mState.getNKernelLaunches(), @@ -402,8 +366,8 @@ void GPUbenchmark::readingSequential(SplitLevel sl) auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; - for (auto measurements{mOptions.nTests}; measurements--;) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read benchmark, splitting on threads:"; + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential read, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkMBKernel, mState.getNKernelLaunches(), @@ -424,7 +388,7 @@ void GPUbenchmark::readingSequential(SplitLevel sl) } template -void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) +void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) { switch (sl) { case SplitLevel::Blocks: { @@ -433,8 +397,8 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) auto chunks{mState.getMaxChunks()}; auto capacity{mState.getPartitionCapacity()}; - for (auto measurements{mOptions.nTests}; measurements--;) { - std::cout << std::setw(2) << ">>> Concurrent read benchmark, one block per chunk"; + for (auto measurement{0}; measurement < mOptions.nTests; 
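The region loop of the benchmarkAsyncVsRegion body removed here starts at iRegion{nRegions} and tests iRegion < nRegions, so it never executes; replacing it with plain benchmarkAsync plus a host-side mapping of per-stream results onto regions is therefore a correctness fix as well as a simplification. For reference, a minimal stand-in for the per-stream timing pattern benchmarkAsync relies on (names and error handling simplified, not the implementation in Kernels.cu) could look like:

#include <cuda_runtime.h>
#include <vector>

// Illustrative only: launch nLaunches back-to-back kernels on each of nStreams
// streams, time each stream with its own event pair, return elapsed ms per stream.
template <typename... T>
std::vector<float> timePerStream(void (*kernel)(int, T...),
                                 int nStreams, int nLaunches, int blocks, int threads, T&... args)
{
  std::vector<cudaStream_t> streams(nStreams);
  std::vector<cudaEvent_t> starts(nStreams), stops(nStreams);
  std::vector<float> elapsedMs(nStreams);

  for (int iStream = 0; iStream < nStreams; ++iStream) {
    cudaStreamCreate(&streams[iStream]);
    cudaEventCreate(&starts[iStream]);
    cudaEventCreate(&stops[iStream]);
    cudaEventRecord(starts[iStream], streams[iStream]);
    for (int iLaunch = 0; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream
      (*kernel)<<<blocks, threads, 0, streams[iStream]>>>(iStream, args...);
    }
    cudaEventRecord(stops[iStream], streams[iStream]);
  }

  for (int iStream = 0; iStream < nStreams; ++iStream) {
    cudaEventSynchronize(stops[iStream]);
    cudaEventElapsedTime(&elapsedMs[iStream], starts[iStream], stops[iStream]);
    cudaEventDestroy(starts[iStream]);
    cudaEventDestroy(stops[iStream]);
    cudaStreamDestroy(streams[iStream]);
  }
  return elapsedMs;
}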
++measurement) { + std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkSBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -445,20 +409,39 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nChunks) capacity, mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { - mStreamer.get()->storeBenchmarkEntry("readConcurrentSplitBlocks", std::to_string(iResult), getType(), results[iResult]); + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]); } + std::cout << "\033[1;32m complete\033[0m" << std::endl; } break; } case SplitLevel::Threads: { - // auto nBlocks{mState.nMultiprocessors}; - // auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; - // auto chunks{mState.getMaxChunks()}; - // auto capacity{mState.getPartitionCapacity()}; + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::readChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } break; } } - std::cout << " completed." 
<< std::endl; } template @@ -481,8 +464,8 @@ void GPUbenchmark::run() // Test calls go here: readingInit(); // - Reading whole memory - readingSequential(SplitLevel::Threads); - readingSequential(SplitLevel::Blocks); + // readingSequential(SplitLevel::Threads); + // readingSequential(SplitLevel::Blocks); // - Reading memory partitions readingConcurrent(SplitLevel::Blocks); From 195aae766adfb8e4b38cd9d5acb1fbba400dc197 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 22:13:19 +0200 Subject: [PATCH 32/42] Add last read test --- GPU/GPUbenchmark/benchmark.cxx | 8 ++++---- GPU/GPUbenchmark/cuda/Kernels.cu | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 4dfc039c34efe..6b1d5338db5c1 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -66,10 +66,10 @@ int main(int argc, const char* argv[]) o2::benchmark::GPUbenchmark bm_char{opts, streamer}; bm_char.run(); - // o2::benchmark::GPUbenchmark bm_int{opts, streamer}; - // bm_int.run(); - // o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; - // bm_size_t.run(); + o2::benchmark::GPUbenchmark bm_int{opts, streamer}; + bm_int.run(); + o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; + bm_size_t.run(); return 0; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 6510d153c5a9d..e182d3889ded4 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -423,7 +423,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << ">>> Concurrent read, split on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkMBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -464,11 +464,12 @@ void GPUbenchmark::run() // Test calls go here: readingInit(); // - Reading whole memory - // readingSequential(SplitLevel::Threads); - // readingSequential(SplitLevel::Blocks); + readingSequential(SplitLevel::Threads); + readingSequential(SplitLevel::Blocks); // - Reading memory partitions readingConcurrent(SplitLevel::Blocks); + readingConcurrent(SplitLevel::Threads); readingFinalize(); GPUbenchmark::globalFinalize(); From 33b19bec7a405490b8425d4e53418a5851fb2edf Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 22:17:22 +0200 Subject: [PATCH 33/42] Add last read test --- GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e182d3889ded4..e4784da0bb5d5 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -464,8 +464,8 @@ void GPUbenchmark::run() // Test calls go here: readingInit(); // - Reading whole memory - readingSequential(SplitLevel::Threads); readingSequential(SplitLevel::Blocks); + readingSequential(SplitLevel::Threads); // - Reading memory partitions readingConcurrent(SplitLevel::Blocks); From 434c9af4fceca965ddae2dd8b02a7d2d08b3f305 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 8 Jul 2021 22:36:50 +0200 Subject: [PATCH 34/42] Fix result dump on file --- GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e4784da0bb5d5..9cba1e713f852 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -434,7 +434,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) capacity, mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { - auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); mStreamer.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; From 9daeab5b442849df500e26995f3bd9dfec9c465e Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Fri, 9 Jul 2021 19:00:30 +0200 Subject: [PATCH 35/42] add reading kernel --- GPU/GPUbenchmark/cuda/Kernels.cu | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 9cba1e713f852..5149245789e61 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -105,6 +105,36 @@ GPUg() void readChunkMBKernel( } } } + +// Writing +template +GPUg() void writeChunkSBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } + } +} + +template +GPUg() void writeChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } +} + } // namespace gpu void printDeviceProp(int deviceId) From e309004c388f114fba0e3f1acfbbaaf706e0e835 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sat, 10 Jul 2021 10:51:08 +0200 Subject: [PATCH 36/42] Add write tests --- GPU/GPUbenchmark/CMakeLists.txt | 2 +- GPU/GPUbenchmark/Shared/Kernels.h | 16 ++- GPU/GPUbenchmark/Shared/Utils.h | 8 +- GPU/GPUbenchmark/cuda/Kernels.cu | 183 ++++++++++++++++++++++++++---- 4 files changed, 178 insertions(+), 31 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 9ed33e179cc84..9151acc8bc478 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -33,7 +33,7 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} | tee ${HIP_KERNEL_PATH}") + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index c741ce31a7f0c..b39dc2f3a836a 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -57,12 +57,18 @@ class GPUbenchmark final void printDevices(); // Dump info // Initializations/Finalizations of tests. 
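The region index stored with each concurrent result comes from getCorrespondingRegionId(iResult, nBlocks, nRegions), whose definition is not part of these hunks; the fix above only changes which count is passed as the second argument. Under the assumption that the helper buckets an index into nRegions equal slices of that count, an equivalent mapping would be:

// Assumed behaviour of getCorrespondingRegionId (not copied from the source tree):
// map index i in [0, nIndexes) onto one of nRegions equally sized regions.
inline int regionIdFor(int i, int nIndexes, int nRegions)
{
  int perRegion = (nIndexes + nRegions - 1) / nRegions; // ceiling division
  return i / perRegion;
}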
Not to be measured, in principle used for report - void readingInit(); - void readingFinalize(); + void readInit(); + void readFinalize(); - // Benchmark kernel callbacks - void readingSequential(SplitLevel sl); - void readingConcurrent(SplitLevel sl, int nRegions = 2); + void writeInit(); + void writeFinalize(); + + // Kernel calling wrappers + void readSequential(SplitLevel sl); + void readConcurrent(SplitLevel sl, int nRegions = 2); + + void writeSequential(SplitLevel sl); + void writeConcurrent(SplitLevel sl, int nRegions = 2); private: gpuState mState; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 931ffbfc1fc75..c8f1679fbbbd8 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -97,8 +97,12 @@ struct gpuState { std::vector> gpuBuffersHost; // Host-based vector-ized data // Test-specific containers - T* deviceReadingResultsPtr; // Results of the reading test (single variable) on GPU - std::vector hostReadingResultsVector; // Results of the reading test (single variable) on host + T* deviceReadResultsPtr; // Results of the read test (single variable) on GPU + std::vector hostReadResultsVector; // Results of the read test (single variable) on host + + // Test-specific containers + T* deviceWriteResultsPtr; // Results of the write test (single variable) on GPU + std::vector hostWriteResultsVector; // Results of the write test (single variable) on host // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 5149245789e61..e79bb1670d31d 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -18,7 +18,7 @@ #endif #include -// Memory partition legend +// Memory partitioning legend // // |----------------------region 0-----------------|----------------------region 1-----------------| regions -> deafult: 2, to test lower and upper RAM // |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (single block pins) @@ -356,15 +356,15 @@ void GPUbenchmark::globalInit(const int deviceId) } template -void GPUbenchmark::readingInit() +void GPUbenchmark::readInit() { std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; - mState.hostReadingResultsVector.resize(mState.getMaxChunks()); - GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadingResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); + mState.hostReadResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); } template -void GPUbenchmark::readingSequential(SplitLevel sl) +void GPUbenchmark::readSequential(SplitLevel sl) { switch (sl) { case SplitLevel::Blocks: { @@ -380,11 +380,11 @@ void GPUbenchmark::readingSequential(SplitLevel sl) nBlocks, nThreads, iChunk, - mState.deviceReadingResultsPtr, + mState.deviceReadResultsPtr, mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitBlocks", std::to_string(iChunk), getType(), result); + mStreamer.get()->storeBenchmarkEntry("seq_R_SB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -404,11 +404,11 @@ void GPUbenchmark::readingSequential(SplitLevel sl) nBlocks, nThreads, iChunk, - mState.deviceReadingResultsPtr, + mState.deviceReadResultsPtr, mState.scratchPtr, 
capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("readSequentialSplitThreads", std::to_string(iChunk), getType(), result); + mStreamer.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -418,7 +418,7 @@ void GPUbenchmark::readingSequential(SplitLevel sl) } template -void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) +void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) { switch (sl) { case SplitLevel::Blocks: { @@ -434,7 +434,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) mState.getNKernelLaunches(), nBlocks, nThreads, - mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) mState.scratchPtr, capacity, mState.chunkReservedGB); @@ -459,7 +459,7 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) mState.getNKernelLaunches(), nBlocks, nThreads, - mState.deviceReadingResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) mState.scratchPtr, capacity, mState.chunkReservedGB); @@ -475,10 +475,137 @@ void GPUbenchmark::readingConcurrent(SplitLevel sl, int nRegions) } template -void GPUbenchmark::readingFinalize() +void GPUbenchmark::readFinalize() { - GPUCHECK(cudaMemcpy(mState.hostReadingResultsVector.data(), mState.deviceReadingResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); - GPUCHECK(cudaFree(mState.deviceReadingResultsPtr)); + GPUCHECK(cudaMemcpy(mState.hostReadResultsVector.data(), mState.deviceReadResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceReadResultsPtr)); +} + +/// Write +template +void GPUbenchmark::writeInit() +{ + std::cout << ">>> Initializing write benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostWriteResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceWriteResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::writeSequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for 
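The two SplitLevel flavours differ only in how the launch grid is mapped onto a chunk: the *SB kernels let a single block (the one whose blockIdx.x equals chunkId) stride over the chunk with its threads, while the *MB kernels walk one chunk with a grid-stride loop over all blocks. Stripped of the scratch-pointer arithmetic, the two access patterns reduce to the following sketch (chunk pointer passed directly, which the real kernels derive from getPartPtrOnScratch):

// Condensed illustration of the two SplitLevel access patterns.
template <class chunk_type>
__global__ void touchChunkSingleBlock(int chunkId, chunk_type* chunk, size_t chunkSize)
{
  if (chunkId == blockIdx.x) { // only the block assigned to this chunk does any work
    for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) {
      chunk[i] = 1;
    }
  }
}

template <class chunk_type>
__global__ void touchChunkMultiBlock(chunk_type* chunk, size_t chunkSize)
{
  for (size_t i = size_t(blockIdx.x) * blockDim.x + threadIdx.x; i < chunkSize;
       i += size_t(blockDim.x) * gridDim.x) { // classic grid-stride loop over one chunk
    chunk[i] = 1;
  }
}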
(auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << ">>> Sequential write, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << ">>> Concurrent write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << ">>> Concurrent write, split on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); + mStreamer.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostWriteResultsVector.data(), mState.deviceWriteResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceWriteResultsPtr)); } template @@ -492,15 +619,25 @@ void GPUbenchmark::run() { globalInit(0); // Test calls go here: - readingInit(); + readInit(); // - Reading whole memory - readingSequential(SplitLevel::Blocks); - readingSequential(SplitLevel::Threads); - - // - Reading memory partitions - readingConcurrent(SplitLevel::Blocks); - 
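Every sequential case delegates the timing to benchmarkSync, whose body is outside these hunks; conceptually it wraps nLaunches consecutive launches of one kernel on the default stream in a single CUDA event pair and returns the elapsed milliseconds. A sketch under that assumption, with error checking omitted:

#include <cuda_runtime.h>

// Illustrative stand-in for benchmarkSync: one event pair around nLaunches launches.
template <typename... T>
float timeSync(void (*kernel)(T...), int nLaunches, int blocks, int threads, T&... args)
{
  cudaEvent_t start, stop;
  float elapsedMs = 0.f;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  for (int iLaunch = 0; iLaunch < nLaunches; ++iLaunch) {
    (*kernel)<<<blocks, threads>>>(args...);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsedMs, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return elapsedMs;
}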
readingConcurrent(SplitLevel::Threads); - readingFinalize(); + readSequential(SplitLevel::Blocks); + readSequential(SplitLevel::Threads); + + // - Reading memory regions + readConcurrent(SplitLevel::Blocks); + readConcurrent(SplitLevel::Threads); + readFinalize(); + + writeInit(); + // - Write on whole memory + writeSequential(SplitLevel::Blocks); + writeSequential(SplitLevel::Threads); + + // - Write memory regions + writeConcurrent(SplitLevel::Blocks); + writeConcurrent(SplitLevel::Threads); + writeFinalize(); GPUbenchmark::globalFinalize(); } From 901892248bb6a2d8c384a1845388054937233014 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Sat, 10 Jul 2021 13:40:24 +0200 Subject: [PATCH 37/42] Add copy benchmark --- GPU/GPUbenchmark/Shared/Kernels.h | 6 + GPU/GPUbenchmark/Shared/Utils.h | 14 +- GPU/GPUbenchmark/cuda/Kernels.cu | 211 +++++++++++++++++++++++++++--- 3 files changed, 205 insertions(+), 26 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index b39dc2f3a836a..89a5086bc5bb3 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -63,6 +63,9 @@ class GPUbenchmark final void writeInit(); void writeFinalize(); + void copyInit(); + void copyFinalize(); + // Kernel calling wrappers void readSequential(SplitLevel sl); void readConcurrent(SplitLevel sl, int nRegions = 2); @@ -70,6 +73,9 @@ class GPUbenchmark final void writeSequential(SplitLevel sl); void writeConcurrent(SplitLevel sl, int nRegions = 2); + void copySequential(SplitLevel sl); + void copyConcurrent(SplitLevel sl, int nRegions = 2); + private: gpuState mState; std::shared_ptr mStreamer; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index c8f1679fbbbd8..991e078e63888 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -95,14 +95,12 @@ struct gpuState { size_t scratchSize; // Size of scratch area (B) std::vector partAddrOnHost; // Pointers to scratch partitions on host vector std::vector> gpuBuffersHost; // Host-based vector-ized data - - // Test-specific containers - T* deviceReadResultsPtr; // Results of the read test (single variable) on GPU - std::vector hostReadResultsVector; // Results of the read test (single variable) on host - - // Test-specific containers - T* deviceWriteResultsPtr; // Results of the write test (single variable) on GPU - std::vector hostWriteResultsVector; // Results of the write test (single variable) on host + T* deviceReadResultsPtr; // Results of the read test (single variable) on GPU + std::vector hostReadResultsVector; // Results of the read test (single variable) on host + T* deviceWriteResultsPtr; // Results of the write test (single variable) on GPU + std::vector hostWriteResultsVector; // Results of the write test (single variable) on host + T* deviceCopyInputsPtr; // Inputs of the copy test (single variable) on GPU + std::vector hostCopyInputsVector; // Inputs of the copy test (single variable) on host // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e79bb1670d31d..05f17eadc9c0a 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -21,7 +21,7 @@ // Memory partitioning legend // // |----------------------region 0-----------------|----------------------region 1-----------------| regions -> deafult: 2, to test lower and upper RAM -// |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (single block pins) 
+// |--chunk 0--|--chunk 1--|--chunk 2--| *** |--chunk n--| chunks -> default size: 1GB (sing block pins) // |__________________________________________scratch______________________________________________| scratch -> default size: 95% free GPU RAM #define GPUCHECK(error) \ @@ -74,6 +74,7 @@ GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReser ////////////////// // Kernels go here +// Reading template GPUg() void readChunkSBKernel( int chunkId, @@ -83,10 +84,13 @@ GPUg() void readChunkSBKernel( float chunkReservedGB = 1.f) { if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + chunk_type sink{0}; + chunk_type* ptr = getPartPtrOnScratch(scratch, chunkReservedGB, chunkId); for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { - if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here - results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync - } + sink += ptr[i]; + } + if (sink == static_cast(1)) { + results[chunkId] = sink; } } } @@ -135,6 +139,35 @@ GPUg() void writeChunkMBKernel( } } +// Copying +template +GPUg() void copyChunkSBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } + } +} + +template +GPUg() void copyChunkMBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } +} + } // namespace gpu void printDeviceProp(int deviceId) @@ -355,6 +388,7 @@ void GPUbenchmark::globalInit(const int deviceId) << std::endl; } +/// Read template void GPUbenchmark::readInit() { @@ -373,7 +407,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkSBKernel, mState.getNKernelLaunches(), @@ -397,7 +431,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential read, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::readChunkMBKernel, mState.getNKernelLaunches(), @@ -428,7 +462,7 
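The rewritten readChunkSBKernel accumulates into a register ("sink") and writes it back only under a data-dependent condition, instead of conditionally updating results[chunkId] once per element as before: the loads still cannot be optimised away, but the measured time is no longer polluted by per-element global stores. The essence of the pattern, with the chunk pointer taken as given:

// Read-bandwidth pattern: accumulate into a register, write back only on a
// condition that is practically never true, so the loads survive optimisation.
template <class chunk_type>
__global__ void readChunkSketch(chunk_type* chunkPtr, chunk_type* result, size_t chunkSize)
{
  chunk_type sink{0};
  for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) {
    sink += chunkPtr[i];
  }
  if (sink == static_cast<chunk_type>(1)) { // data-dependent guard
    *result = sink;
  }
}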
@@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent read, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkSBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -453,7 +487,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent read, split on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::readChunkMBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -479,6 +513,7 @@ void GPUbenchmark::readFinalize() { GPUCHECK(cudaMemcpy(mState.hostReadResultsVector.data(), mState.deviceReadResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); GPUCHECK(cudaFree(mState.deviceReadResultsPtr)); + std::cout << " └ done." << std::endl; } /// Write @@ -500,7 +535,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::writeChunkSBKernel, mState.getNKernelLaunches(), @@ -524,7 +559,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement - std::cout << std::setw(2) << ">>> Sequential write, splitting on threads (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << std::setw(2) << " ├ Sequential write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately auto result = benchmarkSync(&gpu::writeChunkMBKernel, mState.getNKernelLaunches(), @@ -555,7 +590,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent write, one block per chunk (" << measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::writeChunkSBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -580,7 +615,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) auto capacity{mState.getPartitionCapacity()}; for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { - std::cout << ">>> Concurrent write, split on threads (" 
<< measurement + 1 << "/" << mOptions.nTests << "):"; + std::cout << " ├ Concurrent write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; auto results = benchmarkAsync(&gpu::writeChunkMBKernel, mState.getMaxChunks(), // nStreams mState.getNKernelLaunches(), @@ -606,6 +641,136 @@ void GPUbenchmark::writeFinalize() { GPUCHECK(cudaMemcpy(mState.hostWriteResultsVector.data(), mState.deviceWriteResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); GPUCHECK(cudaFree(mState.deviceWriteResultsPtr)); + std::cout << " └ done." << std::endl; +} + +/// Copy +template +void GPUbenchmark::copyInit() +{ + std::cout << ">>> Initializing copy benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostCopyInputsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceCopyInputsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); + GPUCHECK(cudaMemset(mState.deviceCopyInputsPtr, 1, mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::copySequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mStreamer.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, sing block 
(" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); + mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); + mStreamer.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]); + } + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostCopyInputsVector.data(), mState.deviceCopyInputsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceCopyInputsPtr)); + std::cout << " └ done." 
<< std::endl; } template @@ -618,34 +783,44 @@ template void GPUbenchmark::run() { globalInit(0); - // Test calls go here: + readInit(); - // - Reading whole memory + // Reading in whole memory readSequential(SplitLevel::Blocks); readSequential(SplitLevel::Threads); - // - Reading memory regions + // Reading in memory regions readConcurrent(SplitLevel::Blocks); readConcurrent(SplitLevel::Threads); readFinalize(); writeInit(); - // - Write on whole memory + // Write on whole memory writeSequential(SplitLevel::Blocks); writeSequential(SplitLevel::Threads); - // - Write memory regions + // Write on memory regions writeConcurrent(SplitLevel::Blocks); writeConcurrent(SplitLevel::Threads); writeFinalize(); + copyInit(); + // Copy from input buffer (size = nChunks) on whole memory + copySequential(SplitLevel::Blocks); + copySequential(SplitLevel::Threads); + + // Copy from input buffer (size = nChunks) on memory regions + copyConcurrent(SplitLevel::Blocks); + copyConcurrent(SplitLevel::Threads); + copyFinalize(); + GPUbenchmark::globalFinalize(); } template class GPUbenchmark; -// template class GPUbenchmark; template class GPUbenchmark; template class GPUbenchmark; +// template class GPUbenchmark; } // namespace benchmark } // namespace o2 \ No newline at end of file From f31f7741b60242340be1500a3df207a02b6b3deb Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 12 Jul 2021 10:24:01 +0200 Subject: [PATCH 38/42] Remove CommonUtils dependency --- GPU/GPUbenchmark/CMakeLists.txt | 2 +- GPU/GPUbenchmark/Shared/Utils.h | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 9151acc8bc478..df0b4e4e47263 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -50,7 +50,7 @@ if(HIP_ENABLED) PUBLIC_LINK_LIBRARIES O2::GPUCommon hip::host Boost::program_options - O2::CommonUtils + ROOT::Tree TARGETVARNAME targetName) if(HIP_AMDGPUTARGET) diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 991e078e63888..cf46c61bd2c55 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -19,7 +19,7 @@ #include #include #include -#include "CommonUtils/TreeStreamRedirector.h" +#include #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -112,41 +112,41 @@ struct gpuState { class ResultStreamer { public: - explicit ResultStreamer(const std::string debugTreeFileName = "benchmark_results.root"); + explicit ResultStreamer(const std::string resultsTreeFilename = "benchmark_results.root"); ~ResultStreamer(); void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); private: - std::string mDebugTreeFileName = "benchmark_results.root"; // output filename - o2::utils::TreeStreamRedirector* mTreeStream; // observer + std::string mResultsTreeFilename = "benchmark_results.root"; // output filename + TTree* mTree; // observer }; -inline ResultStreamer::ResultStreamer(const std::string debugTreeFileName) +inline ResultStreamer::ResultStreamer(const std::string resultsTreeFilename) { - mDebugTreeFileName = debugTreeFileName; - mTreeStream = new o2::utils::TreeStreamRedirector(debugTreeFileName.data(), "recreate"); + mResultsTreeFilename = resultsTreeFilename; + mTree = new TTree(resultsTreeFilename.data(), resultsTreeFilename.data()); } inline ResultStreamer::~ResultStreamer() { - 
delete mTreeStream; + delete mTree; } inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) { - (*mTreeStream) - << (benchmarkName + "_" + type + "_" + chunk).data() - << "elapsed=" << entry - << "\n"; + // (*mTree) + // << (benchmarkName + "_" + type + "_" + chunk).data() + // << "elapsed=" << entry + // << "\n"; } inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) { - (*mTreeStream) - << (benchmarkName + "_" + type + "_region_" + region).data() - << "elapsed=" << entry - << "\n"; + // (*mTree) + // << (benchmarkName + "_" + type + "_region_" + region).data() + // << "elapsed=" << entry + // << "\n"; } } // namespace benchmark From e330a7ebb7c2d1b3080410cb9ae8f17258440492 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 12 Jul 2021 12:35:06 +0200 Subject: [PATCH 39/42] Remove GPUCommon dependency --- GPU/GPUbenchmark/CMakeLists.txt | 8 +++----- GPU/GPUbenchmark/Shared/Kernels.h | 1 - GPU/GPUbenchmark/cuda/Kernels.cu | 14 +++++++------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index df0b4e4e47263..4e9f9be10618b 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -16,9 +16,8 @@ if(CUDA_ENABLED) o2_add_executable(gpu-memory-benchmark-cuda SOURCES benchmark.cxx cuda/Kernels.cu - PUBLIC_LINK_LIBRARIES O2::GPUCommon - Boost::program_options - O2::CommonUtils + PUBLIC_LINK_LIBRARIES Boost::program_options + ROOT::Tree TARGETVARNAME targetName) endif() @@ -47,8 +46,7 @@ if(HIP_ENABLED) o2_add_executable(gpu-memory-benchmark-hip SOURCES benchmark.cxx hip/Kernels.hip.cxx - PUBLIC_LINK_LIBRARIES O2::GPUCommon - hip::host + PUBLIC_LINK_LIBRARIES hip::host Boost::program_options ROOT::Tree TARGETVARNAME targetName) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 89a5086bc5bb3..6218a18015b0d 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -15,7 +15,6 @@ #ifndef GPU_BENCHMARK_KERNELS_H #define GPU_BENCHMARK_KERNELS_H -#include "GPUCommonDef.h" #include "Utils.h" #include #include diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 05f17eadc9c0a..12437dc0f5383 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -67,7 +67,7 @@ namespace gpu /////////////////////////// // Device functions go here template -GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber) +__host__ __device__ inline chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber) { return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); } @@ -76,7 +76,7 @@ GPUhd() chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReser // Kernels go here // Reading template -GPUg() void readChunkSBKernel( +__global__ void readChunkSBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -96,7 +96,7 @@ GPUg() void readChunkSBKernel( } template -GPUg() void readChunkMBKernel( +__global__ void readChunkMBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -112,7 +112,7 @@ GPUg() void readChunkMBKernel( // Writing template -GPUg() void writeChunkSBKernel( +__global__ void writeChunkSBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -127,7 +127,7 
@@ GPUg() void writeChunkSBKernel( } template -GPUg() void writeChunkMBKernel( +__global__ void writeChunkMBKernel( int chunkId, chunk_type* results, chunk_type* scratch, @@ -141,7 +141,7 @@ GPUg() void writeChunkMBKernel( // Copying template -GPUg() void copyChunkSBKernel( +__global__ void copyChunkSBKernel( int chunkId, chunk_type* inputs, chunk_type* scratch, @@ -156,7 +156,7 @@ GPUg() void copyChunkSBKernel( } template -GPUg() void copyChunkMBKernel( +__global__ void copyChunkMBKernel( int chunkId, chunk_type* inputs, chunk_type* scratch, From 1480959537ffdd822c0af4f05ae65c744b934bad Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 12 Jul 2021 19:59:25 +0200 Subject: [PATCH 40/42] Fix fullCI errors --- GPU/GPUbenchmark/CMakeLists.txt | 5 +++-- GPU/GPUbenchmark/Shared/Utils.h | 7 +------ GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 4e9f9be10618b..8a93dcec41101 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -32,8 +32,9 @@ if(HIP_ENABLED) if(EXISTS ${HIPIFY_EXECUTABLE}) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") - execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} ${CU_KERNEL} > ${HIP_KERNEL_PATH}") - elseif() + execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}") +# sed '1{/\#include \"hip\/hip_runtime.h\"/d}' + elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index cf46c61bd2c55..319ac6a156539 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -113,7 +113,7 @@ class ResultStreamer { public: explicit ResultStreamer(const std::string resultsTreeFilename = "benchmark_results.root"); - ~ResultStreamer(); + ~ResultStreamer() = default; void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); @@ -128,11 +128,6 @@ inline ResultStreamer::ResultStreamer(const std::string resultsTreeFilename) mTree = new TTree(resultsTreeFilename.data(), resultsTreeFilename.data()); } -inline ResultStreamer::~ResultStreamer() -{ - delete mTree; -} - inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) { // (*mTree) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 12437dc0f5383..e43b9dc1ab792 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -16,7 +16,7 @@ #if defined(__HIPCC__) #include "hip/hip_runtime.h" #endif -#include +#include // Memory partitioning legend // From 5383afc00e0ad1308b7c4fc419160d8dc3574771 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 13 Jul 2021 19:56:14 +0200 Subject: [PATCH 41/42] Revise result saving --- GPU/GPUbenchmark/Shared/Kernels.h | 4 +- GPU/GPUbenchmark/Shared/Utils.h | 51 +++++++++++++++------ GPU/GPUbenchmark/benchmark.cxx | 12 ++--- GPU/GPUbenchmark/cuda/Kernels.cu | 75 ++++++++++++++++--------------- 4 files changed, 84 insertions(+), 58 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 
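getPartPtrOnScratch keeps its pointer arithmetic across the qualifier change: it offsets the scratch base pointer by partNumber whole chunks of chunkReservedGB gigabytes each. Spelled out with explicit cast targets (the cast targets and the value of GB are assumptions here, following the conventions of Utils.h), the helper reads roughly as:

constexpr double GB = 1024. * 1024. * 1024.; // assumed value of the GB constant from Utils.h

template <class chunk_type>
__host__ __device__ inline chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber)
{
  // Jump partNumber chunks of chunkReservedGB gigabytes each into the scratch buffer.
  return reinterpret_cast<chunk_type*>(reinterpret_cast<char*>(scratchPtr) +
                                       static_cast<size_t>(GB * chunkReservedGB) * partNumber);
}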
6218a18015b0d..a4e7f71440347 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -32,7 +32,7 @@ class GPUbenchmark final { public: GPUbenchmark() = delete; // need for a configuration - GPUbenchmark(benchmarkOpts& opts, std::shared_ptr streamer) : mStreamer{streamer}, mOptions{opts} + GPUbenchmark(benchmarkOpts& opts, std::shared_ptr rWriter) : mResultWriter{rWriter}, mOptions{opts} { } virtual ~GPUbenchmark() = default; @@ -77,7 +77,7 @@ class GPUbenchmark final private: gpuState mState; - std::shared_ptr mStreamer; + std::shared_ptr mResultWriter; benchmarkOpts mOptions; }; diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 319ac6a156539..e5ae595883c74 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #define KNRM "\x1B[0m" #define KRED "\x1B[31m" @@ -109,34 +111,56 @@ struct gpuState { }; // Interface class to stream results to root file -class ResultStreamer +class ResultWriter { public: - explicit ResultStreamer(const std::string resultsTreeFilename = "benchmark_results.root"); - ~ResultStreamer() = default; - void storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry); + explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root"); + ~ResultWriter() = default; + void storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry); void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry); + void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks); + void snapshotBenchmark(const std::string bName, const std::string type); + void saveToFile(std::string filename = "benchmark_results.root"); private: std::string mResultsTreeFilename = "benchmark_results.root"; // output filename - TTree* mTree; // observer + // std::unordered_map> mBenchmarksChunk; + // std::unordered_map> mBenchmarksRegions; + std::vector mBenchmarkResults; + TBranch* mTmpBranch; + TTree* mTree; }; -inline ResultStreamer::ResultStreamer(const std::string resultsTreeFilename) +inline ResultWriter::ResultWriter(const std::string resultsTreeFilename) { mResultsTreeFilename = resultsTreeFilename; - mTree = new TTree(resultsTreeFilename.data(), resultsTreeFilename.data()); + mTree = new TTree("GPUbenchmarks", "GPUbenchmarks"); } -inline void ResultStreamer::storeBenchmarkEntry(std::string benchmarkName, std::string chunk, std::string type, float entry) +inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks) { - // (*mTree) - // << (benchmarkName + "_" + type + "_" + chunk).data() - // << "elapsed=" << entry - // << "\n"; + mTmpBranch = mTree->Branch((bName + "_" + type).data(), &mBenchmarkResults); + mBenchmarkResults.resize(nChunks); } -inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) +inline void ResultWriter::storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry) +{ + mBenchmarkResults[chunk] = entry; +} + +inline void ResultWriter::snapshotBenchmark(const std::string bName, const std::string type) +{ + mTree->Fill(); +} + +inline void ResultWriter::saveToFile(std::string filename) +{ + auto file = TFile::Open(filename.data(), "recreate"); + mTree->Write(); + file->Close(); +} + +inline void 
ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry) { // (*mTree) // << (benchmarkName + "_" + type + "_region_" + region).data() @@ -153,5 +177,4 @@ inline void ResultStreamer::storeEntryForRegion(std::string benchmarkName, std:: printf("\n"); \ printf("error: TEST FAILED\n%s", KNRM); \ exit(EXIT_FAILURE); - #endif \ No newline at end of file diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx index 6b1d5338db5c1..83acae586d735 100644 --- a/GPU/GPUbenchmark/benchmark.cxx +++ b/GPU/GPUbenchmark/benchmark.cxx @@ -51,7 +51,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) return true; } -using o2::benchmark::ResultStreamer; +using o2::benchmark::ResultWriter; int main(int argc, const char* argv[]) { @@ -62,12 +62,12 @@ int main(int argc, const char* argv[]) return -1; } - std::shared_ptr streamer = std::make_shared(); + std::shared_ptr streamer = std::make_shared(); - o2::benchmark::GPUbenchmark bm_char{opts, streamer}; - bm_char.run(); - o2::benchmark::GPUbenchmark bm_int{opts, streamer}; - bm_int.run(); + // o2::benchmark::GPUbenchmark bm_char{opts, streamer}; + // bm_char.run(); + // o2::benchmark::GPUbenchmark bm_int{opts, streamer}; + // bm_int.run(); o2::benchmark::GPUbenchmark bm_size_t{opts, streamer}; bm_size_t.run(); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e43b9dc1ab792..516e3d5327d4f 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -46,7 +46,7 @@ std::string getType() return std::string{"char"}; } if (typeid(T).name() == typeid(size_t).name()) { - return std::string{"unsigned long"}; + return std::string{"unsigned_long"}; } if (typeid(T).name() == typeid(int).name()) { return std::string{"int"}; @@ -402,6 +402,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) { switch (sl) { case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_R_SB", getType(), mState.getMaxChunks()); auto nBlocks{mState.nMultiprocessors}; auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; auto capacity{mState.getPartitionCapacity()}; @@ -418,10 +419,12 @@ void GPUbenchmark::readSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_R_SB", std::to_string(iChunk), getType(), result); + mResultWriter.get()->storeBenchmarkEntry("seq_R_SB", getType(), iChunk, result); } + mResultWriter.get()->snapshotBenchmark("seq_R_SB", getType()); std::cout << "\033[1;32m complete\033[0m" << std::endl; } + mResultWriter->saveToFile(); break; } @@ -442,7 +445,7 @@ void GPUbenchmark::readSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -474,7 +477,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); - mStreamer.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m 
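ResultWriter keeps one std::vector<float> branch per benchmark/type pair and one tree entry per repetition of the measurement; the intended call sequence from a benchmark body, mirroring the seq_R_SB case that is already wired up, is roughly as follows (include path and dummy values are illustrative):

#include "Utils.h" // Shared/Utils.h, where ResultWriter is defined
#include <memory>

// Sketch of the ResultWriter call sequence for one benchmark and one data type.
void writeDummyResults(int nChunks, int nTests)
{
  auto writer = std::make_shared<o2::benchmark::ResultWriter>("benchmark_results.root");
  writer->addBenchmarkEntry("seq_R_SB", "char", nChunks);           // one branch, one float slot per chunk
  for (int measurement = 0; measurement < nTests; ++measurement) {
    for (int iChunk = 0; iChunk < nChunks; ++iChunk) {
      writer->storeBenchmarkEntry("seq_R_SB", "char", iChunk, 1.f); // would be the measured elapsed ms
    }
    writer->snapshotBenchmark("seq_R_SB", "char");                  // TTree::Fill(): one entry per repetition
  }
  writer->saveToFile("benchmark_results.root");                     // TFile "recreate" + TTree::Write()
}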
complete\033[0m" << std::endl; } @@ -499,7 +502,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); - mStreamer.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -546,7 +549,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -570,7 +573,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -602,7 +605,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); - mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -627,7 +630,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); - mStreamer.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -675,7 +678,7 @@ void GPUbenchmark::copySequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -699,7 +702,7 @@ void GPUbenchmark::copySequential(SplitLevel sl) mState.scratchPtr, capacity, mState.chunkReservedGB); - mStreamer.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result); + // mResultWriter.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -731,7 +734,7 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions); - mStreamer.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_W_SB", 
std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -756,7 +759,7 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) mState.chunkReservedGB); for (auto iResult{0}; iResult < results.size(); ++iResult) { auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); - mStreamer.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]); + // mResultWriter.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]); } std::cout << "\033[1;32m complete\033[0m" << std::endl; } @@ -787,32 +790,32 @@ void GPUbenchmark::run() readInit(); // Reading in whole memory readSequential(SplitLevel::Blocks); - readSequential(SplitLevel::Threads); + // readSequential(SplitLevel::Threads); - // Reading in memory regions - readConcurrent(SplitLevel::Blocks); - readConcurrent(SplitLevel::Threads); + // // Reading in memory regions + // readConcurrent(SplitLevel::Blocks); + // readConcurrent(SplitLevel::Threads); readFinalize(); - writeInit(); - // Write on whole memory - writeSequential(SplitLevel::Blocks); - writeSequential(SplitLevel::Threads); - - // Write on memory regions - writeConcurrent(SplitLevel::Blocks); - writeConcurrent(SplitLevel::Threads); - writeFinalize(); - - copyInit(); - // Copy from input buffer (size = nChunks) on whole memory - copySequential(SplitLevel::Blocks); - copySequential(SplitLevel::Threads); - - // Copy from input buffer (size = nChunks) on memory regions - copyConcurrent(SplitLevel::Blocks); - copyConcurrent(SplitLevel::Threads); - copyFinalize(); + // writeInit(); + // // Write on whole memory + // writeSequential(SplitLevel::Blocks); + // writeSequential(SplitLevel::Threads); + + // // Write on memory regions + // writeConcurrent(SplitLevel::Blocks); + // writeConcurrent(SplitLevel::Threads); + // writeFinalize(); + + // copyInit(); + // // Copy from input buffer (size = nChunks) on whole memory + // copySequential(SplitLevel::Blocks); + // copySequential(SplitLevel::Threads); + + // // Copy from input buffer (size = nChunks) on memory regions + // copyConcurrent(SplitLevel::Blocks); + // copyConcurrent(SplitLevel::Threads); + // copyFinalize(); GPUbenchmark::globalFinalize(); } From cbef6f8da07b607e8c29890d34ad3566bace5044 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 14 Jul 2021 00:35:58 +0200 Subject: [PATCH 42/42] Ready to test on EPN --- GPU/GPUbenchmark/CMakeLists.txt | 1 - GPU/GPUbenchmark/Shared/Utils.h | 36 +++++------ GPU/GPUbenchmark/benchmark.cxx | 15 +++-- GPU/GPUbenchmark/cuda/Kernels.cu | 102 ++++++++++++++++++------------- 4 files changed, 86 insertions(+), 68 deletions(-) diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt index 8a93dcec41101..e008ab4cc0f41 100644 --- a/GPU/GPUbenchmark/CMakeLists.txt +++ b/GPU/GPUbenchmark/CMakeLists.txt @@ -33,7 +33,6 @@ if(HIP_ENABLED) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL}) message("Generating HIP kernel code ...") execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}") -# sed '1{/\#include \"hip\/hip_runtime.h\"/d}' elseif() message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...") endif() diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index e5ae595883c74..6d3400aa9a6ec 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ 
@@ -116,48 +116,48 @@ class ResultWriter
  public:
   explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root");
   ~ResultWriter() = default;
-  void storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry);
+  void storeBenchmarkEntry(int chunk, float entry);
   void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry);
   void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks);
-  void snapshotBenchmark(const std::string bName, const std::string type);
-  void saveToFile(std::string filename = "benchmark_results.root");
+  void snapshotBenchmark();
+  void saveToFile();
 
  private:
-  std::string mResultsTreeFilename = "benchmark_results.root"; // output filename
-  // std::unordered_map> mBenchmarksChunk;
-  // std::unordered_map> mBenchmarksRegions;
   std::vector<float> mBenchmarkResults;
-  TBranch* mTmpBranch;
-  TTree* mTree;
+  std::vector<TTree*> mBenchmarkTrees;
+  TFile* mOutfile;
 };
 
 inline ResultWriter::ResultWriter(const std::string resultsTreeFilename)
 {
-  mResultsTreeFilename = resultsTreeFilename;
-  mTree = new TTree("GPUbenchmarks", "GPUbenchmarks");
+  mOutfile = TFile::Open(resultsTreeFilename.data(), "recreate");
 }
 
 inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks)
 {
-  mTmpBranch = mTree->Branch((bName + "_" + type).data(), &mBenchmarkResults);
+  mBenchmarkTrees.emplace_back(new TTree((bName + "_" + type).data(), (bName + "_" + type).data()));
+  mBenchmarkResults.clear();
   mBenchmarkResults.resize(nChunks);
+  mBenchmarkTrees.back()->Branch("elapsed", &mBenchmarkResults);
 }
 
-inline void ResultWriter::storeBenchmarkEntry(const std::string bName, const std::string type, int chunk, float entry)
+inline void ResultWriter::storeBenchmarkEntry(int chunk, float entry)
 {
   mBenchmarkResults[chunk] = entry;
 }
 
-inline void ResultWriter::snapshotBenchmark(const std::string bName, const std::string type)
+inline void ResultWriter::snapshotBenchmark()
 {
-  mTree->Fill();
+  mBenchmarkTrees.back()->Fill();
 }
 
-inline void ResultWriter::saveToFile(std::string filename)
+inline void ResultWriter::saveToFile()
 {
-  auto file = TFile::Open(filename.data(), "recreate");
-  mTree->Write();
-  file->Close();
+  mOutfile->cd();
+  for (auto t : mBenchmarkTrees) {
+    t->Write();
+  }
+  mOutfile->Close();
 }
 
 inline void ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry)
diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx
index 83acae586d735..7ee638594f9e3 100644
--- a/GPU/GPUbenchmark/benchmark.cxx
+++ b/GPU/GPUbenchmark/benchmark.cxx
@@ -62,14 +62,17 @@ int main(int argc, const char* argv[])
     return -1;
   }
 
-  std::shared_ptr<ResultWriter> streamer = std::make_shared<ResultWriter>();
+  std::shared_ptr<ResultWriter> writer = std::make_shared<ResultWriter>();
 
-  o2::benchmark::GPUbenchmark<char> bm_char{opts, streamer};
-  bm_char.run();
-  o2::benchmark::GPUbenchmark<int> bm_int{opts, streamer};
-  bm_int.run();
-  o2::benchmark::GPUbenchmark<size_t> bm_size_t{opts, streamer};
+  o2::benchmark::GPUbenchmark<char> bm_char{opts, writer};
+  bm_char.run();
+  o2::benchmark::GPUbenchmark<int> bm_int{opts, writer};
+  bm_int.run();
+  o2::benchmark::GPUbenchmark<size_t> bm_size_t{opts, writer};
   bm_size_t.run();
 
+  // save results
+  writer.get()->saveToFile();
+
   return 0;
 }
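The Utils.h and benchmark.cxx hunks above define the whole contract of the refactored ResultWriter: one TTree per benchmark (named "<benchmark>_<type>"), a single "elapsed" branch holding the per-chunk vector, one Fill() per snapshot, and a single saveToFile() at the end of the run. The following is a minimal standalone sketch of that call sequence, not part of the patch; the include path, the "seq_read_SB" name and the fakeMeasurement() helper are assumptions for illustration, standing in for the timed kernel launches done in Kernels.cu below.

```cpp
#include <memory>
#include "Shared/Utils.h" // provides o2::benchmark::ResultWriter (include path assumed)

// Hypothetical stand-in for one timed kernel launch on a given chunk.
float fakeMeasurement(int chunk) { return 0.1f * chunk; }

int main()
{
  // Same ownership pattern as benchmark.cxx: one shared writer for all benchmarks.
  auto writer = std::make_shared<o2::benchmark::ResultWriter>("benchmark_results.root");

  const int nChunks = 4;    // e.g. mState.getMaxChunks()
  const int nLaunches = 10; // assumed number of repeated launches per benchmark

  // Creates the tree "seq_read_SB_int" with a vector<float> branch "elapsed".
  writer->addBenchmarkEntry("seq_read_SB", "int", nChunks);

  for (int iLaunch = 0; iLaunch < nLaunches; ++iLaunch) {
    for (int iChunk = 0; iChunk < nChunks; ++iChunk) {
      writer->storeBenchmarkEntry(iChunk, fakeMeasurement(iChunk)); // fill the per-chunk slot
    }
    writer->snapshotBenchmark(); // one tree entry per launch
  }

  writer->saveToFile(); // writes every tree and closes the output file
  return 0;
}
```

Building the sketch requires ROOT (TFile/TTree) on the include and link paths, exactly as the GPUbenchmark target already does.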
diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 516e3d5327d4f..8af91423c12e5 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -402,7 +402,7 @@ void GPUbenchmark::readSequential(SplitLevel sl)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
-      mResultWriter.get()->addBenchmarkEntry("seq_R_SB", getType(), mState.getMaxChunks());
+      mResultWriter.get()->addBenchmarkEntry("seq_read_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto capacity{mState.getPartitionCapacity()};
@@ -419,16 +419,16 @@ void GPUbenchmark::readSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        mResultWriter.get()->storeBenchmarkEntry("seq_R_SB", getType(), iChunk, result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
-      mResultWriter.get()->snapshotBenchmark("seq_R_SB", getType());
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
-    mResultWriter->saveToFile();
 
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("seq_read_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto capacity{mState.getPartitionCapacity()};
@@ -445,8 +445,9 @@ void GPUbenchmark::readSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_R_MB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -459,6 +460,7 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("conc_read_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto chunks{mState.getMaxChunks()};
@@ -476,14 +478,15 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_R_SB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("conc_read_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto chunks{mState.getMaxChunks()};
@@ -501,9 +504,9 @@ void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_R_MB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -533,6 +536,7 @@ void GPUbenchmark::writeSequential(SplitLevel sl)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("seq_write_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto capacity{mState.getPartitionCapacity()};
@@ -549,14 +553,16 @@ void GPUbenchmark::writeSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_W_SB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("seq_write_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto capacity{mState.getPartitionCapacity()};
@@ -573,8 +579,9 @@ void GPUbenchmark::writeSequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_W_MB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -587,6 +594,7 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("conc_write_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto chunks{mState.getMaxChunks()};
@@ -604,14 +612,15 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("conc_write_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto chunks{mState.getMaxChunks()};
@@ -629,9 +638,9 @@ void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_W_MB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -662,6 +671,7 @@ void GPUbenchmark::copySequential(SplitLevel sl)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("seq_copy_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto capacity{mState.getPartitionCapacity()};
@@ -678,14 +688,16 @@ void GPUbenchmark::copySequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_C_SB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("seq_copy_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto capacity{mState.getPartitionCapacity()};
@@ -702,8 +714,9 @@ void GPUbenchmark::copySequential(SplitLevel sl)
                                              mState.scratchPtr,
                                              capacity,
                                              mState.chunkReservedGB);
-        // mResultWriter.get()->storeBenchmarkEntry("seq_C_MB", std::to_string(iChunk), getType(), result);
+        mResultWriter.get()->storeBenchmarkEntry(iChunk, result);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -716,6 +729,7 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions)
 {
   switch (sl) {
     case SplitLevel::Blocks: {
+      mResultWriter.get()->addBenchmarkEntry("conc_copy_SB", getType(), mState.getMaxChunks());
       auto nBlocks{mState.nMultiprocessors};
       auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
       auto chunks{mState.getMaxChunks()};
@@ -733,14 +747,15 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions)
                                              capacity,
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
-        auto region = getCorrespondingRegionId(iResult, mState.getMaxChunks(), nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_W_SB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
   }
   case SplitLevel::Threads: {
+    mResultWriter.get()->addBenchmarkEntry("conc_copy_MB", getType(), mState.getMaxChunks());
     auto nBlocks{mState.nMultiprocessors};
     auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)};
     auto chunks{mState.getMaxChunks()};
@@ -759,8 +774,9 @@ void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions)
                                              mState.chunkReservedGB);
       for (auto iResult{0}; iResult < results.size(); ++iResult) {
         auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions);
-        // mResultWriter.get()->storeEntryForRegion("conc_C_MB", std::to_string(region), getType(), results[iResult]);
+        mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]);
       }
+      mResultWriter.get()->snapshotBenchmark();
       std::cout << "\033[1;32m complete\033[0m" << std::endl;
     }
     break;
@@ -790,32 +806,32 @@ void GPUbenchmark::run()
   readInit();
   // Reading in whole memory
   readSequential(SplitLevel::Blocks);
-  // readSequential(SplitLevel::Threads);
+  readSequential(SplitLevel::Threads);
 
-  // // Reading in memory regions
-  // readConcurrent(SplitLevel::Blocks);
-  // readConcurrent(SplitLevel::Threads);
+  // Reading in memory regions
+  readConcurrent(SplitLevel::Blocks);
+  readConcurrent(SplitLevel::Threads);
   readFinalize();
 
-  // writeInit();
-  // // Write on whole memory
-  // writeSequential(SplitLevel::Blocks);
-  // writeSequential(SplitLevel::Threads);
-
-  // // Write on memory regions
-  // writeConcurrent(SplitLevel::Blocks);
-  // writeConcurrent(SplitLevel::Threads);
-  // writeFinalize();
-
-  // copyInit();
-  // // Copy from input buffer (size = nChunks) on whole memory
-  // copySequential(SplitLevel::Blocks);
-  // copySequential(SplitLevel::Threads);
-
-  // // Copy from input buffer (size = nChunks) on memory regions
-  // copyConcurrent(SplitLevel::Blocks);
-  // copyConcurrent(SplitLevel::Threads);
-  // copyFinalize();
+  writeInit();
+  // Write on whole memory
+  writeSequential(SplitLevel::Blocks);
+  writeSequential(SplitLevel::Threads);
+
+  // Write on memory regions
+  writeConcurrent(SplitLevel::Blocks);
+  writeConcurrent(SplitLevel::Threads);
+  writeFinalize();
+
+  copyInit();
+  // Copy from input buffer (size = nChunks) on whole memory
+  copySequential(SplitLevel::Blocks);
+  copySequential(SplitLevel::Threads);
+
+  // Copy from input buffer (size = nChunks) on memory regions
+  copyConcurrent(SplitLevel::Blocks);
+  copyConcurrent(SplitLevel::Threads);
+  copyFinalize();
 
   GPUbenchmark::globalFinalize();
 }
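After this last patch every benchmark variant writes one tree into benchmark_results.root, each with a single entry per repeated launch. A small read-back sketch such as the one below can be used to inspect the output; it is not part of the patch, and the file name, tree names (e.g. "seq_read_SB_int") and the "elapsed" branch are assumptions taken from the ResultWriter shown above.

```cpp
#include <TFile.h>
#include <TList.h>
#include <TTree.h>
#include <iostream>
#include <memory>
#include <vector>

int main()
{
  // Open the file produced by ResultWriter::saveToFile().
  std::unique_ptr<TFile> file{TFile::Open("benchmark_results.root", "read")};
  if (!file || file->IsZombie()) {
    std::cerr << "could not open benchmark_results.root\n";
    return 1;
  }

  // One TTree per benchmark, named "<benchmark>_<type>", e.g. "seq_read_SB_int".
  TIter nextKey(file->GetListOfKeys());
  while (auto key = nextKey()) {
    auto tree = dynamic_cast<TTree*>(file->Get(key->GetName()));
    if (!tree) {
      continue;
    }
    std::vector<float>* elapsed = nullptr;
    tree->SetBranchAddress("elapsed", &elapsed);
    // One entry per snapshotBenchmark() call, i.e. per repeated launch.
    for (Long64_t i = 0; i < tree->GetEntries(); ++i) {
      tree->GetEntry(i);
      std::cout << key->GetName() << " launch " << i << ":";
      for (auto value : *elapsed) { // per-chunk (or per-region) timing values
        std::cout << " " << value;
      }
      std::cout << "\n";
    }
  }
  return 0;
}
```

The same inspection can also be done interactively from the ROOT prompt; only the tree and branch layout created by addBenchmarkEntry() is relied upon here.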