diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt
index f8f1931f35547..7019f951b25fb 100644
--- a/GPU/CMakeLists.txt
+++ b/GPU/CMakeLists.txt
@@ -22,6 +22,7 @@ add_subdirectory(Common)
 add_subdirectory(Utils)
 add_subdirectory(TPCFastTransformation)
 add_subdirectory(GPUTracking)
+add_subdirectory(GPUbenchmark)
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
   add_subdirectory(Workflow)
 endif()
diff --git a/GPU/GPUbenchmark/CMakeLists.txt b/GPU/GPUbenchmark/CMakeLists.txt
new file mode 100644
index 0000000000000..e008ab4cc0f41
--- /dev/null
+++ b/GPU/GPUbenchmark/CMakeLists.txt
@@ -0,0 +1,58 @@
+# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+set(HDRS_INSTALL ../Shared/Kernels.h)
+
+if(CUDA_ENABLED)
+  # add_subdirectory(cuda)
+  o2_add_executable(gpu-memory-benchmark-cuda
+                    SOURCES benchmark.cxx
+                            cuda/Kernels.cu
+                    PUBLIC_LINK_LIBRARIES Boost::program_options
+                                          ROOT::Tree
+                    TARGETVARNAME targetName)
+endif()
+
+if(HIP_ENABLED)
+  # Hipify-perl
+  set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl")
+
+  set(HIP_KERNEL "Kernels.hip.cxx")
+  set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu)
+  set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/hip/${HIP_KERNEL}")
+
+  if(EXISTS ${HIPIFY_EXECUTABLE})
+    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL})
+    message("Generating HIP kernel code ...")
+    execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}")
+  else()
+    message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...")
+  endif()
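+  # Note: hipify-perl prepends '#include "hip/hip_runtime.h"' to its output; the sed
+  # filter above drops that first line because Kernels.cu already includes the header
+  # when __HIPCC__ is defined, so the generated Kernels.hip.cxx would include it twice.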
+
+  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
+
+  set(CMAKE_CXX_EXTENSIONS OFF)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc")
+
+  o2_add_executable(gpu-memory-benchmark-hip
+                    SOURCES benchmark.cxx
+                            hip/Kernels.hip.cxx
+                    PUBLIC_LINK_LIBRARIES hip::host
+                                          Boost::program_options
+                                          ROOT::Tree
+                    TARGETVARNAME targetName)
+
+  if(HIP_AMDGPUTARGET)
+    # Need to add gpu target also to link flags due to gpu-rdc option
+    target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET})
+  endif()
+endif()
\ No newline at end of file
diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h
new file mode 100644
index 0000000000000..a4e7f71440347
--- /dev/null
+++ b/GPU/GPUbenchmark/Shared/Kernels.h
@@ -0,0 +1,86 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file Kernels.h
+/// \author: mconcas@cern.ch
+
+#ifndef GPU_BENCHMARK_KERNELS_H
+#define GPU_BENCHMARK_KERNELS_H
+
+#include "Utils.h"
+#include
+#include
+#include
+#include
+#include
+
+namespace o2
+{
+namespace benchmark
+{
+
+template <class chunk_type>
+class GPUbenchmark final
+{
+ public:
+  GPUbenchmark() = delete; // a configuration is required
+  GPUbenchmark(benchmarkOpts& opts, std::shared_ptr<ResultWriter> rWriter) : mResultWriter{rWriter}, mOptions{opts}
+  {
+  }
+  virtual ~GPUbenchmark() = default;
+  template <typename... T>
+  float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args);
+
+  // Single stream synchronous (sequential kernels) execution
+  template <typename... T>
+  float benchmarkSync(void (*kernel)(T...),
+                      int nLaunches, int blocks, int threads, T&... args);
+
+  // Multi-streams asynchronous executions on whole memory
+  template <typename... T>
+  std::vector<float> benchmarkAsync(void (*kernel)(int, T...),
+                                    int nStreams, int nLaunches, int blocks, int threads, T&... args);
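+
+  // benchmarkSync records one start/stop event pair around nLaunches back-to-back
+  // launches of a kernel on the default stream and returns the elapsed time in ms;
+  // benchmarkAsync creates one stream per chunk, performs the same nLaunches on each
+  // stream (forwarding the stream index as the chunk id) and returns one elapsed
+  // time per stream.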
+
+  // Main interface
+  void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters
+  void run();                          // Execute all specified callbacks
+  void globalFinalize();               // Cleanup
+  void printDevices();                 // Dump info
+
+  // Initializations/Finalizations of tests. Not to be measured, in principle used for report
+  void readInit();
+  void readFinalize();
+
+  void writeInit();
+  void writeFinalize();
+
+  void copyInit();
+  void copyFinalize();
+
+  // Kernel calling wrappers
+  void readSequential(SplitLevel sl);
+  void readConcurrent(SplitLevel sl, int nRegions = 2);
+
+  void writeSequential(SplitLevel sl);
+  void writeConcurrent(SplitLevel sl, int nRegions = 2);
+
+  void copySequential(SplitLevel sl);
+  void copyConcurrent(SplitLevel sl, int nRegions = 2);
+
+ private:
+  gpuState<chunk_type> mState;
+  std::shared_ptr<ResultWriter> mResultWriter;
+  benchmarkOpts mOptions;
+};
+
+} // namespace benchmark
+} // namespace o2
+#endif
\ No newline at end of file
diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h
new file mode 100644
index 0000000000000..6d3400aa9a6ec
--- /dev/null
+++ b/GPU/GPUbenchmark/Shared/Utils.h
@@ -0,0 +1,180 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file Utils.h
+/// \author: mconcas@cern.ch
+
+#ifndef GPU_BENCHMARK_UTILS_H
+#define GPU_BENCHMARK_UTILS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define KNRM "\x1B[0m"
+#define KRED "\x1B[31m"
+#define KGRN "\x1B[32m"
+#define KYEL "\x1B[33m"
+#define KBLU "\x1B[34m"
+#define KMAG "\x1B[35m"
+#define KCYN "\x1B[36m"
+#define KWHT "\x1B[37m"
+
+#define GB (1024 * 1024 * 1024)
+
+namespace o2
+{
+namespace benchmark
+{
+
+enum class SplitLevel {
+  Blocks,
+  Threads
+};
+
+struct benchmarkOpts {
+  benchmarkOpts() = default;
+
+  float chunkReservedGB = 1.f;
+  int nRegions = 2;
+  float freeMemoryFractionToAllocate = 0.95f;
+  int kernelLaunches = 1;
+  int nTests = 1;
+};
+
+template <class T>
+struct gpuState {
+  int getMaxChunks()
+  {
+    return (double)scratchSize / (chunkReservedGB * GB);
+  }
+
+  void computeScratchPtrs()
+  {
+    partAddrOnHost.resize(getMaxChunks());
+    for (size_t iBuffAddress{0}; iBuffAddress < getMaxChunks(); ++iBuffAddress) {
+      partAddrOnHost[iBuffAddress] = reinterpret_cast<T*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * iBuffAddress);
+    }
+  }
+
+  size_t getPartitionCapacity()
+  {
+    return static_cast<size_t>(GB * chunkReservedGB / sizeof(T));
+  }
+
+  std::vector<T*> getScratchPtrs()
+  {
+    return partAddrOnHost;
+  }
+
+  std::vector<std::vector<T>>& getHostBuffers()
+  {
+    return gpuBuffersHost;
+  }
+
+  int getNKernelLaunches() { return iterations; }
+
+  // Configuration
+  size_t nMaxThreadsPerDimension;
+  int iterations;
+
+  float chunkReservedGB; // Size of each partition (GB)
+
+  // General containers and state
+  T* scratchPtr;                              // Pointer to scratch buffer
+  size_t scratchSize;                         // Size of scratch area (B)
+  std::vector<T*> partAddrOnHost;             // Pointers to scratch partitions on host vector
+  std::vector<std::vector<T>> gpuBuffersHost; // Host-based vector-ized data
+  T* deviceReadResultsPtr;                    // Results of the read test (single variable) on GPU
+  std::vector<T> hostReadResultsVector;       // Results of the read test (single variable) on host
+  T* deviceWriteResultsPtr;                   // Results of the write test (single variable) on GPU
+  std::vector<T> hostWriteResultsVector;      // Results of the write test (single variable) on host
+  T* deviceCopyInputsPtr;                     // Inputs of the copy test (single variable) on GPU
+  std::vector<T> hostCopyInputsVector;        // Inputs of the copy test (single variable) on host
+
+  // Static info
+  size_t totalMemory;
+  size_t nMultiprocessors;
+  size_t nMaxThreadsPerBlock;
+};
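+
+// Example (illustrative numbers): with the default chunkReservedGB = 1 and a scratch
+// buffer of ~15.2 GB (95% of the free memory on a 16 GB card), getMaxChunks() gives
+// 15 chunks; for T = int each chunk holds getPartitionCapacity() = 1 GB / 4 B =
+// 268435456 elements, and chunk i starts at scratchPtr advanced by i * 1 GB.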
+
+// Interface class to stream results to root file
+class ResultWriter
+{
+ public:
+  explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root");
+  ~ResultWriter() = default;
+  void storeBenchmarkEntry(int chunk, float entry);
+  void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry);
+  void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks);
+  void snapshotBenchmark();
+  void saveToFile();
+
+ private:
+  std::vector<float> mBenchmarkResults;
+  std::vector<TTree*> mBenchmarkTrees;
+  TFile* mOutfile;
+};
+
+inline ResultWriter::ResultWriter(const std::string resultsTreeFilename)
+{
+  mOutfile = TFile::Open(resultsTreeFilename.data(), "recreate");
+}
+
+inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks)
+{
+  mBenchmarkTrees.emplace_back(new TTree((bName + "_" + type).data(), (bName + "_" + type).data()));
+  mBenchmarkResults.clear();
+  mBenchmarkResults.resize(nChunks);
+  mBenchmarkTrees.back()->Branch("elapsed", &mBenchmarkResults);
+}
+
+inline void ResultWriter::storeBenchmarkEntry(int chunk, float entry)
+{
+  mBenchmarkResults[chunk] = entry;
+}
+
+inline void ResultWriter::snapshotBenchmark()
+{
+  mBenchmarkTrees.back()->Fill();
+}
+
+inline void ResultWriter::saveToFile()
+{
+  mOutfile->cd();
+  for (auto t : mBenchmarkTrees) {
+    t->Write();
+  }
+  mOutfile->Close();
+}
+
+inline void ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry)
+{
+  // (*mTree)
+  //   << (benchmarkName + "_" + type + "_region_" + region).data()
+  //   << "elapsed=" << entry
+  //   << "\n";
+}
+
+} // namespace benchmark
+} // namespace o2
+
+#define failed(...)                       \
+  printf("%serror: ", KRED);              \
+  printf(__VA_ARGS__);                    \
+  printf("\n");                           \
+  printf("error: TEST FAILED\n%s", KNRM); \
+  exit(EXIT_FAILURE);
+#endif
\ No newline at end of file
diff --git a/GPU/GPUbenchmark/benchmark.cxx b/GPU/GPUbenchmark/benchmark.cxx
new file mode 100644
index 0000000000000..7ee638594f9e3
--- /dev/null
+++ b/GPU/GPUbenchmark/benchmark.cxx
@@ -0,0 +1,78 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file benchmark.cxx
+/// \author mconcas@cern.ch
+/// \brief configuration widely inspired by / copied from SimConfig
+#include "Shared/Kernels.h"
+
+bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
+{
+  namespace bpo = boost::program_options;
+  bpo::variables_map vm;
+  bpo::options_description options("Benchmark options");
+  options.add_options()(
+    "help,h", "Print help message.")(
+    "chunkSize,c", bpo::value<float>()->default_value(1.f), "Size of scratch partitions (GB).")(
+    "regions,r", bpo::value<int>()->default_value(2), "Number of memory regions to partition RAM in.")(
+    "freeMemFraction,f", bpo::value<float>()->default_value(0.95f), "Fraction of free memory to be allocated (min: 0.f, max: 1.f).")(
+    "launches,l", bpo::value<int>()->default_value(10), "Number of iterations in reading kernels.")(
+    "ntests,n", bpo::value<int>()->default_value(1), "Number of times each test is run.");
+  try {
+    bpo::store(parse_command_line(argc, argv, options), vm);
+    if (vm.count("help")) {
+      std::cout << options << std::endl;
+      return false;
+    }
+
+    bpo::notify(vm);
+  } catch (const bpo::error& e) {
+    std::cerr << e.what() << "\n\n";
+    std::cerr << "Error parsing command line arguments. Available options:\n";
+
+    std::cerr << options << std::endl;
+    return false;
+  }
+
+  conf.freeMemoryFractionToAllocate = vm["freeMemFraction"].as<float>();
+  conf.chunkReservedGB = vm["chunkSize"].as<float>();
+  conf.nRegions = vm["regions"].as<int>();
+  conf.kernelLaunches = vm["launches"].as<int>();
+  conf.nTests = vm["ntests"].as<int>();
+
+  return true;
+}
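+
+// Example invocation (values are illustrative):
+//   gpu-memory-benchmark-cuda --chunkSize 0.5 --freeMemFraction 0.9 --ntests 3
+// splits the scratch buffer into 0.5 GB chunks, allocates 90% of the free GPU memory
+// and repeats every read/write/copy measurement three times.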
+
+using o2::benchmark::ResultWriter;
+
+int main(int argc, const char* argv[])
+{
+
+  o2::benchmark::benchmarkOpts opts;
+
+  if (!parseArgs(opts, argc, argv)) {
+    return -1;
+  }
+
+  std::shared_ptr<ResultWriter> writer = std::make_shared<ResultWriter>();
+
+  o2::benchmark::GPUbenchmark<char> bm_char{opts, writer};
+  bm_char.run();
+  o2::benchmark::GPUbenchmark<int> bm_int{opts, writer};
+  bm_int.run();
+  o2::benchmark::GPUbenchmark<size_t> bm_size_t{opts, writer};
+  bm_size_t.run();
+
+  // save results
+  writer.get()->saveToFile();
+
+  return 0;
+}
diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
new file mode 100644
index 0000000000000..8af91423c12e5
--- /dev/null
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -0,0 +1,845 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+///
+/// \file Kernels.{cu, hip.cxx}
+/// \author: mconcas@cern.ch
+
+#include "../Shared/Kernels.h"
+#if defined(__HIPCC__)
+#include "hip/hip_runtime.h"
+#endif
+#include
+
+// Memory partitioning legend
+//
+// |----------------------region 0-----------------|----------------------region 1-----------------| regions -> default: 2, to test lower and upper RAM
+// |--chunk 0--|--chunk 1--|--chunk 2--|    ***    |--chunk n--|                                      chunks  -> default size: 1GB (single block pins)
+// |__________________________________________scratch______________________________________________| scratch -> default size: 95% free GPU RAM
+
+#define GPUCHECK(error)                                                                        \
+  if (error != cudaSuccess) {                                                                  \
+    printf("%serror: '%s'(%d) at %s:%d%s\n", KRED, cudaGetErrorString(error), error, __FILE__, \
+           __LINE__, KNRM);                                                                    \
+    failed("API returned error code.");                                                        \
+  }
+
+double bytesToKB(size_t s) { return (double)s / (1024.0); }
+double bytesToGB(size_t s) { return (double)s / GB; }
+
+int getCorrespondingRegionId(int Id, int nChunks, int nRegions = 1)
+{
+  return Id * nRegions / nChunks;
+}
+
+template <class T>
+std::string getType()
+{
+  if (typeid(T).name() == typeid(char).name()) {
+    return std::string{"char"};
+  }
+  if (typeid(T).name() == typeid(size_t).name()) {
+    return std::string{"unsigned_long"};
+  }
+  if (typeid(T).name() == typeid(int).name()) {
+    return std::string{"int"};
+  }
+  if (typeid(T).name() == typeid(int4).name()) {
+    return std::string{"int4"};
+  }
+  return std::string{"unknown"};
+}
+
+namespace o2
+{
+namespace benchmark
+{
+namespace gpu
+{
+
+///////////////////////////
+// Device functions go here
+template <class chunk_type>
+__host__ __device__ inline chunk_type* getPartPtrOnScratch(chunk_type* scratchPtr, float chunkReservedGB, size_t partNumber)
+{
+  return reinterpret_cast<chunk_type*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * partNumber);
+}
+
+//////////////////
+// Kernels go here
+// Reading
+template <class chunk_type>
+__global__ void readChunkSBKernel(
+ int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + chunk_type sink{0}; + chunk_type* ptr = getPartPtrOnScratch(scratch, chunkReservedGB, chunkId); + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + sink += ptr[i]; + } + if (sink == static_cast(1)) { + results[chunkId] = sink; + } + } +} + +template +__global__ void readChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + if (getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] == static_cast(1)) { // actual read operation is performed here + results[chunkId] += getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i]; // this case should never happen and waves should be always in sync + } + } +} + +// Writing +template +__global__ void writeChunkSBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } + } +} + +template +__global__ void writeChunkMBKernel( + int chunkId, + chunk_type* results, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = 1; + } +} + +// Copying +template +__global__ void copyChunkSBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + if (chunkId == blockIdx.x) { // runs only if blockIdx.x is allowed in given split + for (size_t i = threadIdx.x; i < chunkSize; i += blockDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } + } +} + +template +__global__ void copyChunkMBKernel( + int chunkId, + chunk_type* inputs, + chunk_type* scratch, + size_t chunkSize, + float chunkReservedGB = 1.f) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < chunkSize; i += blockDim.x * gridDim.x) { + getPartPtrOnScratch(scratch, chunkReservedGB, chunkId)[i] = inputs[chunkId]; + } +} + +} // namespace gpu + +void printDeviceProp(int deviceId) +{ + const int w1 = 34; + std::cout << std::left; + std::cout << std::setw(w1) + << "--------------------------------------------------------------------------------" + << std::endl; + std::cout << std::setw(w1) << "device#" << deviceId << std::endl; + + cudaDeviceProp props; + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + + std::cout << std::setw(w1) << "Name: " << props.name << std::endl; + std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl; + std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl; + std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl; + std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor + << std::endl; + std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl; + std::cout << std::setw(w1) << 
"clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl; + std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" + << std::endl; + std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl; + std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0 + << " Mhz" << std::endl; + std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2) + << bytesToGB(props.totalGlobalMem) << " GB" << std::endl; +#if !defined(__CUDACC__) + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.sharedMemPerMultiprocessor) << " KB" << std::endl; +#endif +#if defined(__HIPCC__) + std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2) + << bytesToKB(props.maxSharedMemoryPerMultiProcessor) << " KB" << std::endl; +#endif + std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl; + std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" + << std::endl; + std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl; + std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl; + std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl; + std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl; + std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl; + std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl; + std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl; + std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl; + std::cout << std::setw(w1) << "major: " << props.major << std::endl; + std::cout << std::setw(w1) << "minor: " << props.minor << std::endl; + std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl; + std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl; + std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl; +#if defined(__HIPCC__) + std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch + << std::endl; + std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl; + std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl; + std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl; + std::cout << std::setw(w1) << 
"arch.hasDoubles: " << props.arch.hasDoubles << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl; + std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl; + std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl; + std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl; + std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl; + std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl; + std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl; + std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl; + std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl; +#endif + std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl; + std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl; + std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl; +#if defined(__HIPCC__) + std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl; + std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl; +#endif + + int deviceCnt; + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + std::cout << std::setw(w1) << "peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); + if (isPeer) { + std::cout << "device#" << i << " "; + } + } + std::cout << std::endl; + std::cout << std::setw(w1) << "non-peers: "; + for (int i = 0; i < deviceCnt; i++) { + int isPeer; + GPUCHECK(cudaDeviceCanAccessPeer(&isPeer, i, deviceId)); + if (!isPeer) { + std::cout << "device#" << i << " "; + } + } + std::cout << std::endl; + + size_t free, total; + GPUCHECK(cudaMemGetInfo(&free, &total)); + + std::cout << std::fixed << std::setprecision(2); + std::cout << std::setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << std::endl; + std::cout << std::setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << std::setprecision(0) + << (float)free / total * 100.0 << "%)" << std::endl; +} + +template +template +float GPUbenchmark::benchmarkSync(void (*kernel)(T...), + int nLaunches, int blocks, int threads, T&... 
args) // run for each chunk (id is passed in variadic args) +{ + cudaEvent_t start, stop; + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + + GPUCHECK(cudaEventRecord(start)); + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(args...); + } + GPUCHECK(cudaEventRecord(stop)); // record checkpoint + + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions + float milliseconds{0.f}; + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + + return milliseconds; +} + +template +template +std::vector GPUbenchmark::benchmarkAsync(void (*kernel)(int, T...), + int nStreams, int nLaunches, int blocks, int threads, T&... args) +{ + std::vector starts(nStreams), stops(nStreams); + std::vector streams(nStreams); + std::vector results(nStreams); + + for (auto iStream{0}; iStream < nStreams; ++iStream) { // one stream per chunk + GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); + GPUCHECK(cudaEventCreate(&(starts[iStream]))); + GPUCHECK(cudaEventCreate(&(stops[iStream]))); + } + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventRecord(starts[iStream], streams[iStream])); + + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // consecutive launches on the same stream + (*kernel)<<>>(iStream, args...); + } + GPUCHECK(cudaEventRecord(stops[iStream], streams[iStream])); + } + + for (auto iStream{0}; iStream < nStreams; ++iStream) { + GPUCHECK(cudaEventSynchronize(stops[iStream])); + GPUCHECK(cudaEventElapsedTime(&(results.at(iStream)), starts[iStream], stops[iStream])); + } + + return results; +} + +template +void GPUbenchmark::printDevices() +{ + int deviceCnt; + GPUCHECK(cudaGetDeviceCount(&deviceCnt)); + + for (int i = 0; i < deviceCnt; i++) { + GPUCHECK(cudaSetDevice(i)); + printDeviceProp(i); + } +} + +template +void GPUbenchmark::globalInit(const int deviceId) +{ + cudaDeviceProp props; + size_t free; + + // Fetch and store features + GPUCHECK(cudaGetDeviceProperties(&props, deviceId)); + GPUCHECK(cudaMemGetInfo(&free, &mState.totalMemory)); + + mState.chunkReservedGB = mOptions.chunkReservedGB; + mState.iterations = mOptions.kernelLaunches; + mState.nMultiprocessors = props.multiProcessorCount; + mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; + mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; + mState.scratchSize = static_cast(mOptions.freeMemoryFractionToAllocate * free); + std::cout << ">>> Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; + + // Allocate scratch on GPU + GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); + + mState.computeScratchPtrs(); + GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) + + std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl + << " ├ Allocated: " << std::setprecision(2) << bytesToGB(mState.scratchSize) << "/" << std::setprecision(2) << bytesToGB(mState.totalMemory) + << "(GB) [" << std::setprecision(3) << (100.f) * (mState.scratchSize / (float)mState.totalMemory) << "%]\n" + << " ├ Number of scratch chunks: " << mState.getMaxChunks() << " of " << mOptions.chunkReservedGB << "GB each\n" + << " └ Each chunk can store up to: " << mState.getPartitionCapacity() << " elements" << std::endl + << std::endl; +} + +/// Read +template +void GPUbenchmark::readInit() +{ + std::cout << ">>> Initializing read benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << 
std::endl; + mState.hostReadResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceReadResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::readSequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_read_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("seq_read_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::readChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceReadResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::readConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("conc_read_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent read, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::readChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case 
SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("conc_read_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent read, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::readChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceReadResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::readFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostReadResultsVector.data(), mState.deviceReadResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceReadResultsPtr)); + std::cout << " └ done." << std::endl; +} + +/// Write +template +void GPUbenchmark::writeInit() +{ + std::cout << ">>> Initializing write benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostWriteResultsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceWriteResultsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::writeSequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_write_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("seq_write_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop 
over single chunks separately + auto result = benchmarkSync(&gpu::writeChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceWriteResultsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("conc_write_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent write, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("conc_write_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent write, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::writeChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceWriteResultsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::writeFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostWriteResultsVector.data(), mState.deviceWriteResultsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceWriteResultsPtr)); + std::cout << " └ done." 
<< std::endl; +} + +/// Copy +template +void GPUbenchmark::copyInit() +{ + std::cout << ">>> Initializing copy benchmarks with \e[1m" << mOptions.nTests << "\e[0m runs and \e[1m" << mOptions.kernelLaunches << "\e[0m kernel launches" << std::endl; + mState.hostCopyInputsVector.resize(mState.getMaxChunks()); + GPUCHECK(cudaMalloc(reinterpret_cast(&(mState.deviceCopyInputsPtr)), mState.getMaxChunks() * sizeof(chunk_type))); + GPUCHECK(cudaMemset(mState.deviceCopyInputsPtr, 1, mState.getMaxChunks() * sizeof(chunk_type))); +} + +template +void GPUbenchmark::copySequential(SplitLevel sl) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("seq_copy_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkSBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("seq_copy_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { // loop on the number of times we perform same measurement + std::cout << std::setw(2) << " ├ Sequential copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + for (auto iChunk{0}; iChunk < mState.getMaxChunks(); ++iChunk) { // loop over single chunks separately + auto result = benchmarkSync(&gpu::copyChunkMBKernel, + mState.getNKernelLaunches(), + nBlocks, + nThreads, + iChunk, + mState.deviceCopyInputsPtr, + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + mResultWriter.get()->storeBenchmarkEntry(iChunk, result); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyConcurrent(SplitLevel sl, int nRegions) +{ + switch (sl) { + case SplitLevel::Blocks: { + mResultWriter.get()->addBenchmarkEntry("conc_copy_SB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, sing block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkSBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by 
wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + case SplitLevel::Threads: { + mResultWriter.get()->addBenchmarkEntry("conc_copy_MB", getType(), mState.getMaxChunks()); + auto nBlocks{mState.nMultiprocessors}; + auto nThreads{std::min(mState.nMaxThreadsPerDimension, mState.nMaxThreadsPerBlock)}; + auto chunks{mState.getMaxChunks()}; + auto capacity{mState.getPartitionCapacity()}; + + for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { + std::cout << " ├ Concurrent copy, mult block (" << measurement + 1 << "/" << mOptions.nTests << "):"; + auto results = benchmarkAsync(&gpu::copyChunkMBKernel, + mState.getMaxChunks(), // nStreams + mState.getNKernelLaunches(), + nBlocks, + nThreads, + mState.deviceCopyInputsPtr, // kernel arguments (chunkId is passed by wrapper) + mState.scratchPtr, + capacity, + mState.chunkReservedGB); + for (auto iResult{0}; iResult < results.size(); ++iResult) { + auto region = getCorrespondingRegionId(iResult, nBlocks, nRegions); + mResultWriter.get()->storeBenchmarkEntry(iResult, results[iResult]); + } + mResultWriter.get()->snapshotBenchmark(); + std::cout << "\033[1;32m complete\033[0m" << std::endl; + } + break; + } + } +} + +template +void GPUbenchmark::copyFinalize() +{ + GPUCHECK(cudaMemcpy(mState.hostCopyInputsVector.data(), mState.deviceCopyInputsPtr, mState.getMaxChunks() * sizeof(chunk_type), cudaMemcpyDeviceToHost)); + GPUCHECK(cudaFree(mState.deviceCopyInputsPtr)); + std::cout << " └ done." << std::endl; +} + +template +void GPUbenchmark::globalFinalize() +{ + GPUCHECK(cudaFree(mState.scratchPtr)); +} + +template +void GPUbenchmark::run() +{ + globalInit(0); + + readInit(); + // Reading in whole memory + readSequential(SplitLevel::Blocks); + readSequential(SplitLevel::Threads); + + // Reading in memory regions + readConcurrent(SplitLevel::Blocks); + readConcurrent(SplitLevel::Threads); + readFinalize(); + + writeInit(); + // Write on whole memory + writeSequential(SplitLevel::Blocks); + writeSequential(SplitLevel::Threads); + + // Write on memory regions + writeConcurrent(SplitLevel::Blocks); + writeConcurrent(SplitLevel::Threads); + writeFinalize(); + + copyInit(); + // Copy from input buffer (size = nChunks) on whole memory + copySequential(SplitLevel::Blocks); + copySequential(SplitLevel::Threads); + + // Copy from input buffer (size = nChunks) on memory regions + copyConcurrent(SplitLevel::Blocks); + copyConcurrent(SplitLevel::Threads); + copyFinalize(); + + GPUbenchmark::globalFinalize(); +} + +template class GPUbenchmark; +template class GPUbenchmark; +template class GPUbenchmark; +// template class GPUbenchmark; + +} // namespace benchmark +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUbenchmark/hip/.gitignore b/GPU/GPUbenchmark/hip/.gitignore new file mode 100644 index 0000000000000..14f27f00c53c2 --- /dev/null +++ b/GPU/GPUbenchmark/hip/.gitignore @@ -0,0 +1 @@ +*.hip.cxx \ No newline at end of file