Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
244287b
Add CUDA backbone
mconcas May 19, 2021
4071b11
HIP breaks
mconcas May 19, 2021
04ca94b
Make two separate libraries
mconcas May 20, 2021
aae7f42
Re-arrange directories
mconcas May 23, 2021
3cc9695
Add missing header
mconcas May 23, 2021
1f31791
Meta library does not compile
mconcas May 23, 2021
175be3f
Port hipInfo example to test gpu specs
mconcas May 24, 2021
ed9ade9
Fix compilation to test build on EPN
mconcas May 25, 2021
d74f0b4
Produce two separate executables
mconcas May 27, 2021
9454b54
Flatten dir tree a bit
mconcas May 27, 2021
84f8534
Cleanup
mconcas May 27, 2021
7e2ef6a
Add CMake forced re-configuration
mconcas May 28, 2021
664759f
HIP can't find symbols
mconcas May 31, 2021
1d352ab
Checkpoint before radical change
mconcas Jun 1, 2021
12c5d39
Create single executable
mconcas Jun 1, 2021
0a7ae2e
Update
mconcas Jun 1, 2021
6c352b1
Add first dummy benchmark
mconcas Jun 21, 2021
3773655
Assign a block to each scratch segment
mconcas Jun 23, 2021
8933ff1
Fix copyright
mconcas Jun 23, 2021
fd8041b
Please consider the following formatting changes (#16)
alibuild Jun 23, 2021
f80c684
Set configurable iterations
mconcas Jun 23, 2021
3896a71
Improve busy function + streaming results on file
mconcas Jun 24, 2021
cf1276e
Fix bug in CLI params
mconcas Jun 25, 2021
2b7e27e
Add configurable number of tests
mconcas Jun 25, 2021
134b31d
Fix undefined behaviour in setting nLaunches
mconcas Jun 25, 2021
3e96125
Please consider the following formatting changes (#17)
alibuild Jun 25, 2021
d928a4c
Streamline kernel benchmarking w/ events
mconcas Jun 30, 2021
bc8e75b
Tidy up kernels and improve output
mconcas Jul 1, 2021
71408e8
Update read test
mconcas Jul 7, 2021
05cc7ae
Please consider the following formatting changes (#18)
alibuild Jul 7, 2021
4ae3ae4
CP
mconcas Jul 8, 2021
195aae7
Add last read test
mconcas Jul 8, 2021
33b19be
Add last read test
mconcas Jul 8, 2021
434c9af
Fix result dump on file
mconcas Jul 8, 2021
9daeab5
add reading kernel
mconcas Jul 9, 2021
e309004
Add write tests
mconcas Jul 10, 2021
9018922
Add copy benchmark
mconcas Jul 10, 2021
f31f774
Remove CommonUtils dependency
mconcas Jul 12, 2021
e330a7e
Remove GPUCommon dependency
mconcas Jul 12, 2021
1480959
Fix fullCI errors
mconcas Jul 12, 2021
5383afc
Revise result saving
mconcas Jul 13, 2021
cbef6f8
Ready to test on EPN
mconcas Jul 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_subdirectory(Common)
add_subdirectory(Utils)
add_subdirectory(TPCFastTransformation)
add_subdirectory(GPUTracking)
add_subdirectory(GPUbenchmark)
if(ALIGPU_BUILD_TYPE STREQUAL "O2")
add_subdirectory(Workflow)
endif()
58 changes: 58 additions & 0 deletions GPU/GPUbenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
# All rights not expressly granted are reserved.
#
# This software is distributed under the terms of the GNU General Public
# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
#
# In applying this license CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

set(HDRS_INSTALL ../Shared/Kernels.h)

if(CUDA_ENABLED)
  o2_add_executable(gpu-memory-benchmark-cuda
                    SOURCES benchmark.cxx
                            cuda/Kernels.cu
                    PUBLIC_LINK_LIBRARIES Boost::program_options
                                          ROOT::Tree
                    TARGETVARNAME targetName)
endif()

if(HIP_ENABLED)
  # The HIP kernel source is generated from the CUDA one with hipify-perl.
  set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl")

  set(HIP_KERNEL "Kernels.hip.cxx")
  set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu)
  set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/hip/${HIP_KERNEL}")

  if(EXISTS ${HIPIFY_EXECUTABLE})
    # Force re-configuration whenever the CUDA kernel changes so the HIP copy stays in sync.
    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL})
    message("Generating HIP kernel code ...")
    # sed deletes the `#include "hip/hip_runtime.h"` hipify adds on line 1.
    execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}")
  else()
    # BUGFIX: this branch was `elseif()` — an empty elseif() condition is always
    # false, so the diagnostic below could never be reached.
    message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...")
  endif()

  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

  set(CMAKE_CXX_EXTENSIONS OFF)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc")

  o2_add_executable(gpu-memory-benchmark-hip
                    SOURCES benchmark.cxx
                            hip/Kernels.hip.cxx
                    PUBLIC_LINK_LIBRARIES hip::host
                                          Boost::program_options
                                          ROOT::Tree
                    TARGETVARNAME targetName)

  if(HIP_AMDGPUTARGET)
    # Need to add gpu target also to link flags due to gpu-rdc option
    target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET})
  endif()
endif()
86 changes: 86 additions & 0 deletions GPU/GPUbenchmark/Shared/Kernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.
///
/// \file Kernels.h
/// \author: mconcas@cern.ch

#ifndef GPU_BENCHMARK_KERNELS_H
#define GPU_BENCHMARK_KERNELS_H

#include "Utils.h"
#include <vector>
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>

namespace o2
{
namespace benchmark
{

template <class chunk_type>
class GPUbenchmark final
{
 public:
  GPUbenchmark() = delete; // a configuration is required: use the (opts, writer) constructor
  GPUbenchmark(benchmarkOpts& opts, std::shared_ptr<ResultWriter> rWriter) : mResultWriter{rWriter}, mOptions{opts}
  {
  }
  // The class is `final`, so polymorphic deletion is impossible: a virtual
  // destructor (and the vtable it would force) is unnecessary.
  ~GPUbenchmark() = default;

  // Time a member benchmark callback identified by name; returns the elapsed
  // time (units defined by the implementation in the .cu file — TODO confirm ms).
  template <typename... T>
  float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args);

  // Single stream synchronous (sequential kernels) execution
  template <typename... T>
  float benchmarkSync(void (*kernel)(T...),
                      int nLaunches, int blocks, int threads, T&... args);

  // Multi-streams asynchronous executions on whole memory; returns a vector of
  // elapsed times (presumably one per stream — confirm in the implementation).
  template <typename... T>
  std::vector<float> benchmarkAsync(void (*kernel)(int, T...),
                                    int nStreams, int nLaunches, int blocks, int threads, T&... args);

  // Main interface
  void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters
  void run();                          // Execute all specified callbacks
  void globalFinalize();               // Cleanup
  void printDevices();                 // Dump info

  // Initializations/Finalizations of tests. Not to be measured, in principle used for report
  void readInit();
  void readFinalize();

  void writeInit();
  void writeFinalize();

  void copyInit();
  void copyFinalize();

  // Kernel calling wrappers
  void readSequential(SplitLevel sl);
  void readConcurrent(SplitLevel sl, int nRegions = 2);

  void writeSequential(SplitLevel sl);
  void writeConcurrent(SplitLevel sl, int nRegions = 2);

  void copySequential(SplitLevel sl);
  void copyConcurrent(SplitLevel sl, int nRegions = 2);

 private:
  gpuState<chunk_type> mState;                 // buffers, pointers and runtime parameters
  std::shared_ptr<ResultWriter> mResultWriter; // sink for timing results, shared with the caller
  benchmarkOpts mOptions;                      // copy of the user configuration
};

} // namespace benchmark
} // namespace o2
#endif
180 changes: 180 additions & 0 deletions GPU/GPUbenchmark/Shared/Utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.
///
/// \file Utils.h
/// \author: mconcas@cern.ch

#ifndef GPU_BENCHMARK_UTILS_H
#define GPU_BENCHMARK_UTILS_H

#include <iostream>
#include <iomanip>
#include <typeinfo>
#include <boost/program_options.hpp>
#include <vector>
#include <TTree.h>
#include <TFile.h>

#define KNRM "\x1B[0m"
#define KRED "\x1B[31m"
#define KGRN "\x1B[32m"
#define KYEL "\x1B[33m"
#define KBLU "\x1B[34m"
#define KMAG "\x1B[35m"
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"

#define GB (1024 * 1024 * 1024)

namespace o2
{
namespace benchmark
{

// Granularity at which a test splits its work across the device
// (exact kernel-side meaning defined in the .cu implementation — TODO confirm).
enum class SplitLevel {
Blocks,  // split at block granularity
Threads  // split at thread granularity
};

// User-configurable options for a benchmark run (presumably populated from the
// CLI in benchmark.cxx — verify against the caller).
struct benchmarkOpts {
benchmarkOpts() = default;

float chunkReservedGB = 1.f; // size of each scratch chunk/partition, in GB
int nRegions = 2; // number of memory regions for the *Concurrent tests — TODO confirm
float freeMemoryFractionToAllocate = 0.95f; // fraction of free device memory to reserve as scratch — TODO confirm
int kernelLaunches = 1; // kernel launches per measurement
int nTests = 1; // number of repetitions of each test
};

// Mutable state of one benchmark run: scratch-buffer geometry, host mirrors of
// device buffers and device query results. T is the element ("chunk") type.
template <class T>
struct gpuState {
  // Number of whole chunks of chunkReservedGB GB each that fit in the scratch buffer.
  int getMaxChunks()
  {
    // Explicit casts: the previous C-style (double) cast hid the int truncation.
    return static_cast<int>(static_cast<double>(scratchSize) / (chunkReservedGB * GB));
  }

  // Fill partAddrOnHost with the host-visible start address of each chunk.
  void computeScratchPtrs()
  {
    // Hoisted out of the loop: fixes the signed/unsigned comparison against the
    // int return of getMaxChunks() and avoids re-evaluating it every iteration.
    const auto nChunks = static_cast<size_t>(getMaxChunks());
    partAddrOnHost.resize(nChunks);
    for (size_t iBuffAddress{0}; iBuffAddress < nChunks; ++iBuffAddress) {
      partAddrOnHost[iBuffAddress] = reinterpret_cast<T*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * iBuffAddress);
    }
  }

  // Number of elements of type T that fit in a single chunk.
  size_t getPartitionCapacity()
  {
    return static_cast<size_t>(GB * chunkReservedGB / sizeof(T));
  }

  // Copy of the chunk start addresses (valid after computeScratchPtrs()).
  std::vector<T*> getScratchPtrs()
  {
    return partAddrOnHost;
  }

  std::vector<std::vector<T>>& getHostBuffers()
  {
    return gpuBuffersHost;
  }

  int getNKernelLaunches() { return iterations; }

  // Configuration
  size_t nMaxThreadsPerDimension;
  int iterations;

  float chunkReservedGB; // Size of each partition (GB)

  // General containers and state
  T* scratchPtr;                              // Pointer to scratch buffer
  size_t scratchSize;                         // Size of scratch area (B)
  std::vector<T*> partAddrOnHost;             // Pointers to scratch partitions on host vector
  std::vector<std::vector<T>> gpuBuffersHost; // Host-based vector-ized data
  T* deviceReadResultsPtr;                    // Results of the read test (single variable) on GPU
  std::vector<T> hostReadResultsVector;       // Results of the read test (single variable) on host
  T* deviceWriteResultsPtr;                   // Results of the write test (single variable) on GPU
  std::vector<T> hostWriteResultsVector;      // Results of the write test (single variable) on host
  T* deviceCopyInputsPtr;                     // Inputs of the copy test (single variable) on GPU
  std::vector<T> hostCopyInputsVector;        // Inputs of the copy test (single variable) on host

  // Static info (device properties)
  size_t totalMemory;
  size_t nMultiprocessors;
  size_t nMaxThreadsPerBlock;
};

// Interface class to stream results to root file
class ResultWriter
{
public:
explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root");
~ResultWriter() = default;
void storeBenchmarkEntry(int chunk, float entry);
void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry);
void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks);
void snapshotBenchmark();
void saveToFile();

private:
std::vector<float> mBenchmarkResults;
std::vector<TTree*> mBenchmarkTrees;
TFile* mOutfile;
};

// Open (mode "recreate": overwrite any existing file) the ROOT file that will
// hold the benchmark result trees.
inline ResultWriter::ResultWriter(const std::string resultsTreeFilename)
{
  const char* path = resultsTreeFilename.data();
  mOutfile = TFile::Open(path, "recreate");
}

// Begin a new benchmark: create the "<bName>_<type>" tree and bind its
// "elapsed" branch to a results buffer resized to nChunks (zero-filled).
inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks)
{
  const std::string treeName = bName + "_" + type;
  mBenchmarkTrees.emplace_back(new TTree(treeName.data(), treeName.data()));
  mBenchmarkResults.clear();
  mBenchmarkResults.resize(nChunks);
  mBenchmarkTrees.back()->Branch("elapsed", &mBenchmarkResults);
}

// Record the measurement for the given chunk in the current results buffer.
// NOTE(review): no bounds check — `chunk` must be < the nChunks passed to the
// last addBenchmarkEntry() call, otherwise this is out-of-bounds access.
inline void ResultWriter::storeBenchmarkEntry(int chunk, float entry)
{
mBenchmarkResults[chunk] = entry;
}

// Commit the buffered results as one entry of the most recently added tree.
// NOTE(review): calls back() without an emptiness check — addBenchmarkEntry()
// must have been called at least once first.
inline void ResultWriter::snapshotBenchmark()
{
mBenchmarkTrees.back()->Fill();
}

// Write every benchmark tree to the output file and close it.
inline void ResultWriter::saveToFile()
{
  if (mOutfile == nullptr) {
    // TFile::Open in the constructor can fail (e.g. unwritable path) and
    // return nullptr; bail out instead of dereferencing it.
    std::cerr << "ResultWriter: no output file is open, results not saved" << std::endl;
    return;
  }
  mOutfile->cd();
  for (auto t : mBenchmarkTrees) {
    t->Write();
  }
  mOutfile->Close();
}

// Stub: per-region streaming is not implemented yet (intended behavior sketched
// in the commented prototype below). Parameters are explicitly discarded to
// silence -Wunused-parameter until the implementation lands.
inline void ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry)
{
  (void)benchmarkName;
  (void)region;
  (void)type;
  (void)entry;
  // (*mTree)
  //   << (benchmarkName + "_" + type + "_region_" + region).data()
  //   << "elapsed=" << entry
  //   << "\n";
}

} // namespace benchmark
} // namespace o2

// Print a red error message, report the test as failed and abort the process.
// Wrapped in do { } while (0) so the macro expands to a single statement:
// the previous unwrapped form broke inside an unbraced `if`/`else` (only the
// first printf was conditional; the remaining statements and exit() always ran,
// and a following `else` would not compile).
#define failed(...)                           \
  do {                                        \
    printf("%serror: ", KRED);                \
    printf(__VA_ARGS__);                      \
    printf("\n");                             \
    printf("error: TEST FAILED\n%s", KNRM);   \
    exit(EXIT_FAILURE);                       \
  } while (0)
#endif
Loading