Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
244287b
Add CUDA backbone
mconcas May 19, 2021
4071b11
HIP breaks
mconcas May 19, 2021
04ca94b
Make two separate libraries
mconcas May 20, 2021
aae7f42
Re-arrange directories
mconcas May 23, 2021
3cc9695
Add missing header
mconcas May 23, 2021
1f31791
Meta library does not compile
mconcas May 23, 2021
175be3f
Port hipInfo example to test gpu specs
mconcas May 24, 2021
ed9ade9
Fix compilation to test build on EPN
mconcas May 25, 2021
d74f0b4
Produce two separate executables
mconcas May 27, 2021
9454b54
Flatten dir tree a bit
mconcas May 27, 2021
84f8534
Cleanup
mconcas May 27, 2021
7e2ef6a
Add CMake forced re-configuration
mconcas May 28, 2021
664759f
HIP can't find symbols
mconcas May 31, 2021
1d352ab
Checkpoint before radical change
mconcas Jun 1, 2021
12c5d39
Create single executable
mconcas Jun 1, 2021
0a7ae2e
Update
mconcas Jun 1, 2021
6c352b1
Add first dummy benchmark
mconcas Jun 21, 2021
3773655
Assign a block to each scratch segment
mconcas Jun 23, 2021
8933ff1
Fix copyright
mconcas Jun 23, 2021
fd8041b
Please consider the following formatting changes (#16)
alibuild Jun 23, 2021
f80c684
Set configurable iterations
mconcas Jun 23, 2021
3896a71
Improve busy function + streaming results on file
mconcas Jun 24, 2021
cf1276e
Fix bug in CLI params
mconcas Jun 25, 2021
2b7e27e
Add configurable number of tests
mconcas Jun 25, 2021
134b31d
Fix undefined behaviour in setting nLaunches
mconcas Jun 25, 2021
3e96125
Please consider the following formatting changes (#17)
alibuild Jun 25, 2021
d928a4c
Streamline kernel benchmarking w/ events
mconcas Jun 30, 2021
bc8e75b
Tidy up kernels and improve output
mconcas Jul 1, 2021
71408e8
Update read test
mconcas Jul 7, 2021
05cc7ae
Please consider the following formatting changes (#18)
alibuild Jul 7, 2021
4ae3ae4
CP
mconcas Jul 8, 2021
195aae7
Add last read test
mconcas Jul 8, 2021
33b19be
Add last read test
mconcas Jul 8, 2021
434c9af
Fix result dump on file
mconcas Jul 8, 2021
9daeab5
add reading kernel
mconcas Jul 9, 2021
e309004
Add write tests
mconcas Jul 10, 2021
9018922
Add copy benchmark
mconcas Jul 10, 2021
f31f774
Remove CommonUtils dependency
mconcas Jul 12, 2021
e330a7e
Remove GPUCommon dependency
mconcas Jul 12, 2021
1480959
Fix fullCI errors
mconcas Jul 12, 2021
5383afc
Revise result saving
mconcas Jul 13, 2021
cbef6f8
Ready to test on EPN
mconcas Jul 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_subdirectory(Common)
add_subdirectory(Utils)
add_subdirectory(TPCFastTransformation)
add_subdirectory(GPUTracking)
add_subdirectory(GPUbenchmark)
if(ALIGPU_BUILD_TYPE STREQUAL "O2")
add_subdirectory(Workflow)
endif()
58 changes: 58 additions & 0 deletions GPU/GPUbenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2019-2020 CERN and copyright holders of ALICE O2.
# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
# All rights not expressly granted are reserved.
#
# This software is distributed under the terms of the GNU General Public
# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
#
# In applying this license CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

set(HDRS_INSTALL ../Shared/Kernels.h)

if(CUDA_ENABLED)
  o2_add_executable(gpu-memory-benchmark-cuda
                    SOURCES benchmark.cxx
                            cuda/Kernels.cu
                    PUBLIC_LINK_LIBRARIES Boost::program_options
                                          ROOT::Tree
                    TARGETVARNAME targetName)
endif()

if(HIP_ENABLED)
  # The HIP kernel source is generated from the CUDA one with hipify-perl.
  set(HIPIFY_EXECUTABLE "/opt/rocm/bin/hipify-perl")

  set(HIP_KERNEL "Kernels.hip.cxx")
  set(CU_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Kernels.cu)
  set(HIP_KERNEL_PATH "${CMAKE_CURRENT_SOURCE_DIR}/hip/${HIP_KERNEL}")

  if(EXISTS ${HIPIFY_EXECUTABLE})
    # Force re-configuration whenever the CUDA kernel changes so the HIP copy stays in sync.
    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${CU_KERNEL})
    message("Generating HIP kernel code ...")
    # sed deletes the `#include "hip/hip_runtime.h"` hipify adds on line 1.
    execute_process(COMMAND /bin/sh -c "${HIPIFY_EXECUTABLE} --quiet-warnings ${CU_KERNEL} | sed '1{/\\#include \"hip\\/hip_runtime.h\"/d}' > ${HIP_KERNEL_PATH}")
  else()
    # BUGFIX: this branch was `elseif()` — an empty elseif() condition is always
    # false, so the diagnostic below could never be reached.
    message(STATUS "Could not generate ${HIP_KERNEL} HIP kernel, skipping...")
  endif()

  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})

  set(CMAKE_CXX_EXTENSIONS OFF)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${O2_HIP_CMAKE_CXX_FLAGS} -fgpu-rdc")

  o2_add_executable(gpu-memory-benchmark-hip
                    SOURCES benchmark.cxx
                            hip/Kernels.hip.cxx
                    PUBLIC_LINK_LIBRARIES hip::host
                                          Boost::program_options
                                          ROOT::Tree
                    TARGETVARNAME targetName)

  if(HIP_AMDGPUTARGET)
    # Need to add gpu target also to link flags due to gpu-rdc option
    target_link_options(${targetName} PUBLIC --amdgpu-target=${HIP_AMDGPUTARGET})
  endif()
endif()
86 changes: 86 additions & 0 deletions GPU/GPUbenchmark/Shared/Kernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.
///
/// \file Kernels.h
/// \author: mconcas@cern.ch

#ifndef GPU_BENCHMARK_KERNELS_H
#define GPU_BENCHMARK_KERNELS_H

#include "Utils.h"
#include <vector>
#include <iostream>
#include <iomanip>
#include <memory>
#include <chrono>

namespace o2
{
namespace benchmark
{

template <class chunk_type>
class GPUbenchmark final
{
 public:
  GPUbenchmark() = delete; // a configuration is required: use the (opts, writer) constructor
  GPUbenchmark(benchmarkOpts& opts, std::shared_ptr<ResultWriter> rWriter) : mResultWriter{rWriter}, mOptions{opts}
  {
  }
  // The class is `final`, so polymorphic deletion is impossible: a virtual
  // destructor (and the vtable it would force) is unnecessary.
  ~GPUbenchmark() = default;

  // Time a member benchmark callback identified by name; returns the elapsed
  // time (units defined by the implementation in the .cu file — TODO confirm ms).
  template <typename... T>
  float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args);

  // Single stream synchronous (sequential kernels) execution
  template <typename... T>
  float benchmarkSync(void (*kernel)(T...),
                      int nLaunches, int blocks, int threads, T&... args);

  // Multi-streams asynchronous executions on whole memory; returns a vector of
  // elapsed times (presumably one per stream — confirm in the implementation).
  template <typename... T>
  std::vector<float> benchmarkAsync(void (*kernel)(int, T...),
                                    int nStreams, int nLaunches, int blocks, int threads, T&... args);

  // Main interface
  void globalInit(const int deviceId); // Allocate scratch buffers and compute runtime parameters
  void run();                          // Execute all specified callbacks
  void globalFinalize();               // Cleanup
  void printDevices();                 // Dump info

  // Initializations/Finalizations of tests. Not to be measured, in principle used for report
  void readInit();
  void readFinalize();

  void writeInit();
  void writeFinalize();

  void copyInit();
  void copyFinalize();

  // Kernel calling wrappers
  void readSequential(SplitLevel sl);
  void readConcurrent(SplitLevel sl, int nRegions = 2);

  void writeSequential(SplitLevel sl);
  void writeConcurrent(SplitLevel sl, int nRegions = 2);

  void copySequential(SplitLevel sl);
  void copyConcurrent(SplitLevel sl, int nRegions = 2);

 private:
  gpuState<chunk_type> mState;                 // buffers, pointers and runtime parameters
  std::shared_ptr<ResultWriter> mResultWriter; // sink for timing results, shared with the caller
  benchmarkOpts mOptions;                      // copy of the user configuration
};

} // namespace benchmark
} // namespace o2
#endif
180 changes: 180 additions & 0 deletions GPU/GPUbenchmark/Shared/Utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.
///
/// \file Utils.h
/// \author: mconcas@cern.ch

#ifndef GPU_BENCHMARK_UTILS_H
#define GPU_BENCHMARK_UTILS_H

#include <iostream>
#include <iomanip>
#include <typeinfo>
#include <boost/program_options.hpp>
#include <vector>
#include <TTree.h>
#include <TFile.h>

#define KNRM "\x1B[0m"
#define KRED "\x1B[31m"
#define KGRN "\x1B[32m"
#define KYEL "\x1B[33m"
#define KBLU "\x1B[34m"
#define KMAG "\x1B[35m"
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"

#define GB (1024 * 1024 * 1024)

namespace o2
{
namespace benchmark
{

// Granularity at which a test splits its work across the device
// (exact kernel-side meaning defined in the .cu implementation — TODO confirm).
enum class SplitLevel {
Blocks,  // split at block granularity
Threads  // split at thread granularity
};

// User-configurable options for a benchmark run (presumably populated from the
// CLI in benchmark.cxx — verify against the caller).
struct benchmarkOpts {
benchmarkOpts() = default;

float chunkReservedGB = 1.f; // size of each scratch chunk/partition, in GB
int nRegions = 2; // number of memory regions for the *Concurrent tests — TODO confirm
float freeMemoryFractionToAllocate = 0.95f; // fraction of free device memory to reserve as scratch — TODO confirm
int kernelLaunches = 1; // kernel launches per measurement
int nTests = 1; // number of repetitions of each test
};

// Mutable state of one benchmark run: scratch-buffer geometry, host mirrors of
// device buffers and device query results. T is the element ("chunk") type.
template <class T>
struct gpuState {
  // Number of whole chunks of chunkReservedGB GB each that fit in the scratch buffer.
  int getMaxChunks()
  {
    // Explicit casts: the previous C-style (double) cast hid the int truncation.
    return static_cast<int>(static_cast<double>(scratchSize) / (chunkReservedGB * GB));
  }

  // Fill partAddrOnHost with the host-visible start address of each chunk.
  void computeScratchPtrs()
  {
    // Hoisted out of the loop: fixes the signed/unsigned comparison against the
    // int return of getMaxChunks() and avoids re-evaluating it every iteration.
    const auto nChunks = static_cast<size_t>(getMaxChunks());
    partAddrOnHost.resize(nChunks);
    for (size_t iBuffAddress{0}; iBuffAddress < nChunks; ++iBuffAddress) {
      partAddrOnHost[iBuffAddress] = reinterpret_cast<T*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * iBuffAddress);
    }
  }

  // Number of elements of type T that fit in a single chunk.
  size_t getPartitionCapacity()
  {
    return static_cast<size_t>(GB * chunkReservedGB / sizeof(T));
  }

  // Copy of the chunk start addresses (valid after computeScratchPtrs()).
  std::vector<T*> getScratchPtrs()
  {
    return partAddrOnHost;
  }

  std::vector<std::vector<T>>& getHostBuffers()
  {
    return gpuBuffersHost;
  }

  int getNKernelLaunches() { return iterations; }

  // Configuration
  size_t nMaxThreadsPerDimension;
  int iterations;

  float chunkReservedGB; // Size of each partition (GB)

  // General containers and state
  T* scratchPtr;                              // Pointer to scratch buffer
  size_t scratchSize;                         // Size of scratch area (B)
  std::vector<T*> partAddrOnHost;             // Pointers to scratch partitions on host vector
  std::vector<std::vector<T>> gpuBuffersHost; // Host-based vector-ized data
  T* deviceReadResultsPtr;                    // Results of the read test (single variable) on GPU
  std::vector<T> hostReadResultsVector;       // Results of the read test (single variable) on host
  T* deviceWriteResultsPtr;                   // Results of the write test (single variable) on GPU
  std::vector<T> hostWriteResultsVector;      // Results of the write test (single variable) on host
  T* deviceCopyInputsPtr;                     // Inputs of the copy test (single variable) on GPU
  std::vector<T> hostCopyInputsVector;        // Inputs of the copy test (single variable) on host

  // Static info (device properties)
  size_t totalMemory;
  size_t nMultiprocessors;
  size_t nMaxThreadsPerBlock;
};

// Interface class to stream results to root file
class ResultWriter
{
public:
explicit ResultWriter(const std::string resultsTreeFilename = "benchmark_results.root");
~ResultWriter() = default;
void storeBenchmarkEntry(int chunk, float entry);
void storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry);
void addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks);
void snapshotBenchmark();
void saveToFile();

private:
std::vector<float> mBenchmarkResults;
std::vector<TTree*> mBenchmarkTrees;
TFile* mOutfile;
};

// Open (mode "recreate": overwrite any existing file) the ROOT file that will
// hold the benchmark result trees.
inline ResultWriter::ResultWriter(const std::string resultsTreeFilename)
{
  const char* path = resultsTreeFilename.data();
  mOutfile = TFile::Open(path, "recreate");
}

// Begin a new benchmark: create the "<bName>_<type>" tree and bind its
// "elapsed" branch to a results buffer resized to nChunks (zero-filled).
inline void ResultWriter::addBenchmarkEntry(const std::string bName, const std::string type, const int nChunks)
{
  const std::string treeName = bName + "_" + type;
  mBenchmarkTrees.emplace_back(new TTree(treeName.data(), treeName.data()));
  mBenchmarkResults.clear();
  mBenchmarkResults.resize(nChunks);
  mBenchmarkTrees.back()->Branch("elapsed", &mBenchmarkResults);
}

// Record the measurement for the given chunk in the current results buffer.
// NOTE(review): no bounds check — `chunk` must be < the nChunks passed to the
// last addBenchmarkEntry() call, otherwise this is out-of-bounds access.
inline void ResultWriter::storeBenchmarkEntry(int chunk, float entry)
{
mBenchmarkResults[chunk] = entry;
}

// Commit the buffered results as one entry of the most recently added tree.
// NOTE(review): calls back() without an emptiness check — addBenchmarkEntry()
// must have been called at least once first.
inline void ResultWriter::snapshotBenchmark()
{
mBenchmarkTrees.back()->Fill();
}

// Write every benchmark tree to the output file and close it.
inline void ResultWriter::saveToFile()
{
  if (mOutfile == nullptr) {
    // TFile::Open in the constructor can fail (e.g. unwritable path) and
    // return nullptr; bail out instead of dereferencing it.
    std::cerr << "ResultWriter: no output file is open, results not saved" << std::endl;
    return;
  }
  mOutfile->cd();
  for (auto t : mBenchmarkTrees) {
    t->Write();
  }
  mOutfile->Close();
}

// Stub: per-region streaming is not implemented yet (intended behavior sketched
// in the commented prototype below). Parameters are explicitly discarded to
// silence -Wunused-parameter until the implementation lands.
inline void ResultWriter::storeEntryForRegion(std::string benchmarkName, std::string region, std::string type, float entry)
{
  (void)benchmarkName;
  (void)region;
  (void)type;
  (void)entry;
  // (*mTree)
  //   << (benchmarkName + "_" + type + "_region_" + region).data()
  //   << "elapsed=" << entry
  //   << "\n";
}

} // namespace benchmark
} // namespace o2

// Print a red error message, report the test as failed and abort the process.
// Wrapped in do { } while (0) so the macro expands to a single statement:
// the previous unwrapped form broke inside an unbraced `if`/`else` (only the
// first printf was conditional; the remaining statements and exit() always ran,
// and a following `else` would not compile).
#define failed(...)                           \
  do {                                        \
    printf("%serror: ", KRED);                \
    printf(__VA_ARGS__);                      \
    printf("\n");                             \
    printf("error: TEST FAILED\n%s", KNRM);   \
    exit(EXIT_FAILURE);                       \
  } while (0)
#endif
Loading