Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 9 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ option(ENABLE_RAPIDJSON "Enable rapid-json usage" OFF)
option(ENABLE_CNPY "Enable cnpy usage" OFF)
option(ENABLE_CUSOLVERMP "Enable cusolvermp" OFF)

# Seed NVHPC_ROOT_DIR from the NVHPC_ROOT environment variable, but only when
# the user has not already supplied it (on the command line or in the cache).
if(DEFINED ENV{NVHPC_ROOT} AND NOT DEFINED NVHPC_ROOT_DIR)
  set(NVHPC_ROOT_DIR "$ENV{NVHPC_ROOT}"
      CACHE PATH "Path to NVIDIA HPC SDK root directory.")
endif()

# enable json support
if(ENABLE_RAPIDJSON)
find_package(RapidJSON)
Expand Down Expand Up @@ -397,21 +403,9 @@ if(USE_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}" CACHE STRING "CUDA flags" FORCE)
endif()
if (ENABLE_CUSOLVERMP)
add_compile_definitions(__CUSOLVERMP)
find_library(CAL_LIBRARY
NAMES cal
PATHS ${CAL_CUSOLVERMP_PATH}
NO_DEFAULT_PATH
)
find_library(CUSOLVERMP_LIBRARY
NAMES cusolverMp
PATHS ${CAL_CUSOLVERMP_PATH}
NO_DEFAULT_PATH
)
target_link_libraries(${ABACUS_BIN_NAME}
${CAL_LIBRARY}
${CUSOLVERMP_LIBRARY}
)
# Keep cuSolverMp discovery/linking logic in a dedicated module.
include(cmake/SetupCuSolverMp.cmake)
abacus_setup_cusolvermp(${ABACUS_BIN_NAME})
endif()
endif()
endif()
Expand Down
127 changes: 127 additions & 0 deletions cmake/SetupCuSolverMp.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# =============================================================================
# Configure cuSolverMp dependencies and linking for ABACUS
# =============================================================================

include_guard(GLOBAL)

# -----------------------------------------------------------------------------
# abacus_setup_cusolvermp(<target>)
#
# Locates cuSolverMp and a communicator backend, then links both to <target>.
# Backend selection is driven by the cuSolverMp version found in cusolverMp.h:
#   * cuSolverMp <  0.7.0 -> CAL  (defines __USE_CAL)
#   * cuSolverMp >= 0.7.0 -> NCCL (also the fallback when the version is
#                                  undetectable)
# Search locations come from the cache/regular variables CAL_CUSOLVERMP_PATH,
# NVHPC_ROOT_DIR, NCCL_PATH and CUDA_TOOLKIT_ROOT_DIR.
# Fails with FATAL_ERROR when a required library or header is missing.
# -----------------------------------------------------------------------------
function(abacus_setup_cusolvermp target_name)
  # __CUSOLVERMP gates cuSolverMp code paths in headers that several targets
  # compile, so keep the directory-scoped definition (it propagates to
  # subdirectories) instead of narrowing it to ${target_name} only.
  add_compile_definitions(__CUSOLVERMP)

  # --- Locate the cuSolverMp library and header ------------------------------
  find_library(CUSOLVERMP_LIBRARY NAMES cusolverMp
    HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR}
    PATH_SUFFIXES lib lib64 math_libs/lib math_libs/lib64)

  find_path(CUSOLVERMP_INCLUDE_DIR NAMES cusolverMp.h
    HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR}
    PATH_SUFFIXES include math_libs/include)

  if(NOT CUSOLVERMP_LIBRARY OR NOT CUSOLVERMP_INCLUDE_DIR)
    # Name the variables this function actually reads; the earlier message
    # referred to CUSOLVERMP_PATH, which is never consulted.
    message(FATAL_ERROR
      "cusolverMp not found. Set CAL_CUSOLVERMP_PATH or NVHPC_ROOT_DIR.")
  endif()

  message(STATUS "Found cusolverMp: ${CUSOLVERMP_LIBRARY}")

  # --- Detect the cuSolverMp version from its public header ------------------
  # Parses CUSOLVERMP_VER_{MAJOR,MINOR,PATCH}; leaves CUSOLVERMP_VERSION_STR
  # empty when any component is missing.
  set(CUSOLVERMP_VERSION_STR "")
  set(_cusolvermp_header "${CUSOLVERMP_INCLUDE_DIR}/cusolverMp.h")
  if(EXISTS "${_cusolvermp_header}")
    foreach(_component MAJOR MINOR PATCH)
      file(STRINGS "${_cusolvermp_header}" _ver_line
        REGEX "^#define[ \t]+CUSOLVERMP_VER_${_component}[ \t]+[0-9]+")
      string(REGEX MATCH "[0-9]+" CUSOLVERMP_VER_${_component} "${_ver_line}")
    endforeach()
    if(NOT "${CUSOLVERMP_VER_MAJOR}" STREQUAL ""
        AND NOT "${CUSOLVERMP_VER_MINOR}" STREQUAL ""
        AND NOT "${CUSOLVERMP_VER_PATCH}" STREQUAL "")
      set(CUSOLVERMP_VERSION_STR
        "${CUSOLVERMP_VER_MAJOR}.${CUSOLVERMP_VER_MINOR}.${CUSOLVERMP_VER_PATCH}")
    endif()
  endif()

  # --- Choose the communicator backend ---------------------------------------
  set(_use_cal OFF)
  if(CUSOLVERMP_VERSION_STR AND CUSOLVERMP_VERSION_STR VERSION_LESS "0.7.0")
    set(_use_cal ON)
    message(STATUS
      "Detected cuSolverMp ${CUSOLVERMP_VERSION_STR} (< 0.7.0). Using CAL backend.")
  elseif(CUSOLVERMP_VERSION_STR)
    message(STATUS
      "Detected cuSolverMp ${CUSOLVERMP_VERSION_STR} (>= 0.7.0). Using NCCL backend.")
  else()
    message(WARNING
      "Unable to detect cuSolverMp version from header. Using NCCL backend by default.")
  endif()

  if(_use_cal)
    # __USE_CAL selects the CAL code paths in device_check.h and the
    # diag_cusolvermp sources; directory scope for the same reason as above.
    add_compile_definitions(__USE_CAL)

    # Same search roots and suffixes as the cusolverMp lookup, so both
    # libraries are discovered consistently from one SDK layout.
    find_library(CAL_LIBRARY NAMES cal
      HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR}
      PATH_SUFFIXES lib lib64 math_libs/lib math_libs/lib64)
    find_path(CAL_INCLUDE_DIR NAMES cal.h
      HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR}
      PATH_SUFFIXES include math_libs/include)

    if(NOT CAL_LIBRARY OR NOT CAL_INCLUDE_DIR)
      message(FATAL_ERROR
        "CAL not found. Set CAL_CUSOLVERMP_PATH or NVHPC_ROOT_DIR.")
    endif()

    message(STATUS "Found CAL: ${CAL_LIBRARY}")
    if(NOT TARGET CAL::CAL)
      add_library(CAL::CAL INTERFACE IMPORTED)
      set_target_properties(CAL::CAL PROPERTIES
        INTERFACE_LINK_LIBRARIES "${CAL_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${CAL_INCLUDE_DIR}")
    endif()
    set(_comm_target CAL::CAL)
  else()
    # NCCL may ship with NVHPC (comm_libs), standalone (NCCL_PATH), or inside
    # the CUDA toolkit; search the same roots for library and header.
    find_library(NCCL_LIBRARY NAMES nccl
      HINTS ${NCCL_PATH} ${NVHPC_ROOT_DIR}
      PATHS ${CUDA_TOOLKIT_ROOT_DIR}
      PATH_SUFFIXES lib lib64 comm_libs/nccl/lib)
    find_path(NCCL_INCLUDE_DIR NAMES nccl.h
      HINTS ${NCCL_PATH} ${NVHPC_ROOT_DIR}
      PATHS ${CUDA_TOOLKIT_ROOT_DIR}
      PATH_SUFFIXES include comm_libs/nccl/include)

    if(NOT NCCL_LIBRARY OR NOT NCCL_INCLUDE_DIR)
      message(FATAL_ERROR "NCCL not found. Set NCCL_PATH or NVHPC_ROOT_DIR.")
    endif()

    message(STATUS "Found NCCL: ${NCCL_LIBRARY}")
    if(NOT TARGET NCCL::NCCL)
      add_library(NCCL::NCCL INTERFACE IMPORTED)
      set_target_properties(NCCL::NCCL PROPERTIES
        INTERFACE_LINK_LIBRARIES "${NCCL_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}")
    endif()
    set(_comm_target NCCL::NCCL)
  endif()

  # Imported target carrying the cuSolverMp library and include directory.
  if(NOT TARGET cusolverMp::cusolverMp)
    add_library(cusolverMp::cusolverMp INTERFACE IMPORTED)
    set_target_properties(cusolverMp::cusolverMp PROPERTIES
      INTERFACE_LINK_LIBRARIES "${CUSOLVERMP_LIBRARY}"
      INTERFACE_INCLUDE_DIRECTORIES "${CUSOLVERMP_INCLUDE_DIR}")
  endif()

  # Plain (keyword-less) signature on purpose: the project links this target
  # with the plain signature elsewhere, and CMake forbids mixing it with the
  # PRIVATE/PUBLIC/INTERFACE form on the same target.
  target_link_libraries(${target_name}
    ${_comm_target}
    cusolverMp::cusolverMp)
endfunction()
18 changes: 18 additions & 0 deletions source/source_base/module_device/device_check.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,9 @@ static const char* _cufftGetErrorString(cufftResult_t error)
#ifdef __CUSOLVERMP
#include <cusolverMp.h>

#ifdef __USE_CAL
#include <cal.h>

static const char* _calGetErrorString(calError_t error)
{
switch (error)
Expand Down Expand Up @@ -259,6 +262,21 @@ static const char* _calGetErrorString(calError_t error)
exit(EXIT_FAILURE); \
} \
} while (0)
#else // !__USE_CAL (use NCCL)
#include <nccl.h>

// Abort with file/line context when an NCCL call fails. NCCL counterpart of
// the CHECK_CAL macro defined above for the CAL backend; wrapped in
// do { ... } while (0) so it behaves as a single statement at call sites.
#define CHECK_NCCL(func) \
do \
{ \
ncclResult_t status = (func); \
if (status != ncclSuccess) \
{ \
fprintf(stderr, "In File %s : NCCL API failed at line %d with error: %s (%d)\n", __FILE__, __LINE__, \
ncclGetErrorString(status), status); \
exit(EXIT_FAILURE); \
} \
} while (0)
#endif // __USE_CAL

#endif // __CUSOLVERMP

Expand Down
45 changes: 38 additions & 7 deletions source/source_hsolver/kernels/cuda/diag_cusolvermp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ extern "C"
#include <cstdint>
#include "source_base/global_function.h"
#include "source_base/module_device/device.h"
#include "source_base/module_device/device_check.h"

#ifdef __USE_CAL
// ============================================================================
// CAL callback functions for MPI communication
// ============================================================================

static calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request)
{
MPI_Request req;
Expand Down Expand Up @@ -43,6 +50,7 @@ static calError_t request_free(void* request)
{
return CAL_OK;
}
#endif // __USE_CAL

template <typename inputT>
Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
Expand Down Expand Up @@ -73,7 +81,8 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
int local_device_id = base_device::DeviceContext::instance().get_device_id();
Cblacs_gridinfo(this->cblacs_ctxt, &this->nprows, &this->npcols, &this->myprow, &this->mypcol);

this->cusolverCalComm = NULL;
#ifdef __USE_CAL
// Initialize CAL communicator
cal_comm_create_params_t params;
params.allgather = allgather;
params.req_test = request_test;
Expand All @@ -82,8 +91,16 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
params.rank = this->globalMpiRank;
params.nranks = this->globalMpiSize;
params.local_device = local_device_id;

CHECK_CAL(cal_comm_create(params, &this->cusolverCalComm));
#else
// Initialize NCCL communicator
ncclUniqueId ncclId;
if (this->globalMpiRank == 0) {
CHECK_NCCL(ncclGetUniqueId(&ncclId));
}
MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
CHECK_NCCL(ncclCommInitRank(&this->ncclComm, this->globalMpiSize, ncclId, this->globalMpiRank));
#endif

CHECK_CUDA(cudaStreamCreate(&this->localStream));
CHECK_CUSOLVER(cusolverMpCreate(&cusolverMpHandle, local_device_id, this->localStream));
Expand Down Expand Up @@ -116,7 +133,11 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
// Use ROW_MAJOR to match BLACS grid initialization (order='R' in parallel_2d.cpp)
CHECK_CUSOLVER(cusolverMpCreateDeviceGrid(cusolverMpHandle,
&this->grid,
#ifdef __USE_CAL
this->cusolverCalComm,
#else
this->ncclComm,
#endif
this->nprows,
this->npcols,
CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
Expand All @@ -140,12 +161,22 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
// Destructor: release the cuSolverMp resources acquired in the constructor.
// In both branches the matrix descriptor and grid are destroyed before the
// cusolverMp handle they belong to, the communicator is destroyed after the
// cuSolverMp objects, and the CUDA stream is destroyed last.
template <typename inputT>
Diag_CusolverMP_gvd<inputT>::~Diag_CusolverMP_gvd()
{
#ifdef __USE_CAL
// CAL backend: barrier + stream sync before teardown, so no communicator
// work is in flight when cal_comm_destroy runs.
CHECK_CAL(cal_comm_barrier(this->cusolverCalComm, this->localStream));
CHECK_CAL(cal_stream_sync(this->cusolverCalComm, this->localStream));
CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(this->desc_for_cusolvermp));
CHECK_CUSOLVER(cusolverMpDestroyGrid(this->grid));
CHECK_CUSOLVER(cusolverMpDestroy(this->cusolverMpHandle));
CHECK_CAL(cal_comm_destroy(this->cusolverCalComm));
CHECK_CUDA(cudaStreamDestroy(this->localStream));
#else
// NCCL backend: a plain stream synchronize stands in for the CAL
// barrier/sync pair; otherwise the teardown order mirrors the CAL branch.
CHECK_CUDA(cudaStreamSynchronize(this->localStream));
CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(this->desc_for_cusolvermp));
CHECK_CUSOLVER(cusolverMpDestroyGrid(this->grid));
CHECK_CUSOLVER(cusolverMpDestroy(this->cusolverMpHandle));
CHECK_NCCL(ncclCommDestroy(this->ncclComm));
CHECK_CUDA(cudaStreamDestroy(this->localStream));
#endif
}


Expand All @@ -166,7 +197,7 @@ int Diag_CusolverMP_gvd<inputT>::generalized_eigenvector(inputT* A, inputT* B, o
cudaMemcpy(d_A, (void*)A, this->n_local * this->m_local * sizeof(inputT), cudaMemcpyHostToDevice));
CHECK_CUDA(
cudaMemcpy(d_B, (void*)B, this->n_local * this->m_local * sizeof(inputT), cudaMemcpyHostToDevice));
CHECK_CAL(cal_stream_sync(this->cusolverCalComm, this->localStream));
CHECK_CUDA(cudaStreamSynchronize(this->localStream));

size_t sygvdWorkspaceInBytesOnDevice = 0;
size_t sygvdWorkspaceInBytesOnHost = 0;
Expand Down Expand Up @@ -203,7 +234,7 @@ int Diag_CusolverMP_gvd<inputT>::generalized_eigenvector(inputT* A, inputT* B, o
CHECK_CUDA(cudaMemset(d_sygvdInfo, 0, sizeof(int)));

/* sync wait for data to arrive to device */
CHECK_CAL(cal_stream_sync(cusolverCalComm, localStream));
CHECK_CUDA(cudaStreamSynchronize(this->localStream));

CHECK_CUSOLVER(cusolverMpSygvd(cusolverMpHandle,
CUSOLVER_EIG_TYPE_1,
Expand Down Expand Up @@ -238,7 +269,7 @@ int Diag_CusolverMP_gvd<inputT>::generalized_eigenvector(inputT* A, inputT* B, o
{
ModuleBase::WARNING_QUIT("cusolvermp", "cusolverMpSygvd failed with error");
}
CHECK_CAL(cal_stream_sync(this->cusolverCalComm, this->localStream));
CHECK_CUDA(cudaStreamSynchronize(this->localStream));

CHECK_CUDA(cudaFree(d_sygvdWork));
CHECK_CUDA(cudaFree(d_sygvdInfo));
Expand All @@ -254,7 +285,7 @@ int Diag_CusolverMP_gvd<inputT>::generalized_eigenvector(inputT* A, inputT* B, o
// I move the free operations from destructor to here.
// Because I think it is more reasonable to free the memory in the function where it is allocated.
// Destructor is used to release resources that allocated in the constructor.
// And currently, we construct and destruct the object in every SCF iteration. Maybe one day we
// And currently, we construct and destruct the object in every SCF iteration. Maybe one day we
// will construct the object only once during the whole program life cycle.
// In that case, allocate and free memory in compute function is more reasonable.
CHECK_CUDA(cudaFree(d_A));
Expand Down Expand Up @@ -283,4 +314,4 @@ void Diag_CusolverMP_gvd<inputT>::outputParameters()

template class Diag_CusolverMP_gvd<double>;
template class Diag_CusolverMP_gvd<std::complex<double>>;
#endif
#endif
14 changes: 10 additions & 4 deletions source/source_hsolver/kernels/cuda/diag_cusolvermp.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@
#include <complex>
#include <fstream>
#include <vector>
#include <cal.h>
#include <cusolverMp.h>
#include "source_base/macros.h"

#ifdef __USE_CAL
#include <cal.h>
#else
#include <nccl.h>
#endif

template<typename inputT>
class Diag_CusolverMP_gvd
{
Expand Down Expand Up @@ -53,7 +58,11 @@ class Diag_CusolverMP_gvd
int globalMpiSize;
cudaDataType_t datatype;

#ifdef __USE_CAL
cal_comm_t cusolverCalComm = NULL;
#else
ncclComm_t ncclComm = NULL;
#endif
cudaStream_t localStream = NULL;
cusolverMpHandle_t cusolverMpHandle = NULL;
cusolverMpGrid_t grid = NULL;
Expand All @@ -64,6 +73,3 @@ class Diag_CusolverMP_gvd
int64_t matrix_i;
int64_t matrix_j;
};

// Member functions of the template class are implemented in diag_cusolvermp.cu

Loading
Loading