diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f8fc6fe0e..703fda65a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,12 @@ option(ENABLE_RAPIDJSON "Enable rapid-json usage" OFF) option(ENABLE_CNPY "Enable cnpy usage" OFF) option(ENABLE_CUSOLVERMP "Enable cusolvermp" OFF) +if(NOT DEFINED NVHPC_ROOT_DIR AND DEFINED ENV{NVHPC_ROOT}) + set(NVHPC_ROOT_DIR + "$ENV{NVHPC_ROOT}" + CACHE PATH "Path to NVIDIA HPC SDK root directory.") +endif() + # enable json support if(ENABLE_RAPIDJSON) find_package(RapidJSON) @@ -397,21 +403,9 @@ if(USE_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}" CACHE STRING "CUDA flags" FORCE) endif() if (ENABLE_CUSOLVERMP) - add_compile_definitions(__CUSOLVERMP) - find_library(CAL_LIBRARY - NAMES cal - PATHS ${CAL_CUSOLVERMP_PATH} - NO_DEFAULT_PATH - ) - find_library(CUSOLVERMP_LIBRARY - NAMES cusolverMp - PATHS ${CAL_CUSOLVERMP_PATH} - NO_DEFAULT_PATH - ) - target_link_libraries(${ABACUS_BIN_NAME} - ${CAL_LIBRARY} - ${CUSOLVERMP_LIBRARY} - ) + # Keep cuSolverMp discovery/linking logic in a dedicated module. + include(cmake/SetupCuSolverMp.cmake) + abacus_setup_cusolvermp(${ABACUS_BIN_NAME}) endif() endif() endif() diff --git a/cmake/SetupCuSolverMp.cmake b/cmake/SetupCuSolverMp.cmake new file mode 100644 index 0000000000..f0505c77c0 --- /dev/null +++ b/cmake/SetupCuSolverMp.cmake @@ -0,0 +1,127 @@ +# ============================================================================= +# Configure cuSolverMp dependencies and linking for ABACUS +# ============================================================================= + +include_guard(GLOBAL) + +function(abacus_setup_cusolvermp target_name) + add_compile_definitions(__CUSOLVERMP) + + # Find cuSolverMp first, then decide communicator backend. + find_library(CUSOLVERMP_LIBRARY NAMES cusolverMp + HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR} + PATH_SUFFIXES lib lib64 math_libs/lib math_libs/lib64) + + find_path(CUSOLVERMP_INCLUDE_DIR NAMES cusolverMp.h + HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR} + PATH_SUFFIXES include math_libs/include) + + if(NOT CUSOLVERMP_LIBRARY OR NOT CUSOLVERMP_INCLUDE_DIR) + message(FATAL_ERROR + "cusolverMp not found. Set CUSOLVERMP_PATH or NVHPC_ROOT_DIR." + ) + endif() + + message(STATUS "Found cusolverMp: ${CUSOLVERMP_LIBRARY}") + + set(CUSOLVERMP_VERSION_STR "") + set(CUSOLVERMP_VERSION_HEADER "${CUSOLVERMP_INCLUDE_DIR}/cusolverMp.h") + if(EXISTS "${CUSOLVERMP_VERSION_HEADER}") + file(STRINGS "${CUSOLVERMP_VERSION_HEADER}" CUSOLVERMP_MAJOR_LINE + REGEX "^#define[ \t]+CUSOLVERMP_VER_MAJOR[ \t]+[0-9]+") + file(STRINGS "${CUSOLVERMP_VERSION_HEADER}" CUSOLVERMP_MINOR_LINE + REGEX "^#define[ \t]+CUSOLVERMP_VER_MINOR[ \t]+[0-9]+") + file(STRINGS "${CUSOLVERMP_VERSION_HEADER}" CUSOLVERMP_PATCH_LINE + REGEX "^#define[ \t]+CUSOLVERMP_VER_PATCH[ \t]+[0-9]+") + string(REGEX MATCH "([0-9]+)" CUSOLVERMP_VER_MAJOR "${CUSOLVERMP_MAJOR_LINE}") + string(REGEX MATCH "([0-9]+)" CUSOLVERMP_VER_MINOR "${CUSOLVERMP_MINOR_LINE}") + string(REGEX MATCH "([0-9]+)" CUSOLVERMP_VER_PATCH "${CUSOLVERMP_PATCH_LINE}") + if(NOT CUSOLVERMP_VER_MAJOR STREQUAL "" + AND NOT CUSOLVERMP_VER_MINOR STREQUAL "" + AND NOT CUSOLVERMP_VER_PATCH STREQUAL "") + set(CUSOLVERMP_VERSION_STR + "${CUSOLVERMP_VER_MAJOR}.${CUSOLVERMP_VER_MINOR}.${CUSOLVERMP_VER_PATCH}") + endif() + endif() + + # Auto-select communicator backend by cuSolverMp version. + # cuSolverMp < 0.7.0 -> CAL, otherwise -> NCCL. + set(_use_cal OFF) + if(CUSOLVERMP_VERSION_STR AND CUSOLVERMP_VERSION_STR VERSION_LESS "0.7.0") + set(_use_cal ON) + message(STATUS + "Detected cuSolverMp ${CUSOLVERMP_VERSION_STR} (< 0.7.0). Using CAL backend.") + elseif(CUSOLVERMP_VERSION_STR) + message(STATUS + "Detected cuSolverMp ${CUSOLVERMP_VERSION_STR} (>= 0.7.0). Using NCCL backend.") + elseif(NOT CUSOLVERMP_VERSION_STR) + message(WARNING + "Unable to detect cuSolverMp version from header. Using NCCL backend by default.") + endif() + + # Backend selection: + # - _use_cal=ON -> cal communicator backend + # - _use_cal=OFF -> NCCL communicator backend + if(_use_cal) + add_compile_definitions(__USE_CAL) + + find_library(CAL_LIBRARY NAMES cal + HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR} + PATH_SUFFIXES lib lib64 math_libs/lib64) + find_path(CAL_INCLUDE_DIR NAMES cal.h + HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR} + PATH_SUFFIXES include math_libs/include) + + if(NOT CAL_LIBRARY OR NOT CAL_INCLUDE_DIR) + message(FATAL_ERROR "CAL not found. Set CAL_PATH or NVHPC_ROOT_DIR.") + endif() + + message(STATUS "Found CAL: ${CAL_LIBRARY}") + if(NOT TARGET CAL::CAL) + add_library(CAL::CAL IMPORTED INTERFACE) + set_target_properties(CAL::CAL PROPERTIES + INTERFACE_LINK_LIBRARIES "${CAL_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${CAL_INCLUDE_DIR}") + endif() + else() + + find_library(NCCL_LIBRARY NAMES nccl + HINTS ${NCCL_PATH} ${NVHPC_ROOT_DIR} + PATH_SUFFIXES lib lib64 comm_libs/nccl/lib) + find_path(NCCL_INCLUDE_DIR NAMES nccl.h + HINTS ${NCCL_PATH} ${NVHPC_ROOT_DIR} + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES include comm_libs/nccl/include) + + if(NOT NCCL_LIBRARY OR NOT NCCL_INCLUDE_DIR) + message(FATAL_ERROR "NCCL not found. Set NCCL_PATH or NVHPC_ROOT_DIR.") + endif() + + message(STATUS "Found NCCL: ${NCCL_LIBRARY}") + if(NOT TARGET NCCL::NCCL) + add_library(NCCL::NCCL IMPORTED INTERFACE) + set_target_properties(NCCL::NCCL PROPERTIES + INTERFACE_LINK_LIBRARIES "${NCCL_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}") + endif() + endif() + + # Create cusolverMp::cusolverMp imported target + if(NOT TARGET cusolverMp::cusolverMp) + add_library(cusolverMp::cusolverMp IMPORTED INTERFACE) + set_target_properties(cusolverMp::cusolverMp PROPERTIES + INTERFACE_LINK_LIBRARIES "${CUSOLVERMP_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${CUSOLVERMP_INCLUDE_DIR}") + endif() + + # === Link libraries === + if(_use_cal) + target_link_libraries(${target_name} + CAL::CAL + cusolverMp::cusolverMp) + else() + target_link_libraries(${target_name} + NCCL::NCCL + cusolverMp::cusolverMp) + endif() +endfunction() diff --git a/source/source_base/module_device/device_check.h b/source/source_base/module_device/device_check.h index f0e55dbb49..f649676001 100644 --- a/source/source_base/module_device/device_check.h +++ b/source/source_base/module_device/device_check.h @@ -223,6 +223,9 @@ static const char* _cufftGetErrorString(cufftResult_t error) #ifdef __CUSOLVERMP #include +#ifdef __USE_CAL +#include + static const char* _calGetErrorString(calError_t error) { switch (error) @@ -259,6 +262,21 @@ static const char* _calGetErrorString(calError_t error) exit(EXIT_FAILURE); \ } \ } while (0) +#else // !__USE_CAL (use NCCL) +#include + +#define CHECK_NCCL(func) \ + do \ + { \ + ncclResult_t status = (func); \ + if (status != ncclSuccess) \ + { \ + fprintf(stderr, "In File %s : NCCL API failed at line %d with error: %s (%d)\n", __FILE__, __LINE__, \ + ncclGetErrorString(status), status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif // __USE_CAL #endif // __CUSOLVERMP diff --git a/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu b/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu index 128e179e1a..c53139897f 100644 --- a/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu +++ b/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu @@ -13,6 +13,13 @@ extern "C" #include #include "source_base/global_function.h" #include "source_base/module_device/device.h" +#include "source_base/module_device/device_check.h" + +#ifdef __USE_CAL +// ============================================================================ +// CAL callback functions for MPI communication +// ============================================================================ + static calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) { MPI_Request req; @@ -43,6 +50,7 @@ static calError_t request_free(void* request) { return CAL_OK; } +#endif // __USE_CAL template Diag_CusolverMP_gvd::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm, @@ -73,7 +81,8 @@ Diag_CusolverMP_gvd::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm, int local_device_id = base_device::DeviceContext::instance().get_device_id(); Cblacs_gridinfo(this->cblacs_ctxt, &this->nprows, &this->npcols, &this->myprow, &this->mypcol); - this->cusolverCalComm = NULL; +#ifdef __USE_CAL + // Initialize CAL communicator cal_comm_create_params_t params; params.allgather = allgather; params.req_test = request_test; @@ -82,8 +91,16 @@ Diag_CusolverMP_gvd::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm, params.rank = this->globalMpiRank; params.nranks = this->globalMpiSize; params.local_device = local_device_id; - CHECK_CAL(cal_comm_create(params, &this->cusolverCalComm)); +#else + // Initialize NCCL communicator + ncclUniqueId ncclId; + if (this->globalMpiRank == 0) { + CHECK_NCCL(ncclGetUniqueId(&ncclId)); + } + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm); + CHECK_NCCL(ncclCommInitRank(&this->ncclComm, this->globalMpiSize, ncclId, this->globalMpiRank)); +#endif CHECK_CUDA(cudaStreamCreate(&this->localStream)); CHECK_CUSOLVER(cusolverMpCreate(&cusolverMpHandle, local_device_id, this->localStream)); @@ -116,7 +133,11 @@ Diag_CusolverMP_gvd::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm, // Use ROW_MAJOR to match BLACS grid initialization (order='R' in parallel_2d.cpp) CHECK_CUSOLVER(cusolverMpCreateDeviceGrid(cusolverMpHandle, &this->grid, +#ifdef __USE_CAL this->cusolverCalComm, +#else + this->ncclComm, +#endif this->nprows, this->npcols, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR)); @@ -140,12 +161,22 @@ Diag_CusolverMP_gvd::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm, template Diag_CusolverMP_gvd::~Diag_CusolverMP_gvd() { +#ifdef __USE_CAL CHECK_CAL(cal_comm_barrier(this->cusolverCalComm, this->localStream)); + CHECK_CAL(cal_stream_sync(this->cusolverCalComm, this->localStream)); CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(this->desc_for_cusolvermp)); CHECK_CUSOLVER(cusolverMpDestroyGrid(this->grid)); CHECK_CUSOLVER(cusolverMpDestroy(this->cusolverMpHandle)); CHECK_CAL(cal_comm_destroy(this->cusolverCalComm)); CHECK_CUDA(cudaStreamDestroy(this->localStream)); +#else + CHECK_CUDA(cudaStreamSynchronize(this->localStream)); + CHECK_CUSOLVER(cusolverMpDestroyMatrixDesc(this->desc_for_cusolvermp)); + CHECK_CUSOLVER(cusolverMpDestroyGrid(this->grid)); + CHECK_CUSOLVER(cusolverMpDestroy(this->cusolverMpHandle)); + CHECK_NCCL(ncclCommDestroy(this->ncclComm)); + CHECK_CUDA(cudaStreamDestroy(this->localStream)); +#endif } @@ -166,7 +197,7 @@ int Diag_CusolverMP_gvd::generalized_eigenvector(inputT* A, inputT* B, o cudaMemcpy(d_A, (void*)A, this->n_local * this->m_local * sizeof(inputT), cudaMemcpyHostToDevice)); CHECK_CUDA( cudaMemcpy(d_B, (void*)B, this->n_local * this->m_local * sizeof(inputT), cudaMemcpyHostToDevice)); - CHECK_CAL(cal_stream_sync(this->cusolverCalComm, this->localStream)); + CHECK_CUDA(cudaStreamSynchronize(this->localStream)); size_t sygvdWorkspaceInBytesOnDevice = 0; size_t sygvdWorkspaceInBytesOnHost = 0; @@ -203,7 +234,7 @@ int Diag_CusolverMP_gvd::generalized_eigenvector(inputT* A, inputT* B, o CHECK_CUDA(cudaMemset(d_sygvdInfo, 0, sizeof(int))); /* sync wait for data to arrive to device */ - CHECK_CAL(cal_stream_sync(cusolverCalComm, localStream)); + CHECK_CUDA(cudaStreamSynchronize(this->localStream)); CHECK_CUSOLVER(cusolverMpSygvd(cusolverMpHandle, CUSOLVER_EIG_TYPE_1, @@ -238,7 +269,7 @@ int Diag_CusolverMP_gvd::generalized_eigenvector(inputT* A, inputT* B, o { ModuleBase::WARNING_QUIT("cusolvermp", "cusolverMpSygvd failed with error"); } - CHECK_CAL(cal_stream_sync(this->cusolverCalComm, this->localStream)); + CHECK_CUDA(cudaStreamSynchronize(this->localStream)); CHECK_CUDA(cudaFree(d_sygvdWork)); CHECK_CUDA(cudaFree(d_sygvdInfo)); @@ -254,7 +285,7 @@ int Diag_CusolverMP_gvd::generalized_eigenvector(inputT* A, inputT* B, o // I move the free operations from destructor to here. // Because I think it is more reasonable to free the memory in the function where it is allocated. // Destructor is used to release resources that allocated in the constructor. - // And currently, we construct and destruct the object in every SCF iteration. Maybe one day we + // And currently, we construct and destruct the object in every SCF iteration. Maybe one day we // will construct the object only once during the whole program life cycle. // In that case, allocate and free memory in compute function is more reasonable. CHECK_CUDA(cudaFree(d_A)); @@ -283,4 +314,4 @@ void Diag_CusolverMP_gvd::outputParameters() template class Diag_CusolverMP_gvd; template class Diag_CusolverMP_gvd>; -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/kernels/cuda/diag_cusolvermp.cuh b/source/source_hsolver/kernels/cuda/diag_cusolvermp.cuh index 0e330908dc..fe49677447 100644 --- a/source/source_hsolver/kernels/cuda/diag_cusolvermp.cuh +++ b/source/source_hsolver/kernels/cuda/diag_cusolvermp.cuh @@ -4,10 +4,15 @@ #include #include #include -#include #include #include "source_base/macros.h" +#ifdef __USE_CAL +#include +#else +#include +#endif + template class Diag_CusolverMP_gvd { @@ -53,7 +58,11 @@ class Diag_CusolverMP_gvd int globalMpiSize; cudaDataType_t datatype; +#ifdef __USE_CAL cal_comm_t cusolverCalComm = NULL; +#else + ncclComm_t ncclComm = NULL; +#endif cudaStream_t localStream = NULL; cusolverMpHandle_t cusolverMpHandle = NULL; cusolverMpGrid_t grid = NULL; @@ -64,6 +73,3 @@ class Diag_CusolverMP_gvd int64_t matrix_i; int64_t matrix_j; }; - -// 实现模板类的成员函数 - diff --git a/toolchain/README.md b/toolchain/README.md index 8727e7f35d..e6b81038a4 100644 --- a/toolchain/README.md +++ b/toolchain/README.md @@ -292,32 +292,36 @@ cmake -B $BUILD_DIR \ #### Multi-GPU with cuSolverMP -1. **Check or install cuSolverMP manually:** -One may use NVIDIA HPC_SDK as an easy way to install cuSolverMP. +**Option 1: Using NVIDIA HPC SDK (Recommended)** -2. **Install dependencies normally:** +1. Load the NVHPC module: ```bash -./toolchain_gnu.sh +module load nvhpc ``` -3. **Build with cuSolverMP:** +2. Build with cuSolverMP enabled: ```bash cmake -B $BUILD_DIR \ -DUSE_CUDA=ON \ -DENABLE_CUSOLVERMP=ON \ - -DCAL_CUSOLVERMP_PATH=/path/to/math_libs/lib \ # ... other options ``` -3. **Set environment variables:** +The module file will set `NVHPC_ROOT` automatically, and CMake will find cuSolverMP and NCCL(or CAL) from it. + +**Option 2: Manual Installation** + +If you installed cuSolverMP and NCCL separately and not in system default paths, specify the paths explicitly: + ```bash -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/hpcx/ucc/lib -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/hpcx/ucx/lib -export CPATH=$CPATH:/path/to/math_libs/include +cmake -B $BUILD_DIR \ + -DUSE_CUDA=ON \ + -DENABLE_CUSOLVERMP=ON \ + -DCUSOLVERMP_PATH=/path/to/cusolvermp/ \ + -DNCCL_PATH=/path/to/nccl/ \ + # ... other options ``` -**Note**: cuSolverMP requires NVIDIA HPC SDK or system installation via package manager. - ## Troubleshooting ### Common Issues diff --git a/toolchain/build_abacus_aocc-aocl.sh b/toolchain/build_abacus_aocc-aocl.sh index a586442421..ac174db099 100755 --- a/toolchain/build_abacus_aocc-aocl.sh +++ b/toolchain/build_abacus_aocc-aocl.sh @@ -66,7 +66,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \ # -Dlibnpy_INCLUDE_DIR=$LIBNPY \ # -DDeePMD_DIR=$DEEPMD \ # -DENABLE_CUSOLVERMP=ON \ -# -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib cmake --build $BUILD_DIR -j `nproc` cmake --install $BUILD_DIR 2>/dev/null diff --git a/toolchain/build_abacus_gcc-aocl.sh b/toolchain/build_abacus_gcc-aocl.sh index 65e2491ee9..e2515f609b 100755 --- a/toolchain/build_abacus_gcc-aocl.sh +++ b/toolchain/build_abacus_gcc-aocl.sh @@ -64,7 +64,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \ # -Dlibnpy_INCLUDE_DIR=$LIBNPY \ # -DDeePMD_DIR=$DEEPMD \ # -DENABLE_CUSOLVERMP=ON \ -# -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib cmake --build $BUILD_DIR -j `nproc` cmake --install $BUILD_DIR 2>/dev/null diff --git a/toolchain/build_abacus_gnu.sh b/toolchain/build_abacus_gnu.sh index 37f5603f6e..0485738c1e 100755 --- a/toolchain/build_abacus_gnu.sh +++ b/toolchain/build_abacus_gnu.sh @@ -62,7 +62,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \ # -Dlibnpy_INCLUDE_DIR=$LIBNPY \ # -DDeePMD_DIR=$DEEPMD \ # -DENABLE_CUSOLVERMP=ON \ -# -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib cmake --build $BUILD_DIR -j `nproc` cmake --install $BUILD_DIR 2>/dev/null diff --git a/toolchain/build_abacus_intel.sh b/toolchain/build_abacus_intel.sh index c347b623a1..06fcdc5ac8 100755 --- a/toolchain/build_abacus_intel.sh +++ b/toolchain/build_abacus_intel.sh @@ -60,7 +60,6 @@ cmake -B $BUILD_DIR -DCMAKE_INSTALL_PREFIX=$PREFIX \ # -Dlibnpy_INCLUDE_DIR=$LIBNPY \ # -DDeePMD_DIR=$DEEPMD \ # -DENABLE_CUSOLVERMP=ON \ -# -D CAL_CUSOLVERMP_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/2x.xx/math_libs/1x.x/targets/x86_64-linux/lib cmake --build $BUILD_DIR -j `nproc` cmake --install $BUILD_DIR 2>/dev/null