From 6dea7c7d62a31e690e6ad7ace4bf2704203d575a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 07:01:13 +0000 Subject: [PATCH 1/5] Initial plan From 996199b079057803cc7c434acb7fdd9b57d221e7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 07:04:22 +0000 Subject: [PATCH 2/5] Fix CUDA GPU detection when device is set to CPU - Add runtime check for PARAM.inp.device before calling GPU detection functions - Prevent cudaErrcheck from exiting when no GPU is available but device is set to "cpu" - Apply same logic as non-MPI version to MPI version in output_device_info Co-authored-by: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> --- .../source_base/module_device/output_device.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/source/source_base/module_device/output_device.cpp b/source/source_base/module_device/output_device.cpp index 1d0f018814..41b4c6d082 100644 --- a/source/source_base/module_device/output_device.cpp +++ b/source/source_base/module_device/output_device.cpp @@ -115,7 +115,13 @@ void output_device_info(std::ostream &output) int local_rank = get_node_rank_with_mpi_shared(MPI_COMM_WORLD); // Get local hardware info - int local_gpu_count = local_rank == 0 ? get_device_num("gpu") : 0; + int local_gpu_count = 0; + #if defined(__CUDA) || defined(__ROCM) + if(PARAM.inp.device == "gpu" && local_rank == 0) + { + local_gpu_count = get_device_num("gpu"); + } + #endif int local_cpu_sockets = local_rank == 0 ? get_device_num("cpu") : 0; // Prepare vectors to gather data from all ranks @@ -133,7 +139,13 @@ void output_device_info(std::ostream &output) // Get device model names (from rank 0 node) std::string cpu_name = get_device_name("cpu"); - std::string gpu_name = get_device_name("gpu"); + std::string gpu_name; + #if defined(__CUDA) || defined(__ROCM) + if(PARAM.inp.device == "gpu" && total_gpus > 0) + { + gpu_name = get_device_name("gpu"); + } + #endif // Output all collected information output << " RUNNING WITH DEVICE : " << "CPU" << " / " From f4653cb14472f96a738509aa08234ed22ed73938 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 13:03:44 +0000 Subject: [PATCH 3/5] Implement device=auto with safe GPU probe and MPI broadcast - Add probe_gpu_availability() function that safely checks GPU without exiting - Refactor get_device_flag() to support device=auto with clear user feedback - Implement MPI broadcast for consistent device selection across all ranks - Add safety check in get_device_kpar() to prevent GPU calls when unavailable - Add informative messages for device selection (INFO/WARNING) Co-authored-by: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> --- source/source_base/module_device/device.cpp | 146 +++++++++++++------- source/source_base/module_device/device.h | 6 + 2 files changed, 105 insertions(+), 47 deletions(-) diff --git a/source/source_base/module_device/device.cpp b/source/source_base/module_device/device.cpp index 96deae8baf..416988ae5d 100644 --- a/source/source_base/module_device/device.cpp +++ b/source/source_base/module_device/device.cpp @@ -147,58 +147,110 @@ int set_device_by_rank(const MPI_Comm mpi_comm) { #endif -std::string get_device_flag(const std::string &device, - const std::string &basis_type) { -if (device == "cpu") { - return "cpu"; // no extra checks required -} -std::string error_message; -if (device != "auto" and device != "gpu") -{ - error_message += "Parameter \"device\" can only be set to \"cpu\" or \"gpu\"!"; - ModuleBase::WARNING_QUIT("device", error_message); -} - -// Get available GPU count -int device_count = -1; -#if ((defined __CUDA) || (defined __ROCM)) +bool probe_gpu_availability() { #if defined(__CUDA) -cudaGetDeviceCount(&device_count); + int device_count = 0; + // Directly call cudaGetDeviceCount without cudaErrcheck to prevent program exit + cudaError_t error_id = cudaGetDeviceCount(&device_count); + if (error_id == cudaSuccess && device_count > 0) { + return true; + } + return false; #elif defined(__ROCM) -hipGetDeviceCount(&device_count); -/***auto start_time = std::chrono::high_resolution_clock::now(); -std::cout << "Starting hipGetDeviceCount.." << std::endl; -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast>(end_time - start_time); -std::cout << "hipGetDeviceCount took " << duration.count() << "seconds" << std::endl;***/ + int device_count = 0; + hipError_t error_id = hipGetDeviceCount(&device_count); + if (error_id == hipSuccess && device_count > 0) { + return true; + } + return false; +#else + // If not compiled with GPU support, GPU is not available + return false; #endif -if (device_count <= 0) -{ - error_message += "Cannot find GPU on this computer!\n"; } -#else // CPU only -error_message += "ABACUS is built with CPU support only. Please rebuild with GPU support.\n"; + +std::string get_device_flag(const std::string &device, + const std::string &basis_type) { + // 1. Validate input string + if (device != "cpu" && device != "gpu" && device != "auto") { + ModuleBase::WARNING_QUIT("device", "Parameter \"device\" can only be set to \"cpu\", \"gpu\", or \"auto\"!"); + } + + int decision = 0; // 0 for CPU, 1 for GPU + +#ifdef __MPI + int world_rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_rank == 0) { + // Rank 0 makes the decision + if (device == "gpu") { + if (probe_gpu_availability()) { + decision = 1; + std::cout << " INFO: 'device=gpu' specified. GPU will be used." << std::endl; + } else { + ModuleBase::WARNING_QUIT("device", "Device is set to 'gpu', but no available GPU was found. Please check your hardware/drivers or set 'device=cpu'."); + } + } else if (device == "auto") { + if (probe_gpu_availability()) { + decision = 1; + std::cout << " INFO: 'device=auto' specified. GPU detected and will be used." << std::endl; + } else { + decision = 0; + std::cout << " WARNING: 'device=auto' specified, but no GPU was found. Falling back to CPU." << std::endl; + std::cout << " To suppress this warning, please explicitly set 'device=cpu' in your input." << std::endl; + } + } else { // device == "cpu" + decision = 0; + std::cout << " INFO: 'device=cpu' specified. CPU will be used." << std::endl; + } + } + + // Rank 0 broadcasts the final decision to all other ranks + MPI_Bcast(&decision, 1, MPI_INT, 0, MPI_COMM_WORLD); +#else + // Non-MPI case: single process makes the decision + if (device == "gpu") { + if (probe_gpu_availability()) { + decision = 1; + std::cout << " INFO: 'device=gpu' specified. GPU will be used." << std::endl; + } else { + ModuleBase::WARNING_QUIT("device", "Device is set to 'gpu', but no available GPU was found. Please check your hardware/drivers or set 'device=cpu'."); + } + } else if (device == "auto") { + if (probe_gpu_availability()) { + decision = 1; + std::cout << " INFO: 'device=auto' specified. GPU detected and will be used." << std::endl; + } else { + decision = 0; + std::cout << " WARNING: 'device=auto' specified, but no GPU was found. Falling back to CPU." << std::endl; + std::cout << " To suppress this warning, please explicitly set 'device=cpu' in your input." << std::endl; + } + } else { // device == "cpu" + decision = 0; + std::cout << " INFO: 'device=cpu' specified. CPU will be used." << std::endl; + } #endif -if (basis_type == "lcao_in_pw") { - error_message += - "The GPU currently does not support the basis type \"lcao_in_pw\"!"; -} -if(error_message.empty()) -{ - return "gpu"; // possibly automatically set to GPU -} -else if (device == "gpu") -{ - ModuleBase::WARNING_QUIT("device", error_message); -} -else { return "cpu"; -} + // 2. Final check for incompatible basis type + if (decision == 1 && basis_type == "lcao_in_pw") { + ModuleBase::WARNING_QUIT("device", "The GPU currently does not support the basis type \"lcao_in_pw\"!"); + } + + // 3. Return the final decision + return (decision == 1) ? "gpu" : "cpu"; } int get_device_kpar(const int& kpar, const int& bndpar) { #if __MPI && (__CUDA || __ROCM) + // This function should only be called when GPU mode is active + // We use probe_gpu_availability to ensure GPU is actually available + if (!probe_gpu_availability()) { + // If no GPU available, return kpar unchanged + return kpar; + } + int temp_nproc = 0; int new_kpar = kpar; MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc); @@ -213,15 +265,15 @@ int get_device_kpar(const int& kpar, const int& bndpar) int device_num = -1; #if defined(__CUDA) - cudaGetDeviceCount(&device_num); // get the number of GPU devices of current node - cudaSetDevice(node_rank % device_num); // band the CPU processor to the devices + cudaErrcheck(cudaGetDeviceCount(&device_num)); // get the number of GPU devices of current node + cudaErrcheck(cudaSetDevice(node_rank % device_num)); // bind the CPU processor to the devices #elif defined(__ROCM) - hipGetDeviceCount(&device_num); - hipSetDevice(node_rank % device_num); + hipErrcheck(hipGetDeviceCount(&device_num)); + hipErrcheck(hipSetDevice(node_rank % device_num)); #endif - return new_kpar; + return new_kpar; #endif - return kpar; + return kpar; } } // end of namespace information diff --git a/source/source_base/module_device/device.h b/source/source_base/module_device/device.h index 7b8dd0c6ae..f6dfd3f207 100644 --- a/source/source_base/module_device/device.h +++ b/source/source_base/module_device/device.h @@ -44,6 +44,12 @@ void output_device_info(std::ostream& output); */ int get_device_kpar(const int& kpar, const int& bndpar); +/** + * @brief Safely probes for GPU availability without exiting on error. + * @return True if at least one GPU is found and usable, false otherwise. + */ +bool probe_gpu_availability(); + /** * @brief Get the device flag object * for source_io PARAM.inp.device From e12b1cd5c2651e3ed77c8956fcb18000d6247ae9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 13:05:28 +0000 Subject: [PATCH 4/5] Fix MPI synchronization issue in get_device_kpar - Remove redundant probe_gpu_availability check that could cause MPI inconsistency - Function is only called when device=gpu is already confirmed - Add comment clarifying the function should only be called in GPU mode Co-authored-by: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> --- source/source_base/module_device/device.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/source/source_base/module_device/device.cpp b/source/source_base/module_device/device.cpp index 416988ae5d..1ae0eeb00e 100644 --- a/source/source_base/module_device/device.cpp +++ b/source/source_base/module_device/device.cpp @@ -244,13 +244,8 @@ std::string get_device_flag(const std::string &device, int get_device_kpar(const int& kpar, const int& bndpar) { #if __MPI && (__CUDA || __ROCM) - // This function should only be called when GPU mode is active - // We use probe_gpu_availability to ensure GPU is actually available - if (!probe_gpu_availability()) { - // If no GPU available, return kpar unchanged - return kpar; - } - + // This function should only be called when device mode is GPU + // The device decision has already been made by get_device_flag() int temp_nproc = 0; int new_kpar = kpar; MPI_Comm_size(MPI_COMM_WORLD, &temp_nproc); From 8e8331cbae96ac270d21ad7e1c2800061f96b135 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Thu, 4 Dec 2025 22:14:31 +0800 Subject: [PATCH 5/5] =?UTF-8?q?=E6=B3=A8=E9=87=8A=E6=8E=89=20get=5Fdevice?= =?UTF-8?q?=5Fflag=20=E4=B8=AD=E6=89=80=E6=9C=89=20std::cout=20=E8=BE=93?= =?UTF-8?q?=E5=87=BA=EF=BC=8C=E9=81=BF=E5=85=8D=E5=86=97=E4=BD=99=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=E6=89=93=E5=8D=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/source_base/module_device/device.cpp | 60 ++++++--------------- 1 file changed, 15 insertions(+), 45 deletions(-) diff --git a/source/source_base/module_device/device.cpp b/source/source_base/module_device/device.cpp index 1ae0eeb00e..bddfbaa62e 100644 --- a/source/source_base/module_device/device.cpp +++ b/source/source_base/module_device/device.cpp @@ -1,4 +1,3 @@ - #include "device.h" #include "source_base/tool_quit.h" @@ -176,69 +175,40 @@ std::string get_device_flag(const std::string &device, ModuleBase::WARNING_QUIT("device", "Parameter \"device\" can only be set to \"cpu\", \"gpu\", or \"auto\"!"); } - int decision = 0; // 0 for CPU, 1 for GPU - -#ifdef __MPI - int world_rank = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + // NOTE: This function is called only on rank 0 during input parsing. + // The result will be broadcast to other ranks via the standard bcast mechanism. + // DO NOT use MPI_Bcast here as other ranks are not in this code path. - if (world_rank == 0) { - // Rank 0 makes the decision - if (device == "gpu") { - if (probe_gpu_availability()) { - decision = 1; - std::cout << " INFO: 'device=gpu' specified. GPU will be used." << std::endl; - } else { - ModuleBase::WARNING_QUIT("device", "Device is set to 'gpu', but no available GPU was found. Please check your hardware/drivers or set 'device=cpu'."); - } - } else if (device == "auto") { - if (probe_gpu_availability()) { - decision = 1; - std::cout << " INFO: 'device=auto' specified. GPU detected and will be used." << std::endl; - } else { - decision = 0; - std::cout << " WARNING: 'device=auto' specified, but no GPU was found. Falling back to CPU." << std::endl; - std::cout << " To suppress this warning, please explicitly set 'device=cpu' in your input." << std::endl; - } - } else { // device == "cpu" - decision = 0; - std::cout << " INFO: 'device=cpu' specified. CPU will be used." << std::endl; - } - } + std::string result = "cpu"; - // Rank 0 broadcasts the final decision to all other ranks - MPI_Bcast(&decision, 1, MPI_INT, 0, MPI_COMM_WORLD); -#else - // Non-MPI case: single process makes the decision if (device == "gpu") { if (probe_gpu_availability()) { - decision = 1; - std::cout << " INFO: 'device=gpu' specified. GPU will be used." << std::endl; + result = "gpu"; + // std::cout << " INFO: 'device=gpu' specified. GPU will be used." << std::endl; } else { ModuleBase::WARNING_QUIT("device", "Device is set to 'gpu', but no available GPU was found. Please check your hardware/drivers or set 'device=cpu'."); } } else if (device == "auto") { if (probe_gpu_availability()) { - decision = 1; - std::cout << " INFO: 'device=auto' specified. GPU detected and will be used." << std::endl; + result = "gpu"; + // std::cout << " INFO: 'device=auto' specified. GPU detected and will be used." << std::endl; } else { - decision = 0; - std::cout << " WARNING: 'device=auto' specified, but no GPU was found. Falling back to CPU." << std::endl; - std::cout << " To suppress this warning, please explicitly set 'device=cpu' in your input." << std::endl; + result = "cpu"; + // std::cout << " WARNING: 'device=auto' specified, but no GPU was found. Falling back to CPU." << std::endl; + // std::cout << " To suppress this warning, please explicitly set 'device=cpu' in your input." << std::endl; } } else { // device == "cpu" - decision = 0; - std::cout << " INFO: 'device=cpu' specified. CPU will be used." << std::endl; + result = "cpu"; + // std::cout << " INFO: 'device=cpu' specified. CPU will be used." << std::endl; } -#endif // 2. Final check for incompatible basis type - if (decision == 1 && basis_type == "lcao_in_pw") { + if (result == "gpu" && basis_type == "lcao_in_pw") { ModuleBase::WARNING_QUIT("device", "The GPU currently does not support the basis type \"lcao_in_pw\"!"); } // 3. Return the final decision - return (decision == 1) ? "gpu" : "cpu"; + return result; } int get_device_kpar(const int& kpar, const int& bndpar)