From 56e36b761c81d196eba7b9e5763b4aabb8f7bdf1 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 28 Sep 2023 17:12:10 -0400 Subject: [PATCH 1/4] forward error messages from GPU libraries to deepmd_exception Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 66 ++++++++++++++++++----------------- source/lib/include/gpu_rocm.h | 24 +++++++++---- 2 files changed, 51 insertions(+), 39 deletions(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 1e750e0ea0..8962255aff 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -24,27 +24,31 @@ inline void DPAssert(cudaError_t code, int line, bool abort = true) { if (code != cudaSuccess) { - fprintf(stderr, "cuda assert: %s %s %d\n", cudaGetErrorString(code), file, - line); + std::string error_msg = "CUDA Runtime library throws an error: " + + std::string(cudaGetErrorString(code)) + + ", in file " + std::string(file) + ": " + + std::to_string(line); if (code == 2) { // out of memory - fprintf(stderr, - "Your memory is not enough, thus an error has been raised " - "above. You need to take the following actions:\n" - "1. Check if the network size of the model is too large.\n" - "2. Check if the batch size of training or testing is too large. " - "You can set the training batch size to `auto`.\n" - "3. Check if the number of atoms is too large.\n" - "4. Check if another program is using the same GPU by execuating " - "`nvidia-smi`. " - "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " - "environment variable.\n"); + error_msg += + "\nYour memory is not enough, thus an error has been raised " + "above. You need to take the following actions:\n" + "1. Check if the network size of the model is too large.\n" + "2. Check if the batch size of training or testing is too large. " + "You can set the training batch size to `auto`.\n" + "3. Check if the number of atoms is too large.\n" + "4. Check if another program is using the same GPU by execuating " + "`nvidia-smi`. " + "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " + "environment variable."; if (abort) { - throw deepmd::deepmd_exception_oom("CUDA Assert"); + throw deepmd::deepmd_exception_oom(error_msg); } } if (abort) { - throw deepmd::deepmd_exception("CUDA Assert"); + throw deepmd::deepmd_exception(error_msg); + } else { + printf(stderr, error_msg + "\n"); } } } @@ -56,30 +60,28 @@ inline void nborAssert(cudaError_t code, int line, bool abort = true) { if (code != cudaSuccess) { - fprintf(stderr, "cuda assert: %s %s %d\n", - "DeePMD-kit:\tillegal nbor list sorting", file, line); - if (code == 2) { - // out of memory - fprintf(stderr, - "Your memory is not enough, thus an error has been raised " - "above. You need to take the following actions:\n" - "1. Check if the network size of the model is too large.\n" - "2. Check if the batch size of training or testing is too large. " - "You can set the training batch size to `auto`.\n" - "3. Check if the number of atoms is too large.\n" - "4. Check if another program is using the same GPU by execuating " - "`nvidia-smi`. " - "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " - "environment variable.\n"); + std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: "; + try { + DPAssert(code, file, line, true); + } catch (deepmd::deepmd_exception_oom &e) { + error_msg += e.what(); if (abort) { - throw deepmd::deepmd_exception_oom("CUDA Assert"); + throw deepmd::deepmd_exception_oom(error_msg); + } else { + fprintf(stderr, err_msg + "\n"); } } + } + catch (deepmd::deepmd_exception &e) { + error_msg += e.what(); if (abort) { - throw deepmd::deepmd_exception("CUDA Assert"); + throw deepmd::deepmd_exception(error_msg); + } else { + fprintf(stderr, err_msg + "\n"); } } } +} #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double *address, double val) { diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index bb404720bc..ea00694f3c 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -26,10 +26,14 @@ inline void DPAssert(hipError_t code, int line, bool abort = true) { if (code != hipSuccess) { - fprintf(stderr, "hip assert: %s %s %d\n", hipGetErrorString(code), file, - line); + std::string error_msg = "HIP runtime library throws an error: " + + std::string(hipGetErrorString(code)) + + ", in file " + std::string(file) + ": " + + std::to_string(line); if (abort) { - throw deepmd::deepmd_exception("HIP Assert"); + throw deepmd::deepmd_exception(error_msg); + } else { + fprintf(stderr, err_msg + "\n"); } } } @@ -41,10 +45,16 @@ inline void nborAssert(hipError_t code, int line, bool abort = true) { if (code != hipSuccess) { - fprintf(stderr, "hip assert: %s %s %d\n", - "DeePMD-kit:\tillegal nbor list sorting", file, line); - if (abort) { - throw deepmd::deepmd_exception("HIP Assert: illegal nbor list sorting"); + std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: "; + try { + DPAssert(code, file, line, true); + } catch (deepmd::deepmd_exception &e) { + error_msg += e.what(); + if (abort) { + throw deepmd::deepmd_exception(error_msg); + } else { + fprintf(stderr, err_msg + "\n"); + } } } } From eb475208aed11f9d3f6dcc439dd251122d91e8c2 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 28 Sep 2023 17:22:45 -0400 Subject: [PATCH 2/4] fix typo; include Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 3 ++- source/lib/include/gpu_rocm.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 8962255aff..5632845bce 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -4,6 +4,7 @@ #include #include +#include #include #include "errors.h" @@ -48,7 +49,7 @@ inline void DPAssert(cudaError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - printf(stderr, error_msg + "\n"); + fprintf(stderr, error_msg + "\n"); } } } diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index ea00694f3c..2833778557 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -4,6 +4,7 @@ #include #include +#include #include // #include // #include From 8a3478d6dfab9f017234e918fa1846f97902c060 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 28 Sep 2023 17:28:41 -0400 Subject: [PATCH 3/4] fix typo Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 4 ++-- source/lib/include/gpu_rocm.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 5632845bce..0876515ccc 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -69,7 +69,7 @@ inline void nborAssert(cudaError_t code, if (abort) { throw deepmd::deepmd_exception_oom(error_msg); } else { - fprintf(stderr, err_msg + "\n"); + fprintf(stderr, error_msg + "\n"); } } } @@ -78,7 +78,7 @@ inline void nborAssert(cudaError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - fprintf(stderr, err_msg + "\n"); + fprintf(stderr, error_msg + "\n"); } } } diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index 2833778557..500df4ecd1 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -34,7 +34,7 @@ inline void DPAssert(hipError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - fprintf(stderr, err_msg + "\n"); + fprintf(stderr, error_msg + "\n"); } } } @@ -54,7 +54,7 @@ inline void nborAssert(hipError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - fprintf(stderr, err_msg + "\n"); + fprintf(stderr, error_msg + "\n"); } } } From 89a423ce22a94d2104d4be92738228690a9693b4 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 28 Sep 2023 17:36:45 -0400 Subject: [PATCH 4/4] fix errors Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 20 +++++++++----------- source/lib/include/gpu_rocm.h | 4 ++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 0876515ccc..fb467674cb 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -49,7 +49,7 @@ inline void DPAssert(cudaError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - fprintf(stderr, error_msg + "\n"); + fprintf(stderr, "%s\n", error_msg.c_str()); } } } @@ -69,19 +69,17 @@ inline void nborAssert(cudaError_t code, if (abort) { throw deepmd::deepmd_exception_oom(error_msg); } else { - fprintf(stderr, error_msg + "\n"); + fprintf(stderr, "%s\n", error_msg.c_str()); + } + } catch (deepmd::deepmd_exception &e) { + error_msg += e.what(); + if (abort) { + throw deepmd::deepmd_exception(error_msg); + } else { + fprintf(stderr, "%s\n", error_msg.c_str()); } } } - catch (deepmd::deepmd_exception &e) { - error_msg += e.what(); - if (abort) { - throw deepmd::deepmd_exception(error_msg); - } else { - fprintf(stderr, error_msg + "\n"); - } - } -} } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index 500df4ecd1..fbd5e1ce3f 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -34,7 +34,7 @@ inline void DPAssert(hipError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - fprintf(stderr, error_msg + "\n"); + fprintf(stderr, "%s\n", error_msg.c_str()); } } } @@ -54,7 +54,7 @@ inline void nborAssert(hipError_t code, if (abort) { throw deepmd::deepmd_exception(error_msg); } else { - fprintf(stderr, error_msg + "\n"); + fprintf(stderr, "%s\n", error_msg.c_str()); } } }