From 56e36b761c81d196eba7b9e5763b4aabb8f7bdf1 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 28 Sep 2023 17:12:10 -0400
Subject: [PATCH 1/4] forward error messages from GPU libraries to
 deepmd_exception

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h | 66 ++++++++++++++++++-----------------
 source/lib/include/gpu_rocm.h | 24 +++++++++----
 2 files changed, 51 insertions(+), 39 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 1e750e0ea0..8962255aff 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -24,27 +24,31 @@ inline void DPAssert(cudaError_t code,
                      int line,
                      bool abort = true) {
   if (code != cudaSuccess) {
-    fprintf(stderr, "cuda assert: %s %s %d\n", cudaGetErrorString(code), file,
-            line);
+    std::string error_msg = "CUDA Runtime library throws an error: " +
+                            std::string(cudaGetErrorString(code)) +
+                            ", in file " + std::string(file) + ": " +
+                            std::to_string(line);
     if (code == 2) {
       // out of memory
-      fprintf(stderr,
-              "Your memory is not enough, thus an error has been raised "
-              "above. You need to take the following actions:\n"
-              "1. Check if the network size of the model is too large.\n"
-              "2. Check if the batch size of training or testing is too large. "
-              "You can set the training batch size to `auto`.\n"
-              "3. Check if the number of atoms is too large.\n"
-              "4. Check if another program is using the same GPU by execuating "
-              "`nvidia-smi`. "
-              "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
-              "environment variable.\n");
+      error_msg +=
+          "\nYour memory is not enough, thus an error has been raised "
+          "above. You need to take the following actions:\n"
+          "1. Check if the network size of the model is too large.\n"
+          "2. Check if the batch size of training or testing is too large. "
+          "You can set the training batch size to `auto`.\n"
+          "3. Check if the number of atoms is too large.\n"
+          "4. Check if another program is using the same GPU by execuating "
+          "`nvidia-smi`. "
+          "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
+          "environment variable.";
       if (abort) {
-        throw deepmd::deepmd_exception_oom("CUDA Assert");
+        throw deepmd::deepmd_exception_oom(error_msg);
       }
     }
     if (abort) {
-      throw deepmd::deepmd_exception("CUDA Assert");
+      throw deepmd::deepmd_exception(error_msg);
+    } else {
+      printf(stderr, error_msg + "\n");
     }
   }
 }
@@ -56,30 +60,28 @@ inline void nborAssert(cudaError_t code,
                        int line,
                        bool abort = true) {
   if (code != cudaSuccess) {
-    fprintf(stderr, "cuda assert: %s %s %d\n",
-            "DeePMD-kit:\tillegal nbor list sorting", file, line);
-    if (code == 2) {
-      // out of memory
-      fprintf(stderr,
-              "Your memory is not enough, thus an error has been raised "
-              "above. You need to take the following actions:\n"
-              "1. Check if the network size of the model is too large.\n"
-              "2. Check if the batch size of training or testing is too large. "
-              "You can set the training batch size to `auto`.\n"
-              "3. Check if the number of atoms is too large.\n"
-              "4. Check if another program is using the same GPU by execuating "
-              "`nvidia-smi`. "
-              "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
-              "environment variable.\n");
+    std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: ";
+    try {
+      DPAssert(code, file, line, true);
+    } catch (deepmd::deepmd_exception_oom &e) {
+      error_msg += e.what();
       if (abort) {
-        throw deepmd::deepmd_exception_oom("CUDA Assert");
+        throw deepmd::deepmd_exception_oom(error_msg);
+      } else {
+        fprintf(stderr, err_msg + "\n");
       }
     }
+  }
+  catch (deepmd::deepmd_exception &e) {
+    error_msg += e.what();
     if (abort) {
-      throw deepmd::deepmd_exception("CUDA Assert");
+      throw deepmd::deepmd_exception(error_msg);
+    } else {
+      fprintf(stderr, err_msg + "\n");
     }
   }
 }
+}
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(double *address, double val) {
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index bb404720bc..ea00694f3c 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -26,10 +26,14 @@ inline void DPAssert(hipError_t code,
                      int line,
                      bool abort = true) {
   if (code != hipSuccess) {
-    fprintf(stderr, "hip assert: %s %s %d\n", hipGetErrorString(code), file,
-            line);
+    std::string error_msg = "HIP runtime library throws an error: " +
+                            std::string(hipGetErrorString(code)) +
+                            ", in file " + std::string(file) + ": " +
+                            std::to_string(line);
     if (abort) {
-      throw deepmd::deepmd_exception("HIP Assert");
+      throw deepmd::deepmd_exception(error_msg);
+    } else {
+      fprintf(stderr, err_msg + "\n");
     }
   }
 }
@@ -41,10 +45,16 @@ inline void nborAssert(hipError_t code,
                        int line,
                        bool abort = true) {
   if (code != hipSuccess) {
-    fprintf(stderr, "hip assert: %s %s %d\n",
-            "DeePMD-kit:\tillegal nbor list sorting", file, line);
-    if (abort) {
-      throw deepmd::deepmd_exception("HIP Assert: illegal nbor list sorting");
+    std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: ";
+    try {
+      DPAssert(code, file, line, true);
+    } catch (deepmd::deepmd_exception &e) {
+      error_msg += e.what();
+      if (abort) {
+        throw deepmd::deepmd_exception(error_msg);
+      } else {
+        fprintf(stderr, err_msg + "\n");
+      }
     }
   }
 }

From eb475208aed11f9d3f6dcc439dd251122d91e8c2 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 28 Sep 2023 17:22:45 -0400
Subject: [PATCH 2/4] fix typo; include <string>

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h | 3 ++-
 source/lib/include/gpu_rocm.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 8962255aff..5632845bce 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 #include <stdio.h>
 
+#include <string>
 #include <vector>
 
 #include "errors.h"
@@ -48,7 +49,7 @@ inline void DPAssert(cudaError_t code,
     if (abort) {
       throw deepmd::deepmd_exception(error_msg);
     } else {
-      printf(stderr, error_msg + "\n");
+      fprintf(stderr, error_msg + "\n");
     }
   }
 }
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index ea00694f3c..2833778557 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -4,6 +4,7 @@
 #include <hip/hip_runtime.h>
 #include <stdio.h>
 
+#include <string>
 #include <vector>
 // #include<rocprim/rocprim.hpp>
 // #include <hipcub/hipcub.hpp>

From 8a3478d6dfab9f017234e918fa1846f97902c060 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 28 Sep 2023 17:28:41 -0400
Subject: [PATCH 3/4] fix typo: rename err_msg to error_msg

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h | 4 ++--
 source/lib/include/gpu_rocm.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 5632845bce..0876515ccc 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -69,7 +69,7 @@ inline void nborAssert(cudaError_t code,
       if (abort) {
         throw deepmd::deepmd_exception_oom(error_msg);
       } else {
-        fprintf(stderr, err_msg + "\n");
+        fprintf(stderr, error_msg + "\n");
       }
     }
   }
@@ -78,7 +78,7 @@ inline void nborAssert(cudaError_t code,
     if (abort) {
       throw deepmd::deepmd_exception(error_msg);
     } else {
-      fprintf(stderr, err_msg + "\n");
+      fprintf(stderr, error_msg + "\n");
     }
   }
 }
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index 2833778557..500df4ecd1 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -34,7 +34,7 @@ inline void DPAssert(hipError_t code,
     if (abort) {
       throw deepmd::deepmd_exception(error_msg);
     } else {
-      fprintf(stderr, err_msg + "\n");
+      fprintf(stderr, error_msg + "\n");
     }
   }
 }
@@ -54,7 +54,7 @@ inline void nborAssert(hipError_t code,
       if (abort) {
         throw deepmd::deepmd_exception(error_msg);
       } else {
-        fprintf(stderr, err_msg + "\n");
+        fprintf(stderr, error_msg + "\n");
       }
     }
   }

From 89a423ce22a94d2104d4be92738228690a9693b4 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 28 Sep 2023 17:36:45 -0400
Subject: [PATCH 4/4] fix fprintf format arguments and misplaced catch block

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h | 20 +++++++++-----------
 source/lib/include/gpu_rocm.h |  4 ++--
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 0876515ccc..fb467674cb 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -49,7 +49,7 @@ inline void DPAssert(cudaError_t code,
     if (abort) {
       throw deepmd::deepmd_exception(error_msg);
     } else {
-      fprintf(stderr, error_msg + "\n");
+      fprintf(stderr, "%s\n", error_msg.c_str());
     }
   }
 }
@@ -69,19 +69,17 @@ inline void nborAssert(cudaError_t code,
       if (abort) {
         throw deepmd::deepmd_exception_oom(error_msg);
       } else {
-        fprintf(stderr, error_msg + "\n");
+        fprintf(stderr, "%s\n", error_msg.c_str());
+      }
+    } catch (deepmd::deepmd_exception &e) {
+      error_msg += e.what();
+      if (abort) {
+        throw deepmd::deepmd_exception(error_msg);
+      } else {
+        fprintf(stderr, "%s\n", error_msg.c_str());
       }
     }
   }
-  catch (deepmd::deepmd_exception &e) {
-    error_msg += e.what();
-    if (abort) {
-      throw deepmd::deepmd_exception(error_msg);
-    } else {
-      fprintf(stderr, error_msg + "\n");
-    }
-  }
-}
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index 500df4ecd1..fbd5e1ce3f 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -34,7 +34,7 @@ inline void DPAssert(hipError_t code,
     if (abort) {
       throw deepmd::deepmd_exception(error_msg);
     } else {
-      fprintf(stderr, error_msg + "\n");
+      fprintf(stderr, "%s\n", error_msg.c_str());
     }
   }
 }
@@ -54,7 +54,7 @@ inline void nborAssert(hipError_t code,
       if (abort) {
         throw deepmd::deepmd_exception(error_msg);
       } else {
-        fprintf(stderr, error_msg + "\n");
+        fprintf(stderr, "%s\n", error_msg.c_str());
       }
     }
   }