diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
index 404ee9e1de..a8890498e5 100644
--- a/source/api_cc/src/DeepPot.cc
+++ b/source/api_cc/src/DeepPot.cc
@@ -1,38 +1,11 @@
 #include "DeepPot.h"
 #include "AtomMap.h"
 #include <stdexcept>	
+#include "device.h"
 
 using namespace tensorflow;
 using namespace deepmd;
 
-#if  GOOGLE_CUDA
-#include "cuda_runtime.h"
-
-#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
-inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
-    if (code != cudaSuccess)
-    {
-        fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
-        if (abort) exit(code);
-    }
-}
-#endif
-
-#if  TENSORFLOW_USE_ROCM
-#include<hip/hip_runtime.h>
-
-#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); }
-inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true)
-{
-    if (code != hipSuccess)
-    {
-        fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line);
-        if (abort) exit(code);
-    }
-}
-#endif //TENSORFLOW_USE_ROCM
-
 static 
 std::vector<int> cum_sum (const std::vector<int32> & n_sel) {
     std::vector<int> sec;
@@ -218,32 +191,18 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_
   else
     graph_def.ParseFromString(file_content);
   int gpu_num = -1;
-  #if GOOGLE_CUDA
-  cudaGetDeviceCount(&gpu_num); // check current device environment
+  #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  DPGetDeviceCount(gpu_num); // check current device environment
   if (gpu_num > 0) {
     options.config.set_allow_soft_placement(true);
     options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
     options.config.mutable_gpu_options()->set_allow_growth(true);
-    cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num));
+    DPErrcheck(DPSetDevice(gpu_rank % gpu_num));
     std::string str = "/gpu:";
     str += std::to_string(gpu_rank % gpu_num);
     graph::SetDefaultDevice(str, &graph_def);
   }
-  #endif // GOOGLE_CUDA
-
-  #if TENSORFLOW_USE_ROCM
-  hipGetDeviceCount(&gpu_num); // check current device environment
-  if (gpu_num > 0) {
-    options.config.set_allow_soft_placement(true);
-    options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
-    options.config.mutable_gpu_options()->set_allow_growth(true);
-    hipErrcheck(hipSetDevice(gpu_rank % gpu_num));
-    std::string str = "/gpu:";
-    str += std::to_string(gpu_rank % gpu_num);
-    graph::SetDefaultDevice(str, &graph_def);
-  }
-  #endif // TENSORFLOW_USE_ROCM
-
+  #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   check_status (NewSession(options, &session));
   check_status (session->Create(graph_def));
   rcut = get_scalar<VALUETYPE>("descrpt_attr/rcut");
@@ -552,13 +511,9 @@ init (const std::vector<std::string> & models, const int & gpu_rank, const std::
   graph_defs.resize(numb_models);
   
   int gpu_num = -1;
-  #if GOOGLE_CUDA 
-  cudaGetDeviceCount(&gpu_num);
-  #endif // GOOGLE_CUDA
-
-  #if TENSORFLOW_USE_ROCM
-  hipGetDeviceCount(&gpu_num);
-  #endif //TENSORFLOW_USE_ROCM
+  #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  DPGetDeviceCount(gpu_num);
+  #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
   SessionOptions options;
   options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
@@ -569,24 +524,14 @@ init (const std::vector<std::string> & models, const int & gpu_rank, const std::
     else
       graph_defs[ii].ParseFromString(file_contents[ii]);
   }
-  #if GOOGLE_CUDA 
-  if (gpu_num > 0) {
-      options.config.set_allow_soft_placement(true);
-      options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
-      options.config.mutable_gpu_options()->set_allow_growth(true);
-      cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num));
-  }
-  #endif // GOOGLE_CUDA
-
-
-  #if TENSORFLOW_USE_ROCM
+  #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   if (gpu_num > 0) {
       options.config.set_allow_soft_placement(true);
       options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
       options.config.mutable_gpu_options()->set_allow_growth(true);
-      hipErrcheck(hipSetDevice(gpu_rank % gpu_num));
+      DPErrcheck(DPSetDevice(gpu_rank % gpu_num));
   }
-  #endif // TENSORFLOW_USE_ROCM
+  #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
   
   for (unsigned ii = 0; ii < numb_models; ++ii) {
     if (gpu_num > 0) {
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 8464a8f46c..cd82ee4657 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -5,8 +5,9 @@
 #include <cuda_runtime.h>
 
 #define GPU_MAX_NBOR_SIZE 4096
-#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);}
-inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+#define DPErrcheck(res) {DPAssert((res), __FILE__, __LINE__);}
+inline void DPAssert(cudaError_t code, const char *file, int line, bool abort=true) 
+{
   if (code != cudaSuccess) {
     fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
     if (code == 2) {
@@ -27,7 +28,8 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=
 }
 
 #define nborErrcheck(res) {nborAssert((res), __FILE__, __LINE__);}
-inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) 
+{
     if (code != cudaSuccess) {
         fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line);
         if (code == 2) {
@@ -65,12 +67,17 @@ static __inline__ __device__ double atomicAdd(
 #endif
 
 namespace deepmd {
+  
+inline void DPGetDeviceCount(int &gpu_num) { cudaGetDeviceCount(&gpu_num) ;}
+
+inline cudaError_t DPSetDevice(int rank) { return  cudaSetDevice(rank); }
+
 template <typename FPTYPE>
 void memcpy_host_to_device(
     FPTYPE * device, 
     const std::vector<FPTYPE> &host) 
 {
-  cudaErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice));  
+  DPErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice));  
 }
 
 template <typename FPTYPE>
@@ -79,7 +86,7 @@ void memcpy_host_to_device(
     const FPTYPE * host,
     const int size) 
 {
-  cudaErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice));  
+  DPErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice));  
 }
 
 template <typename FPTYPE>
@@ -87,7 +94,7 @@ void memcpy_device_to_host(
     const FPTYPE * device, 
     std::vector<FPTYPE> &host) 
 {
-  cudaErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost));  
+  DPErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost));  
 }
 
 template <typename FPTYPE>
@@ -96,7 +103,7 @@ void memcpy_device_to_host(
     FPTYPE * host,
     const int size) 
 {
-  cudaErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost));  
+  DPErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost));  
 }
 
 template <typename FPTYPE>
@@ -104,7 +111,7 @@ void malloc_device_memory(
     FPTYPE * &device, 
     const std::vector<FPTYPE> &host) 
 {
-  cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
+  DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
 }
 
 template <typename FPTYPE>
@@ -112,7 +119,7 @@ void malloc_device_memory(
     FPTYPE * &device, 
     const int size) 
 {
-  cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size));
+  DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size));
 }
 
 template <typename FPTYPE>
@@ -120,7 +127,7 @@ void malloc_device_memory_sync(
     FPTYPE * &device,
     const std::vector<FPTYPE> &host) 
 {
-  cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
+  DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
   memcpy_host_to_device(device, host);
 }
 
@@ -130,7 +137,7 @@ void malloc_device_memory_sync(
     const FPTYPE * host,
     const int size)
 {
-  cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size));
+  DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size));
   memcpy_host_to_device(device, host, size);
 }
 
@@ -139,7 +146,7 @@ void delete_device_memory(
     FPTYPE * &device) 
 {
   if (device != NULL) {
-    cudaErrcheck(cudaFree(device));
+    DPErrcheck(cudaFree(device));
   }
 }
 
@@ -149,6 +156,6 @@ void memset_device_memory(
     const FPTYPE var,
     const int size) 
 {
-  cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));  
+  DPErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));  
 }
 } // end of namespace deepmd
\ No newline at end of file
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index ee3e88ee9e..955ffe5bf7 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -8,8 +8,8 @@
 
 #define GPU_MAX_NBOR_SIZE 4096
 
-#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); }
-inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) {
+#define DPErrcheck(res) { DPAssert((res), __FILE__, __LINE__); }
+inline void DPAssert(hipError_t code, const char *file, int line, bool abort=true) {
     if (code != hipSuccess) {
         fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line);
         if (abort) exit(code);
@@ -24,13 +24,18 @@ inline void nborAssert(hipError_t code, const char *file, int line, bool abort=t
     }
 }
 
+
 namespace deepmd {
+inline void DPGetDeviceCount(int &gpu_num) { hipGetDeviceCount(&gpu_num) ;}
+
+inline hipError_t DPSetDevice(int rank) { return  hipSetDevice(rank); }
+
 template <typename FPTYPE>
 void memcpy_host_to_device(
     FPTYPE * device, 
     std::vector<FPTYPE> &host) 
 {
-  hipErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice));  
+  DPErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice));  
 }
 
 template <typename FPTYPE>
@@ -39,7 +44,7 @@ void memcpy_host_to_device(
     const FPTYPE * host,
     const int size) 
 {
-  hipErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice));  
+  DPErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice));  
 }
 
 template <typename FPTYPE>
@@ -47,7 +52,7 @@ void memcpy_device_to_host(
     FPTYPE * device, 
     std::vector<FPTYPE> &host) 
 {
-  hipErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost));  
+  DPErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost));  
 }
 template <typename FPTYPE>
 void memcpy_device_to_host(
@@ -55,7 +60,7 @@ void memcpy_device_to_host(
     FPTYPE * host,
     const int size) 
 {
-  hipErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost));  
+  DPErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost));  
 }
 
 template <typename FPTYPE>
@@ -63,7 +68,7 @@ void malloc_device_memory(
     FPTYPE * &device, 
     std::vector<FPTYPE> &host) 
 {
-  hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
+  DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
 }
 
 template <typename FPTYPE>
@@ -71,7 +76,7 @@ void malloc_device_memory(
     FPTYPE * &device, 
     const int size) 
 {
-  hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size));
+  DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size));
 }
 
 template <typename FPTYPE>
@@ -79,7 +84,7 @@ void malloc_device_memory_sync(
     FPTYPE * &device,
     std::vector<FPTYPE> &host) 
 {
-  hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
+  DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
   memcpy_host_to_device(device, host);
 }
 template <typename FPTYPE>
@@ -88,7 +93,7 @@ void malloc_device_memory_sync(
     const FPTYPE * host,
     const int size)
 {
-  hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size));
+  DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size));
   memcpy_host_to_device(device, host, size);
 }
 
@@ -97,7 +102,7 @@ void delete_device_memory(
     FPTYPE * &device) 
 {
   if (device != NULL) {
-    hipErrcheck(hipFree(device));
+    DPErrcheck(hipFree(device));
   }
 }
 
@@ -107,7 +112,7 @@ void memset_device_memory(
   const FPTYPE var,
   const int size)
   {
-    hipErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size));
+    DPErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size));
   }
 }
 
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index bc717255b9..e0901c26d0 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -63,16 +63,25 @@ build_nlist_cpu(
     const int & mem_size,
     const float & rcut);
 
-#if GOOGLE_CUDA
-void convert_nlist_gpu_cuda(
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+void convert_nlist_gpu_device(
     InputNlist & gpu_nlist,
     InputNlist & cpu_nlist,
     int* & gpu_memory,
     const int & max_nbor_size);
 
-void free_nlist_gpu_cuda(
+void free_nlist_gpu_device(
     InputNlist & gpu_nlist);
 
+void use_nlist_map(
+    int * nlist, 
+    const int * nlist_map, 
+    const int nloc, 
+    const int nnei);
+
+#endif //GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+
+#if GOOGLE_CUDA
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
@@ -96,25 +105,10 @@ build_nlist_gpu(
     const int & mem_size,
     const float & rcut);
 
-void use_nlist_map(
-    int * nlist, 
-    const int * nlist_map, 
-    const int nloc, 
-    const int nnei);
-	
 #endif // GOOGLE_CUDA
 
 
 #if TENSORFLOW_USE_ROCM
-void convert_nlist_gpu_rocm(
-    InputNlist & gpu_nlist,
-    InputNlist & cpu_nlist,
-    int* & gpu_memory,
-    const int & max_nbor_size);
-
-void free_nlist_gpu_rocm(
-    InputNlist & gpu_nlist);
-
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
@@ -137,12 +131,6 @@ build_nlist_gpu_rocm(
     const int & nall, 
     const int & mem_size,
     const float & rcut);
-
-void use_nlist_map(
-    int * nlist, 
-    const int * nlist_map, 
-    const int nloc, 
-    const int nnei);
 	
 #endif // TENSORFLOW_USE_ROCM
 
diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/cuda/coord.cu
index c11609c57f..982e603f42 100644
--- a/source/lib/src/cuda/coord.cu
+++ b/source/lib/src/cuda/coord.cu
@@ -291,14 +291,20 @@ void compute_int_data(
     const int nblock_loc=(nloc+TPB-1)/TPB;
     _fill_idx_cellmap<<<nblock_loc, TPB>>>(idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, 
         nat_stt, nat_end, ext_stt, ext_end, nloc);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 
     const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB;
     _fill_loc_cellnum_map<<<nblock_loc_cellnum, TPB>>>(temp_idx_order, loc_cellnum_map, 
         idx_cellmap_noshift, nloc, loc_cellnum);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 
     const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB;
     _fill_total_cellnum_map<<<nblock_total_cellnum, TPB>>>(total_cellnum_map, mask_cellnum_map, cell_map, 
         cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 void build_loc_clist(
@@ -313,6 +319,8 @@ void build_loc_clist(
     const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum;
     int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1;
     _build_loc_clist<<<nblock, TPB>>>(loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -340,6 +348,8 @@ void copy_coord(
     const FPTYPE *rec_boxt = region.rec_boxt;
     _copy_coord<<<nblock, TPB>>>(out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, 
         sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 namespace deepmd {
@@ -354,6 +364,8 @@ normalize_coord_gpu(
     const FPTYPE * rec_boxt=region.rec_boxt;
     const int nblock=(natom+TPB-1)/TPB;
     normalize_one<<<nblock, TPB>>>(coord, boxt, rec_boxt, natom);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 //  int_data(temp cuda memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map,
@@ -377,7 +389,7 @@ copy_coord_gpu(
 {
     compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum);
     int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map
-    cudaErrcheck(cudaMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), cudaMemcpyDeviceToHost));
+    DPErrcheck(cudaMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), cudaMemcpyDeviceToHost));
     int * loc_cellnum_map=int_data_cpu;
     int * total_cellnum_map=loc_cellnum_map+loc_cellnum;
     int * mask_cellnum_map=total_cellnum_map+total_cellnum;
@@ -399,7 +411,7 @@ copy_coord_gpu(
         return 1;
     }
     else{
-        cudaErrcheck(cudaMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, 
+        DPErrcheck(cudaMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, 
             sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), cudaMemcpyHostToDevice));
         delete[] int_data_cpu;
         build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum);
diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu
index ba9cdad4b7..ca96751895 100644
--- a/source/lib/src/cuda/gelu.cu
+++ b/source/lib/src/cuda/gelu.cu
@@ -62,6 +62,8 @@ void gelu_gpu_cuda(
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -78,6 +80,8 @@ void gelu_grad_gpu_cuda(
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, dy, size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -95,6 +99,8 @@ void gelu_grad_grad_gpu_cuda(
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
   
   gelu_grad_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, dy, dy_2, size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void gelu_gpu_cuda<float>(float * out, const float * x, const int size);
diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu
index 33bf33aa3c..66bd122079 100644
--- a/source/lib/src/cuda/neighbor_list.cu
+++ b/source/lib/src/cuda/neighbor_list.cu
@@ -124,7 +124,7 @@ int build_nlist_gpu(
     int * ilist = nlist.ilist;
     int * numneigh = nlist.numneigh;
     int ** firstneigh = nlist.firstneigh;
-    cudaErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
+    DPErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
     int * temp_nlist = nlist_data; //nloc*mem_size
     int * nei_order = temp_nlist + nloc * mem_size;
     nlist.inum = nloc;
@@ -141,6 +141,8 @@ int build_nlist_gpu(
                 nloc,
                 nall,
                 mem_size);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
     const int nblock_ = (nloc+TPB-1)/TPB;
     scan_nlist<<<nblock_, TPB>>>(
                 numneigh, 
@@ -149,15 +151,18 @@ int build_nlist_gpu(
                 mem_size, 
                 nloc, 
                 nall);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
     fill_nlist<<<block_grid, thread_grid>>>(
                 firstneigh,
                 temp_nlist,
                 nei_order,
                 mem_size,
-                nall
-    );
+                nall);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
     int * numneigh_host = new int[nloc];
-    cudaErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, cudaMemcpyDeviceToHost));
+    DPErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, cudaMemcpyDeviceToHost));
     int max_nei = 0;
     for(int ii=0;ii<nloc;ii++){
         if(numneigh_host[ii]>max_nei)max_nei=numneigh_host[ii];
@@ -177,6 +182,8 @@ void use_nlist_map(
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, TPB);
     map_nlist<<<block_grid,thread_grid>>>(nlist, nlist_map, nloc, nnei);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template int build_nlist_gpu<float>(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut);
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index ccea65d448..7787fa0355 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -213,12 +213,16 @@ void format_nbor_list_1024 (
   format_nlist_fill_a<<<block_grid, thread_grid>>> (
       key,
       coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
       key, 
       key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -239,12 +243,16 @@ void format_nbor_list_2048 (
   format_nlist_fill_a<<<block_grid, thread_grid>>> (
       key,
       coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
       key, 
       key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -265,12 +273,16 @@ void format_nbor_list_4096 (
   format_nlist_fill_a<<<block_grid, thread_grid>>> (
       key,
       coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
   const int ITEMS_PER_THREAD = 16;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
       key, 
       key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<
@@ -449,13 +461,15 @@ void format_nbor_list_gpu_cuda(
   int * i_idx = array_int + sec.size() + nloc * sec.size();
   uint_64 * key = array_longlong;
   assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
-  cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei));
-  cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
-  cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice));   
+  DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei));
+  DPErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
+  DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice));   
 
   get_i_idx<<<nblock, LEN>>>(
       i_idx,
       nloc, gpu_inlist.ilist);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 
   if (max_nbor_size == 1024) {
     format_nbor_list_1024 (
@@ -476,6 +490,8 @@ void format_nbor_list_gpu_cuda(
   format_nlist_fill_b<<<nblock, LEN>>> (
       nlist,
       nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -500,9 +516,9 @@ void prod_env_mat_a_gpu_cuda(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  cudaErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_cuda(
       nlist, 
@@ -513,6 +529,8 @@ void prod_env_mat_a_gpu_cuda(
   compute_env_mat_a<FPTYPE, TPB> <<<nloc, TPB>>> (
       em, em_deriv, rij, 
       coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -537,9 +555,9 @@ void prod_env_mat_r_gpu_cuda(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  cudaErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_cuda(
       nlist, 
@@ -550,6 +568,8 @@ void prod_env_mat_r_gpu_cuda(
   compute_env_mat_r<FPTYPE, TPB> <<<nloc, TPB>>> (
       em, em_deriv, rij, 
       coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -566,6 +586,8 @@ void test_encoding_decoding_nbor_info_gpu_cuda(
   encoding_decoding_nbor_info<<<nblock, TPB>>> (
       key, out_type, out_index,
       in_type, in_dist, in_index, size_of_array);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void prod_env_mat_a_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
index 62c7ce8926..124f6c806d 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/cuda/prod_force.cu
@@ -108,13 +108,15 @@ void prod_force_a_gpu_cuda(
     const int nnei)
 {
   const int ndescrpt = nnei * 4;
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       force, 
       0.0, sizeof(FPTYPE) * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB> <<<nloc, TPB>>>(
       force, 
       net_deriv, in_deriv, ndescrpt);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
@@ -123,6 +125,8 @@ void prod_force_a_gpu_cuda(
   force_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       force, 
       net_deriv, in_deriv, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE> 
@@ -136,13 +140,15 @@ void prod_force_r_gpu_cuda(
     const int nnei)
 {
   const int ndescrpt = nnei * 1;
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       force, 
       0.0, sizeof(FPTYPE) * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB> <<<nloc, TPB>>>(
       force, 
       net_deriv, in_deriv, ndescrpt);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
@@ -151,6 +157,8 @@ void prod_force_r_gpu_cuda(
   force_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       force, 
       net_deriv, in_deriv, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void prod_force_a_gpu_cuda<float>(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu
index 7fd9359cfe..11af70c245 100644
--- a/source/lib/src/cuda/prod_force_grad.cu
+++ b/source/lib/src/cuda/prod_force_grad.cu
@@ -88,7 +88,7 @@ void prod_force_grad_a_gpu_cuda(
     const int nnei)
 {
     const int ndescrpt = nnei * 4;
-    cudaErrcheck(cudaMemset(
+    DPErrcheck(cudaMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
@@ -97,6 +97,8 @@ void prod_force_grad_a_gpu_cuda(
     force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(
         grad_net,
         grad, env_deriv, ndescrpt);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 
     const int LEN = 128;
     const int nblock_ = (nloc + LEN -1) / LEN;
@@ -105,6 +107,8 @@ void prod_force_grad_a_gpu_cuda(
     force_grad_wrt_neighbors_a<<<block_grid_, thread_grid_>>>(
         grad_net,
         grad, env_deriv, nlist, nloc, nnei);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -117,7 +121,7 @@ void prod_force_grad_r_gpu_cuda(
     const int nnei)
 {
     const int ndescrpt = nnei * 1;
-    cudaErrcheck(cudaMemset(
+    DPErrcheck(cudaMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
@@ -126,6 +130,8 @@ void prod_force_grad_r_gpu_cuda(
     force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(
         grad_net,
         grad, env_deriv, ndescrpt);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 
     const int LEN = 128;
     const int nblock_ = (nloc + LEN -1) / LEN;
@@ -134,6 +140,8 @@ void prod_force_grad_r_gpu_cuda(
     force_grad_wrt_neighbors_r<<<block_grid_, thread_grid_>>>(
         grad_net,
         grad, env_deriv, nlist, nloc, nnei);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void prod_force_grad_a_gpu_cuda<float>(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei);
diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu
index 08a64d71fe..06d1cefa42 100644
--- a/source/lib/src/cuda/prod_virial.cu
+++ b/source/lib/src/cuda/prod_virial.cu
@@ -115,10 +115,10 @@ void prod_virial_a_gpu_cuda(
     const int nall, 
     const int nnei)
 {
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       virial, 
       0.0, sizeof(FPTYPE) * 9));
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       atom_virial, 
       0.0, sizeof(FPTYPE) * 9 * nall));
     
@@ -130,10 +130,14 @@ void prod_virial_a_gpu_cuda(
   virial_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       virial, atom_virial, 
       net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
   // reduction atom_virial to virial
   atom_virial_reduction<FPTYPE, TPB> <<<9, TPB>>>(
       virial, 
       atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -148,10 +152,10 @@ void prod_virial_r_gpu_cuda(
     const int nall, 
     const int nnei)
 {
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       virial, 
       0.0, sizeof(FPTYPE) * 9));
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       atom_virial, 
       0.0, sizeof(FPTYPE) * 9 * nall));
     
@@ -163,10 +167,14 @@ void prod_virial_r_gpu_cuda(
   virial_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       virial, atom_virial, 
       net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
   // reduction atom_virial to virial
   atom_virial_reduction<FPTYPE, TPB> <<<9, TPB>>>(
     virial, 
     atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void prod_virial_a_gpu_cuda<float>(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei);
diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu
index 2cdd25ec38..0b9affa948 100644
--- a/source/lib/src/cuda/prod_virial_grad.cu
+++ b/source/lib/src/cuda/prod_virial_grad.cu
@@ -99,7 +99,7 @@ void prod_virial_grad_a_gpu_cuda(
     const int nnei)
 {
     const int ndescrpt = nnei * 4;
-    cudaErrcheck(cudaMemset(
+    DPErrcheck(cudaMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
@@ -109,6 +109,8 @@ void prod_virial_grad_a_gpu_cuda(
     virial_grad_wrt_neighbors_a<<<block_grid, thread_grid>>>(
         grad_net,
         grad, env_deriv, rij, nlist, nloc, nnei);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -122,7 +124,7 @@ void prod_virial_grad_r_gpu_cuda(
     const int nnei)
 {
     const int ndescrpt = nnei;
-    cudaErrcheck(cudaMemset(
+    DPErrcheck(cudaMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
@@ -132,6 +134,8 @@ void prod_virial_grad_r_gpu_cuda(
     virial_grad_wrt_neighbors_r<<<block_grid, thread_grid>>>(
         grad_net,
         grad, env_deriv, rij, nlist, nloc, nnei);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void prod_virial_grad_a_gpu_cuda<float>(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei);
diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/cuda/region.cu
index 4a95e5f9da..99e203cfcc 100644
--- a/source/lib/src/cuda/region.cu
+++ b/source/lib/src/cuda/region.cu
@@ -39,6 +39,8 @@ convert_to_inter_gpu(
     const FPTYPE * rp)
 {
     _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -49,6 +51,8 @@ convert_to_phys_gpu(
     const FPTYPE * ri)
 {
     _inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -58,6 +62,8 @@ volume_gpu(
     const Region<FPTYPE> & region)
 {
     _compute_volume<<<1, 1>>>(volume, region.boxt);
+    DPErrcheck(cudaGetLastError());
+    DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void convert_to_inter_gpu<float>(float * ri, const Region<float> & region, const float * rp);
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
index b71a989819..2cab6d75a7 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/cuda/tabulate.cu
@@ -203,6 +203,8 @@ void tabulate_fusion_gpu_cuda(
   tabulate_fusion_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, last_layer_size>>>(
       out, 
       table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -219,16 +221,18 @@ void tabulate_fusion_grad_gpu_cuda(
     const int last_layer_size)
 {
   if (nloc <= 0) {return;}
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       dy_dem_x,
       0.0, sizeof(FPTYPE) * nloc * nnei));
-  cudaErrcheck(cudaMemset(
+  DPErrcheck(cudaMemset(
       dy_dem,
       0.0, sizeof(FPTYPE) * nloc * nnei * 4));
 
   tabulate_fusion_grad_fifth_order_polynomial<FPTYPE, MM, KK> <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
       dy_dem_x, dy_dem,
       table, em_x, em, dy,  table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
 }
 
 template void tabulate_fusion_gpu_cuda<float>(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size);
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index a41784bc29..c3cd376fbe 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -843,8 +843,8 @@ build_nlist_cpu<float>(
     const int & mem_size,
     const float & rcut);
 
-#if GOOGLE_CUDA
-void deepmd::convert_nlist_gpu_cuda(
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+void deepmd::convert_nlist_gpu_device(
     InputNlist & gpu_nlist,
     InputNlist & cpu_nlist,
     int* & gpu_memory,
@@ -867,44 +867,11 @@ void deepmd::convert_nlist_gpu_cuda(
   free(_firstneigh);
 }
 
-void deepmd::free_nlist_gpu_cuda(
+void deepmd::free_nlist_gpu_device(
     InputNlist & gpu_nlist)
 {
   delete_device_memory(gpu_nlist.ilist);
   delete_device_memory(gpu_nlist.numneigh);
   delete_device_memory(gpu_nlist.firstneigh);
 }
-#endif // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-void deepmd::convert_nlist_gpu_rocm(
-    InputNlist & gpu_nlist,
-    InputNlist & cpu_nlist,
-    int* & gpu_memory,
-    const int & max_nbor_size)
-{
-  const int inum = cpu_nlist.inum;
-  gpu_nlist.inum = inum;
-  malloc_device_memory(gpu_nlist.ilist, inum);
-  malloc_device_memory(gpu_nlist.numneigh, inum);
-  malloc_device_memory(gpu_nlist.firstneigh, inum);
-  memcpy_host_to_device(gpu_nlist.ilist, cpu_nlist.ilist, inum);
-  memcpy_host_to_device(gpu_nlist.numneigh, cpu_nlist.numneigh, inum);
-  int ** _firstneigh = NULL;
-  _firstneigh = (int**)malloc(sizeof(int*) * inum);
-  for (int ii = 0; ii < inum; ii++) {
-    memcpy_host_to_device(gpu_memory + ii * max_nbor_size, cpu_nlist.firstneigh[ii], cpu_nlist.numneigh[ii]);
-    _firstneigh[ii] = gpu_memory + ii * max_nbor_size;
-  }
-  memcpy_host_to_device(gpu_nlist.firstneigh, _firstneigh, inum);
-  free(_firstneigh);
-}
-
-void deepmd::free_nlist_gpu_rocm(
-    InputNlist & gpu_nlist)
-{
-  delete_device_memory(gpu_nlist.ilist);
-  delete_device_memory(gpu_nlist.numneigh);
-  delete_device_memory(gpu_nlist.firstneigh);
-}
-#endif // TENSORFLOW_USE_ROCM
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index 4ef5af49e3..1ac944786c 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -256,7 +256,7 @@ prod_env_mat_r_cpu<float>(
     const float rcut_smth, 
     const std::vector<int> sec);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 void deepmd::env_mat_nbor_update(
     InputNlist &inlist,
     InputNlist &gpu_inlist,
@@ -266,7 +266,7 @@ void deepmd::env_mat_nbor_update(
     const int size)
 {
   int *mesh_host = new int[size];
-  cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost));
+  memcpy_device_to_host(mesh, mesh_host, size);
   memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *));
 	memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *));
 	memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **));
@@ -319,69 +319,4 @@ void deepmd::env_mat_nbor_update(
   }
   delete [] mesh_host;
 }
-#endif // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-void deepmd::env_mat_nbor_update(
-    InputNlist &inlist,
-    InputNlist &gpu_inlist,
-    int &max_nbor_size,
-    int* &nbor_list_dev,
-    const int * mesh, 
-    const int size)
-{
-  int *mesh_host = new int[size];
-  hipErrcheck(hipMemcpy(mesh_host, mesh, sizeof(int) * size, hipMemcpyDeviceToHost));
-  memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *));
-	memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *));
-	memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **));
-  const int ago = mesh_host[0];
-  if (ago == 0) {
-    const int inum = inlist.inum;
-    if (gpu_inlist.inum < inum) {
-      delete_device_memory(gpu_inlist.ilist);
-      delete_device_memory(gpu_inlist.numneigh);
-      delete_device_memory(gpu_inlist.firstneigh);
-      malloc_device_memory(gpu_inlist.ilist, inum);
-      malloc_device_memory(gpu_inlist.numneigh, inum);
-      malloc_device_memory(gpu_inlist.firstneigh, inum);
-    }
-    memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum);
-    memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum);
-    int _max_nbor_size = max_numneigh(inlist);
-    if (_max_nbor_size <= 1024) {
-      _max_nbor_size = 1024;
-    }
-    else if (_max_nbor_size <= 2048) {
-      _max_nbor_size = 2048;
-    }
-    else {
-      _max_nbor_size = 4096;
-    }
-    if ( nbor_list_dev == NULL 
-      || _max_nbor_size > max_nbor_size 
-      || inum > gpu_inlist.inum) 
-    {
-      delete_device_memory(nbor_list_dev);
-      malloc_device_memory(nbor_list_dev, inum * _max_nbor_size);
-    }
-    // update info
-    gpu_inlist.inum = inum;
-    max_nbor_size = _max_nbor_size;
-
-    // copy nbor list from host to the device
-    std::vector<int> nbor_list_host(inum * max_nbor_size, 0);
-    int ** _firstneigh = (int**)malloc(sizeof(int*) * inum);
-    for (int ii = 0; ii < inum; ii++) {
-      _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size;
-      for (int jj = 0; jj < inlist.numneigh[ii]; jj++) {
-        nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj];
-      }
-    }
-    memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], inum * max_nbor_size);
-    memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum);
-    free(_firstneigh);
-  }
-  delete [] mesh_host;
-}
-#endif // TENSORFLOW_USE_ROCM
\ No newline at end of file
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
\ No newline at end of file
diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu
index 12c797ab3e..1d01e1da30 100644
--- a/source/lib/src/rocm/coord.hip.cu
+++ b/source/lib/src/rocm/coord.hip.cu
@@ -292,14 +292,20 @@ void compute_int_data(
     const int nblock_loc=(nloc+TPB-1)/TPB;
     hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, 
         nat_stt, nat_end, ext_stt, ext_end, nloc);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 
     const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB;
     hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0, temp_idx_order, loc_cellnum_map, 
         idx_cellmap_noshift, nloc, loc_cellnum);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 
     const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB;
     hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0, total_cellnum_map, mask_cellnum_map, cell_map, 
         cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 void build_loc_clist(
@@ -314,6 +320,8 @@ void build_loc_clist(
     const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum;
     int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1;
     hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -341,6 +349,8 @@ void copy_coord(
     const FPTYPE *rec_boxt = region.rec_boxt;
     hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, 
         sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 namespace deepmd {
@@ -355,6 +365,8 @@ normalize_coord_gpu_rocm(
     const FPTYPE * rec_boxt=region.rec_boxt;
     const int nblock=(natom+TPB-1)/TPB;
     hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt, natom);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -376,7 +388,9 @@ copy_coord_gpu_rocm(
 {
     compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum);
     int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map
-    hipErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost));
+    DPErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost));
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
     int * loc_cellnum_map=int_data_cpu;
     int * total_cellnum_map=loc_cellnum_map+loc_cellnum;
     int * mask_cellnum_map=total_cellnum_map+total_cellnum;
@@ -398,7 +412,7 @@ copy_coord_gpu_rocm(
         return 1;
     }
     else{
-        hipErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, 
+        DPErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, 
             sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), hipMemcpyHostToDevice));
         delete[] int_data_cpu;
         build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum);
diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu
index 83e7a3be6d..9cabb53c08 100644
--- a/source/lib/src/rocm/gelu.hip.cu
+++ b/source/lib/src/rocm/gelu.hip.cu
@@ -64,6 +64,8 @@ namespace deepmd {
     const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
   
     hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   }
   
   template<typename FPTYPE>
@@ -81,6 +83,8 @@ namespace deepmd {
     const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
   
     hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, size);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   }
   
   template<typename FPTYPE>
@@ -99,6 +103,8 @@ namespace deepmd {
     const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
     
     hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, dy_2, size);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   }
   
   template void gelu_gpu_rocm<float>(float * out, const float * x, const int size);
diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu
index a0da866d12..243ea0507a 100644
--- a/source/lib/src/rocm/neighbor_list.hip.cu
+++ b/source/lib/src/rocm/neighbor_list.hip.cu
@@ -124,7 +124,7 @@ int build_nlist_gpu_rocm(
     int * ilist = nlist.ilist;
     int * numneigh = nlist.numneigh;
     int ** firstneigh = nlist.firstneigh;
-    hipErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
+    DPErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
     int * temp_nlist = nlist_data; //nloc*mem_size
     int * nei_order = temp_nlist + nloc * mem_size;
     nlist.inum = nloc;
@@ -141,6 +141,8 @@ int build_nlist_gpu_rocm(
                 nloc,
                 nall,
                 mem_size);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
     const int nblock_ = (nloc+TPB-1)/TPB;
     hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0, 
                 numneigh, 
@@ -149,15 +151,18 @@ int build_nlist_gpu_rocm(
                 mem_size, 
                 nloc, 
                 nall);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
     hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, 
                 firstneigh,
                 temp_nlist,
                 nei_order,
                 mem_size,
-                nall
-    );
+                nall);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
     int * numneigh_host = new int[nloc];
-    hipErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost));
+    DPErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost));
     int max_nei = 0;
     for(int ii=0;ii<nloc;ii++){
         if(numneigh_host[ii]>max_nei)max_nei=numneigh_host[ii];
@@ -177,6 +182,8 @@ void use_nlist_map(
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, TPB);
     hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template int build_nlist_gpu_rocm<float>(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut);
diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu
index b1251c6a42..f3cd3bf31a 100644
--- a/source/lib/src/rocm/prod_env_mat.hip.cu
+++ b/source/lib/src/rocm/prod_env_mat.hip.cu
@@ -234,12 +234,16 @@ void format_nbor_list_1024 (
   hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, 
       key,
       coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0, 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>), nloc, BLOCK_THREADS, 0, 0, 
       key, 
       key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -260,12 +264,16 @@ void format_nbor_list_2048 (
   hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, 
       key,
       coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0, 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>), nloc, BLOCK_THREADS, 0, 0, 
       key, 
       key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -286,12 +294,16 @@ void format_nbor_list_4096 (
   hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, 
       key,
       coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
   const int ITEMS_PER_THREAD = 16;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0, 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>), nloc, BLOCK_THREADS, 0, 0, 
       key, 
       key + nloc * MAX_NBOR_SIZE);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template<
@@ -470,13 +482,15 @@ void format_nbor_list_gpu_rocm(
   int * i_idx = array_int + sec.size() + nloc * sec.size();
   uint_64 * key = array_longlong;
   assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
-  hipErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei));
-  hipErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
-  hipErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice));   
+  DPErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei));
+  DPErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
+  DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice));   
 
   hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, 
       i_idx,
       nloc, gpu_inlist.ilist);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 
   if (max_nbor_size == 1024) {
     format_nbor_list_1024 (
@@ -501,6 +515,8 @@ void format_nbor_list_gpu_rocm(
   hipLaunchKernelGGL(format_nlist_fill_b, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN, 0, 0, 
       nlist,
       nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -525,9 +541,9 @@ void prod_env_mat_a_gpu_rocm(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  hipErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_rocm(
       nlist, 
@@ -538,6 +554,8 @@ void prod_env_mat_a_gpu_rocm(
   hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a<FPTYPE, TPB>), nloc, TPB, 0, 0, 
       em, em_deriv, rij, 
       coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -562,9 +580,9 @@ void prod_env_mat_r_gpu_rocm(
 {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
-  hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
-  hipErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3));
+  DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
+  DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3));
 
   format_nbor_list_gpu_rocm(
       nlist, 
@@ -575,6 +593,8 @@ void prod_env_mat_r_gpu_rocm(
   hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r<FPTYPE, TPB>), nloc, TPB, 0, 0, 
       em, em_deriv, rij, 
       coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -591,6 +611,8 @@ void test_encoding_decoding_nbor_info_gpu_rocm(
   hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, 
       key, out_type, out_index,
       in_type, in_dist, in_index, size_of_array);
+  DPErrcheck(hipGetLastError());
+  DPErrcheck(hipDeviceSynchronize());
 }
 
 template void prod_env_mat_a_gpu_rocm<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
index 48b12dfa50..815fed44c5 100644
--- a/source/lib/src/rocm/prod_force.hip.cu
+++ b/source/lib/src/rocm/prod_force.hip.cu
@@ -109,13 +109,15 @@ namespace deepmd {
       const int nnei)
   {
     const int ndescrpt = nnei * 4;
-    hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
         force, 
         0.0, sizeof(FPTYPE) * nall * 3));
   
     hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>), nloc, TPB, 0, 0, 
         force, 
         net_deriv, in_deriv, ndescrpt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   
     const int LEN = 64;
     const int nblock = (nnei + LEN - 1) / LEN;
@@ -124,6 +126,8 @@ namespace deepmd {
     hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, 
         force, 
         net_deriv, in_deriv, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   }
   
   template<typename FPTYPE> 
@@ -137,13 +141,15 @@ namespace deepmd {
       const int nnei)
   {
     const int ndescrpt = nnei * 1;
-    hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
         force, 
         0.0, sizeof(FPTYPE) * nall * 3));
   
     hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>), nloc, TPB, 0, 0, 
         force, 
         net_deriv, in_deriv, ndescrpt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   
     const int LEN = 64;
     const int nblock = (nnei + LEN -1) / LEN;
@@ -152,6 +158,8 @@ namespace deepmd {
     hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, 
         force, 
         net_deriv, in_deriv, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   }
   
   template void prod_force_a_gpu_rocm<float>(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu
index 3a3fd8ea58..f7af9a1746 100644
--- a/source/lib/src/rocm/prod_force_grad.hip.cu
+++ b/source/lib/src/rocm/prod_force_grad.hip.cu
@@ -88,7 +88,7 @@ void prod_force_grad_a_gpu_rocm(
     const int nnei)
 {
     const int ndescrpt = nnei * 4;
-    hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
@@ -97,7 +97,8 @@ void prod_force_grad_a_gpu_rocm(
     hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, 
         grad_net,
         grad, env_deriv, ndescrpt);
-
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
     const int LEN = 128;
     const int nblock_ = (nloc + LEN -1) / LEN;
     dim3 block_grid_(nblock_, nnei);
@@ -105,6 +106,8 @@ void prod_force_grad_a_gpu_rocm(
     hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, 0, 
         grad_net,
         grad, env_deriv, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -117,7 +120,7 @@ void prod_force_grad_r_gpu_rocm(
     const int nnei)
 {
     const int ndescrpt = nnei * 1;
-    hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int nblock = (ndescrpt + TPB - 1) / TPB;
@@ -126,6 +129,8 @@ void prod_force_grad_r_gpu_rocm(
     hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, 
         grad_net,
         grad, env_deriv, ndescrpt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 
     const int LEN = 128;
     const int nblock_ = (nloc + LEN -1) / LEN;
@@ -134,6 +139,8 @@ void prod_force_grad_r_gpu_rocm(
     hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, 0, 
         grad_net,
         grad, env_deriv, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template void prod_force_grad_a_gpu_rocm<float>(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei);
diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu
index ff8017a687..9c4ec5010b 100644
--- a/source/lib/src/rocm/prod_virial.hip.cu
+++ b/source/lib/src/rocm/prod_virial.hip.cu
@@ -113,10 +113,10 @@ void prod_virial_a_gpu_rocm(
     const int nall, 
     const int nnei)
 {
-  hipErrcheck(hipMemset(
-      virial, 
-      0.0, sizeof(FPTYPE) * 9));
-  hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
+        virial, 
+        0.0, sizeof(FPTYPE) * 9));
+    DPErrcheck(hipMemset(
       atom_virial, 
       0.0, sizeof(FPTYPE) * 9 * nall));
     
@@ -128,10 +128,14 @@ void prod_virial_a_gpu_rocm(
   hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, 
       virial, atom_virial, 
       net_deriv, in_deriv, rij, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   // reduction atom_virial to virial
   hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction<FPTYPE, TPB>), 9, TPB, 0, 0, 
       virial, 
       atom_virial, nall);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -146,10 +150,10 @@ void prod_virial_r_gpu_rocm(
     const int nall, 
     const int nnei)
 {
-  hipErrcheck(hipMemset(
-      virial, 
-      0.0, sizeof(FPTYPE) * 9));
-  hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
+        virial, 
+        0.0, sizeof(FPTYPE) * 9));
+    DPErrcheck(hipMemset(
       atom_virial, 
       0.0, sizeof(FPTYPE) * 9 * nall));
     
@@ -161,10 +165,14 @@ void prod_virial_r_gpu_rocm(
   hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, 
       virial, atom_virial, 
       net_deriv, in_deriv, rij, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
   // reduction atom_virial to virial
   hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction<FPTYPE, TPB>), 9, TPB, 0, 0, 
     virial, 
     atom_virial, nall);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template void prod_virial_a_gpu_rocm<float>(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei);
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu
index c4d8a5c19a..4c729453f7 100644
--- a/source/lib/src/rocm/prod_virial_grad.hip.cu
+++ b/source/lib/src/rocm/prod_virial_grad.hip.cu
@@ -99,7 +99,7 @@ void prod_virial_grad_a_gpu_rocm(
     const int nnei)
 {
     const int ndescrpt = nnei * 4;
-    hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
@@ -109,6 +109,8 @@ void prod_virial_grad_a_gpu_rocm(
     hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0, 
         grad_net,
         grad, env_deriv, rij, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -122,7 +124,7 @@ void prod_virial_grad_r_gpu_rocm(
     const int nnei)
 {
     const int ndescrpt = nnei;
-    hipErrcheck(hipMemset(
+    DPErrcheck(hipMemset(
         grad_net, 
         0.0, sizeof(FPTYPE) * nloc * ndescrpt));
     const int LEN = 128;
@@ -132,6 +134,8 @@ void prod_virial_grad_r_gpu_rocm(
     hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0, 
         grad_net,
         grad, env_deriv, rij, nlist, nloc, nnei);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template void prod_virial_grad_a_gpu_rocm<float>(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei);
diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu
index 7f883b14c3..ab40f6bf20 100644
--- a/source/lib/src/rocm/region.hip.cu
+++ b/source/lib/src/rocm/region.hip.cu
@@ -39,6 +39,8 @@ convert_to_inter_gpu_rocm(
     const FPTYPE * rp)
 {
     hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -49,6 +51,8 @@ convert_to_phys_gpu_rocm(
     const FPTYPE * ri)
 {
     hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template<typename FPTYPE>
@@ -58,6 +62,8 @@ volume_gpu_rocm(
     const Region<FPTYPE> & region)
 {
     hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt);
+    DPErrcheck(hipGetLastError());
+    DPErrcheck(hipDeviceSynchronize());
 }
 
 template void convert_to_inter_gpu_rocm<float>(float * ri, const Region<float> & region, const float * rp);
diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
index b4097083b0..497f4931cd 100644
--- a/source/lib/src/rocm/tabulate.hip.cu
+++ b/source/lib/src/rocm/tabulate.hip.cu
@@ -213,6 +213,8 @@ template<typename FPTYPE>
       hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, 
           out, 
           table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+      DPErrcheck(hipGetLastError());
+      DPErrcheck(hipDeviceSynchronize());
     }
     
     template<typename FPTYPE>
@@ -229,16 +231,18 @@ template<typename FPTYPE>
         const int last_layer_size)
     {
       if( nloc<=0 ) { return;}
-      hipErrcheck(hipMemset(
+      DPErrcheck(hipMemset(
           dy_dem_x,
           0.0, sizeof(FPTYPE) * nloc * nnei));
-      hipErrcheck(hipMemset(
+      DPErrcheck(hipMemset(
           dy_dem,
           0.0, sizeof(FPTYPE) * nloc * nnei * 4));
     
       hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial<FPTYPE, MM, KK>), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, 
           dy_dem_x, dy_dem,
           table, em_x, em, dy,  table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
+      DPErrcheck(hipGetLastError());
+      DPErrcheck(hipDeviceSynchronize());
     }
     
     template void tabulate_fusion_gpu_rocm<float>(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size);
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index cfb70acfe9..df7b5f38a0 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -557,7 +557,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu_cuda(    
       em_dev, 
@@ -588,7 +588,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   for(int ii = 0; ii < nloc; ++ii){
     for (int jj = 0; jj < nnei; ++jj){
@@ -648,7 +648,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu_cuda(    
       em_dev, 
@@ -682,7 +682,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
   std::vector<double> env_1, env_deriv_1, rij_a_1;
@@ -770,7 +770,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu_rocm(    
       em_dev, 
@@ -801,7 +801,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_rocm(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   for(int ii = 0; ii < nloc; ++ii){
     for (int jj = 0; jj < nnei; ++jj){
@@ -861,7 +861,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu_rocm(    
       em_dev, 
@@ -895,7 +895,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_rocm(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
   std::vector<double> env_1, env_deriv_1, rij_a_1;
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index 1b232e28b5..39d36be42d 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -400,7 +400,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_r_gpu_cuda(    
       em_dev, 
@@ -431,7 +431,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   for(int ii = 0; ii < nloc; ++ii){
     for (int jj = 0; jj < nnei; ++jj){
@@ -490,7 +490,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_r_gpu_cuda(    
       em_dev, 
@@ -524,7 +524,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
   std::vector<double> env_1, env_deriv_1, rij_a_1;
@@ -603,7 +603,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_r_gpu_rocm(    
       em_dev, 
@@ -634,7 +634,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_rocm(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   for(int ii = 0; ii < nloc; ++ii){
     for (int jj = 0; jj < nnei; ++jj){
@@ -693,7 +693,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_r_gpu_rocm(    
       em_dev, 
@@ -727,7 +727,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu)
   deepmd::delete_device_memory(avg_dev);
   deepmd::delete_device_memory(std_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_rocm(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
 
   std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
   std::vector<double> env_1, env_deriv_1, rij_a_1;
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index d2de6e8855..844d110de5 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -381,7 +381,7 @@ TEST_F(TestFormatNlist, gpu_cuda)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
   // format nlist
   format_nbor_list_gpu_cuda(
       nlist_dev, 
@@ -393,7 +393,7 @@ TEST_F(TestFormatNlist, gpu_cuda)
   deepmd::delete_device_memory(array_int_dev);
   deepmd::delete_device_memory(array_longlong_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
   // validate
   for(int ii = 0; ii < nlist.size(); ++ii){
     EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
@@ -437,7 +437,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
   // format nlist
   format_nbor_list_gpu_cuda(
       nlist_dev, 
@@ -449,7 +449,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda)
   deepmd::delete_device_memory(array_int_dev);
   deepmd::delete_device_memory(array_longlong_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
   // validate
   for(int ii = 0; ii < nlist.size(); ++ii){
     EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
@@ -565,7 +565,7 @@ TEST_F(TestFormatNlist, gpu_rocm)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
   // format nlist
   format_nbor_list_gpu_rocm(
       nlist_dev, 
@@ -577,7 +577,7 @@ TEST_F(TestFormatNlist, gpu_rocm)
   deepmd::delete_device_memory(array_int_dev);
   deepmd::delete_device_memory(array_longlong_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_rocm(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
   // validate
   for(int ii = 0; ii < nlist.size(); ++ii){
     EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
@@ -621,7 +621,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm)
   deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
   // format nlist
   format_nbor_list_gpu_rocm(
       nlist_dev, 
@@ -633,7 +633,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm)
   deepmd::delete_device_memory(array_int_dev);
   deepmd::delete_device_memory(array_longlong_dev);
   deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_rocm(gpu_inlist);
+  deepmd::free_nlist_gpu_device(gpu_inlist);
   // validate
   for(int ii = 0; ii < nlist.size(); ++ii){
     EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 7c7130cda0..22fb223289 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -995,7 +995,7 @@ _norm_copy_coord_gpu(
   FPTYPE_shape.AddDim(nall*3);
   context->allocate_temp(DataTypeToEnum<FPTYPE>::value, FPTYPE_shape, tensor_list);
   FPTYPE * tmp_coord = (*tensor_list).flat<FPTYPE>().data();
-  cudaErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice));
+  DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice));
   
   deepmd::Region<FPTYPE> region;
   init_region_cpu(region, box);
@@ -1210,7 +1210,7 @@ _norm_copy_coord_gpu_rocm(
   FPTYPE_shape.AddDim(nall*3);
   context->allocate_temp(DataTypeToEnum<FPTYPE>::value, FPTYPE_shape, tensor_list);
   FPTYPE * tmp_coord = (*tensor_list).flat<FPTYPE>().data();
-  hipErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice));
+  DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice));
   
   deepmd::Region<FPTYPE> region;
   init_region_cpu(region, box);