diff --git a/.github/labeler.yml b/.github/labeler.yml
index ad10d7cb7d..195d2cd217 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -5,8 +5,8 @@ Python:
 Docs: doc/**/*
 Examples: examples/**/*
 Core: source/lib/**/*
-CUDA: source/lib/src/cuda/**/*
-ROCM: source/lib/src/rocm/**/*
+CUDA: source/lib/src/gpu/**/*
+ROCM: source/lib/src/gpu/**/*
 OP: source/op/**/*
 C++: source/api_cc/**/*
 C: source/api_c/**/*
diff --git a/.gitmodules b/.gitmodules
index 7f3510b9d6..849b21ced5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "source/lib/src/cuda/cub"]
-	path = source/lib/src/cuda/cub
+[submodule "source/lib/src/gpu/cub"]
+	path = source/lib/src/gpu/cub
 	url = https://github.com/NVIDIA/cub.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8125324ea1..7ea4915f6e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -53,7 +53,7 @@ repos:
     rev: v16.0.6
     hooks:
     -   id: clang-format
-        exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc
+        exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc
 # CSS
 -   repo: https://github.com/pre-commit/mirrors-csslint
     rev: v1.0.5
@@ -83,7 +83,7 @@ repos:
         - --comment-style
         - //
         - --no-extra-eol
-        exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc
+        exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc
     # CSS
     -   id: insert-license
         files: \.(css|scss)$
diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index e6a4b1a7cb..1447823c08 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -74,7 +74,7 @@ One may set the following environment variables before executing `pip`:
 | Environment variables | Allowed value          | Default value | Usage                      |
 | --------------------- | ---------------------- | ------------- | -------------------------- |
 | DP_VARIANT            | `cpu`, `cuda`, `rocm`  | `cpu`         | Build CPU variant or GPU variant with CUDA or ROCM support. |
-| CUDAToolkit_ROOT | Path                   | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. |
+| CUDAToolkit_ROOT | Path                   | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. |
 | ROCM_ROOT             | Path                   | Detected automatically | The path to the ROCM toolkit directory. |
 | TENSORFLOW_ROOT       | Path                   | Detected automatically | The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against.|
 | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1           | 0             | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. |
@@ -188,7 +188,7 @@ One may add the following arguments to `cmake`:
 | -DTENSORFLOW_ROOT=&lt;value&gt;  | Path              | -             | The Path to TensorFlow's C++ interface. |
 | -DCMAKE_INSTALL_PREFIX=&lt;value&gt; | Path          | -             | The Path where DeePMD-kit will be installed. |
 | -DUSE_CUDA_TOOLKIT=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build GPU support with CUDA toolkit. |
-| -DCUDAToolkit_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. |
+| -DCUDAToolkit_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. |
 | -DUSE_ROCM_TOOLKIT=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build GPU support with ROCM toolkit. |
 | -DCMAKE_HIP_COMPILER_ROCM_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the ROCM toolkit directory. |
 | -DLAMMPS_SOURCE_ROOT=&lt;value&gt; | Path         | - | Only neccessary for LAMMPS plugin mode. The path to the [LAMMPS source code](install-lammps.md). LAMMPS 8Apr2021 or later is supported. If not assigned, the plugin mode will not be enabled. |
diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt
index 5f5528de3e..323bf2d7c0 100644
--- a/source/lib/CMakeLists.txt
+++ b/source/lib/CMakeLists.txt
@@ -11,7 +11,7 @@ target_include_directories(
 
 if(USE_CUDA_TOOLKIT)
   add_definitions("-DGOOGLE_CUDA")
-  add_subdirectory(src/cuda)
+  add_subdirectory(src/gpu)
   set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda)
   target_link_libraries(${libname} INTERFACE deepmd_dyn_cudart ${EXTRA_LIBS})
   # gpu_cuda.h
@@ -22,7 +22,7 @@ endif()
 
 if(USE_ROCM_TOOLKIT)
   add_definitions("-DTENSORFLOW_USE_ROCM")
-  add_subdirectory(src/rocm)
+  add_subdirectory(src/gpu)
   set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm)
   target_link_libraries(${libname} INTERFACE ${ROCM_LIBRARIES} ${EXTRA_LIBS})
   # gpu_rocm.h
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index bf8c325b14..73dfed1404 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -8,6 +8,13 @@
 
 #include "errors.h"
 
+#define gpuGetLastError cudaGetLastError
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemset cudaMemset
+
 #define GPU_MAX_NBOR_SIZE 4096
 #define DPErrcheck(res) \
   { DPAssert((res), __FILE__, __LINE__); }
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index 4c3c1b41a9..3a65a57b01 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -11,6 +11,13 @@
 
 #define GPU_MAX_NBOR_SIZE 4096
 
+#define gpuGetLastError hipGetLastError
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemset hipMemset
+
 #define DPErrcheck(res) \
   { DPAssert((res), __FILE__, __LINE__); }
 inline void DPAssert(hipError_t code,
diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt
deleted file mode 100644
index 1d5ae690e1..0000000000
--- a/source/lib/src/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-# required cmake version 3.23: CMAKE_CUDA_ARCHITECTURES all
-cmake_minimum_required(VERSION 3.23)
-# project name
-project(deepmd_op_cuda)
-
-set(CMAKE_CUDA_ARCHITECTURES all)
-enable_language(CUDA)
-set(CMAKE_CUDA_STANDARD 11)
-add_compile_definitions(
-  "$<$<COMPILE_LANGUAGE:CUDA>:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>")
-
-find_package(CUDAToolkit REQUIRED)
-
-# take dynamic open cudart library replace of static one so it's not required
-# when using CPUs
-add_subdirectory(cudart)
-
-# nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true
-# -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu
-# -L/usr/local/cuda/lib64 -lcudadevrt very important here! Include path to cub.
-# for searching device compute capability,
-# https://developer.nvidia.com/cuda-gpus
-
-# cub has been included in CUDA Toolkit 11, we do not need to include it any
-# more see https://github.com/NVIDIA/cub
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11")
-  include_directories(cub)
-endif()
-
-message(STATUS "NVCC version is " ${CMAKE_CUDA_COMPILER_VERSION})
-
-# arch will be configured by CMAKE_CUDA_ARCHITECTURES
-set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT"
-)
-
-file(GLOB SOURCE_FILES "*.cu")
-
-add_library(deepmd_op_cuda SHARED ${SOURCE_FILES})
-target_link_libraries(deepmd_op_cuda PRIVATE deepmd_dyn_cudart)
-target_include_directories(
-  deepmd_op_cuda
-  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/>
-         $<INSTALL_INTERFACE:include>)
-target_precompile_headers(deepmd_op_cuda PUBLIC [["device.h"]])
-if(APPLE)
-  set_target_properties(deepmd_op_cuda PROPERTIES INSTALL_RPATH @loader_path)
-else()
-  set_target_properties(deepmd_op_cuda PROPERTIES INSTALL_RPATH "$ORIGIN")
-endif()
-
-if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
-  install(
-    TARGETS deepmd_op_cuda
-    EXPORT ${CMAKE_PROJECT_NAME}Targets
-    DESTINATION lib/)
-endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
-if(BUILD_PY_IF)
-  install(TARGETS deepmd_op_cuda DESTINATION deepmd/lib/)
-endif(BUILD_PY_IF)
diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt
new file mode 100644
index 0000000000..25223c82bf
--- /dev/null
+++ b/source/lib/src/gpu/CMakeLists.txt
@@ -0,0 +1,95 @@
+if(USE_CUDA_TOOLKIT)
+  # required cmake version 3.23: CMAKE_CUDA_ARCHITECTURES all
+  cmake_minimum_required(VERSION 3.23)
+  # project name
+  project(deepmd_op_cuda)
+  set(GPU_LIB_NAME deepmd_op_cuda)
+
+  set(CMAKE_CUDA_ARCHITECTURES all)
+  enable_language(CUDA)
+  set(CMAKE_CUDA_STANDARD 11)
+  add_compile_definitions(
+    "$<$<COMPILE_LANGUAGE:CUDA>:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>")
+
+  find_package(CUDAToolkit REQUIRED)
+
+  # take dynamic open cudart library replace of static one so it's not required
+  # when using CPUs
+  add_subdirectory(cudart)
+
+  # nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true
+  # -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler
+  # -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt very important here!
+  # Include path to cub. for searching device compute capability,
+  # https://developer.nvidia.com/cuda-gpus
+
+  # cub has been included in CUDA Toolkit 11, we do not need to include it any
+  # more see https://github.com/NVIDIA/cub
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11")
+    include_directories(cub)
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "9")
+    message(FATAL_ERROR "CUDA version must be >= 9.0")
+  endif()
+
+  message(STATUS "NVCC version is " ${CMAKE_CUDA_COMPILER_VERSION})
+
+  # arch will be configured by CMAKE_CUDA_ARCHITECTURES
+  set(CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS} -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT"
+  )
+
+  file(GLOB SOURCE_FILES "*.cu")
+
+  add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES})
+  target_link_libraries(${GPU_LIB_NAME} PRIVATE deepmd_dyn_cudart)
+
+elseif(USE_ROCM_TOOLKIT)
+
+  # required cmake version
+  cmake_minimum_required(VERSION 3.21)
+  # project name
+  project(deepmd_op_rocm)
+  set(GPU_LIB_NAME deepmd_op_rocm)
+  set(CMAKE_LINK_WHAT_YOU_USE TRUE)
+
+  # set c++ version c++11
+  set(CMAKE_CXX_STANDARD 14)
+  set(CMAKE_HIP_STANDARD 14)
+  add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
+  add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
+
+  message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR})
+
+  set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS}
+  )# --amdgpu-target=gfx906
+  if(HIP_VERSION VERSION_LESS 3.5.1)
+    set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS})
+  endif()
+
+  file(GLOB SOURCE_FILES "*.cu")
+
+  hip_add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES})
+
+endif()
+
+target_include_directories(
+  ${GPU_LIB_NAME}
+  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/>
+         $<INSTALL_INTERFACE:include>)
+target_precompile_headers(${GPU_LIB_NAME} PUBLIC [["device.h"]])
+if(APPLE)
+  set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH @loader_path)
+else()
+  set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN")
+endif()
+
+if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
+  install(
+    TARGETS ${GPU_LIB_NAME}
+    EXPORT ${CMAKE_PROJECT_NAME}Targets
+    DESTINATION lib/)
+endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
+if(BUILD_PY_IF)
+  install(TARGETS ${GPU_LIB_NAME} DESTINATION deepmd/lib/)
+endif(BUILD_PY_IF)
diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/gpu/coord.cu
similarity index 93%
rename from source/lib/src/cuda/coord.cu
rename to source/lib/src/gpu/coord.cu
index d37e5de9cf..52ec9ff09d 100644
--- a/source/lib/src/cuda/coord.cu
+++ b/source/lib/src/gpu/coord.cu
@@ -266,21 +266,21 @@ void compute_int_data(int *int_data,
   _fill_idx_cellmap<<<nblock_loc, TPB>>>(idx_cellmap, idx_cellmap_noshift, in_c,
                                          rec_boxt, nat_stt, nat_end, ext_stt,
                                          ext_end, nloc);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int nblock_loc_cellnum = (loc_cellnum + TPB - 1) / TPB;
   _fill_loc_cellnum_map<<<nblock_loc_cellnum, TPB>>>(
       temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int nblock_total_cellnum = (total_cellnum + TPB - 1) / TPB;
   _fill_total_cellnum_map<<<nblock_total_cellnum, TPB>>>(
       total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt,
       nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 void build_loc_clist(int *int_data,
@@ -297,8 +297,8 @@ void build_loc_clist(int *int_data,
                    total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1;
   _build_loc_clist<<<nblock, TPB>>>(loc_clist, idx_cellmap_noshift,
                                     temp_idx_order, sec_loc_cellnum_map, nloc);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -326,8 +326,8 @@ void copy_coord(FPTYPE *out_c,
                                cell_shift_map, sec_loc_cellnum_map,
                                sec_total_cellnum_map, loc_clist, nloc, nall,
                                total_cellnum, boxt, rec_boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 namespace deepmd {
@@ -335,14 +335,14 @@ template <typename FPTYPE>
 void normalize_coord_gpu(FPTYPE *coord,
                          const int natom,
                          const Region<FPTYPE> &region) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const FPTYPE *boxt = region.boxt;
   const FPTYPE *rec_boxt = region.rec_boxt;
   const int nblock = (natom + TPB - 1) / TPB;
   normalize_one<<<nblock, TPB>>>(coord, boxt, rec_boxt, natom);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 //  int_data(temp cuda
@@ -362,16 +362,17 @@ int copy_coord_gpu(FPTYPE *out_c,
                    const int &total_cellnum,
                    const int *cell_info,
                    const Region<FPTYPE> &region) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
                    total_cellnum);
   int *int_data_cpu = new int
       [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum +
        1];  // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map
-  DPErrcheck(cudaMemcpy(int_data_cpu, int_data + 3 * nloc,
-                        sizeof(int) * (loc_cellnum + 2 * total_cellnum),
-                        cudaMemcpyDeviceToHost));
+  DPErrcheck(gpuMemcpy(int_data_cpu, int_data + 3 * nloc,
+                       sizeof(int) * (loc_cellnum + 2 * total_cellnum),
+                       gpuMemcpyDeviceToHost));
+  DPErrcheck(gpuGetLastError());
   int *loc_cellnum_map = int_data_cpu;
   int *total_cellnum_map = loc_cellnum_map + loc_cellnum;
   int *mask_cellnum_map = total_cellnum_map + total_cellnum;
@@ -397,11 +398,12 @@ int copy_coord_gpu(FPTYPE *out_c,
     // size of the output arrays is not large enough
     return 1;
   } else {
-    DPErrcheck(cudaMemcpy(int_data + nloc * 3 + loc_cellnum +
-                              total_cellnum * 3 + total_cellnum * 3,
-                          sec_loc_cellnum_map,
-                          sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1),
-                          cudaMemcpyHostToDevice));
+    DPErrcheck(gpuMemcpy(int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 +
+                             total_cellnum * 3,
+                         sec_loc_cellnum_map,
+                         sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1),
+                         gpuMemcpyHostToDevice));
+    DPErrcheck(gpuGetLastError());
     delete[] int_data_cpu;
     build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum);
     copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall,
diff --git a/source/lib/src/cuda/cub b/source/lib/src/gpu/cub
similarity index 100%
rename from source/lib/src/cuda/cub
rename to source/lib/src/gpu/cub
diff --git a/source/lib/src/cuda/cudart/CMakeLists.txt b/source/lib/src/gpu/cudart/CMakeLists.txt
similarity index 100%
rename from source/lib/src/cuda/cudart/CMakeLists.txt
rename to source/lib/src/gpu/cudart/CMakeLists.txt
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_10_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_10_0.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_1.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_1.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_10_1.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_10_1.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_2.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_2.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_10_2.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_10_2.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_11_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_11_0.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_2.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_2.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_11_2.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_11_2.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_8.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_8.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_11_8.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_11_8.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_12_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_12_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_12_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_12_0.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_9_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_9_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_9_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_9_0.inc
diff --git a/source/lib/src/cuda/cudart/cudart_stub.cc b/source/lib/src/gpu/cudart/cudart_stub.cc
similarity index 100%
rename from source/lib/src/cuda/cudart/cudart_stub.cc
rename to source/lib/src/gpu/cudart/cudart_stub.cc
diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/gpu/gelu.cu
similarity index 90%
rename from source/lib/src/cuda/gelu.cu
rename to source/lib/src/gpu/gelu.cu
index 823a843b2a..ac6020ea7a 100644
--- a/source/lib/src/cuda/gelu.cu
+++ b/source/lib/src/gpu/gelu.cu
@@ -32,7 +32,7 @@ __global__ void gelu_grad(FPTYPE* out,
             (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx]));
   out[idx] =
       dy[idx] * ((FPTYPE)0.5 * SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) *
-                     ((FPTYPE)0.134145 * xx[idx] * xx[idx] + 1) +
+                     ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) +
                  (FPTYPE)0.5 * var + (FPTYPE)0.5);
 }
 
@@ -67,14 +67,14 @@ void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
   if (size <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int THREAD_ITEMS = 1024;
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -85,14 +85,14 @@ void gelu_grad_gpu(FPTYPE* out,
   if (size <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int THREAD_ITEMS = 1024;
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, dy, size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -104,14 +104,14 @@ void gelu_grad_grad_gpu(FPTYPE* out,
   if (size <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int THREAD_ITEMS = 1024;
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu_grad_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, dy, dy_2, size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void gelu_gpu<float>(float* out, const float* x, const int_64 size);
diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu
similarity index 89%
rename from source/lib/src/cuda/neighbor_list.cu
rename to source/lib/src/gpu/neighbor_list.cu
index 7cac07690b..fc4e784915 100644
--- a/source/lib/src/cuda/neighbor_list.cu
+++ b/source/lib/src/gpu/neighbor_list.cu
@@ -1,4 +1,11 @@
+#if GOOGLE_CUDA
 #include <cub/block/block_scan.cuh>
+#elif TENSORFLOW_USE_ROCM
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
+#error "should not touch here"
+#endif
 
 #include "device.h"
 #include "neighbor_list.h"
@@ -187,13 +194,13 @@ int build_nlist_gpu(InputNlist &nlist,
   if (mem_size < nall) {
     return 1;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int nblock = (nall + TPB - 1) / TPB;
   int *ilist = nlist.ilist;
   int *numneigh = nlist.numneigh;
   int **firstneigh = nlist.firstneigh;
-  DPErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
+  DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
   int *temp_nlist = nlist_data;  // nloc*mem_size
   int *nei_order = temp_nlist + nloc * mem_size;
   nlist.inum = nloc;
@@ -203,19 +210,19 @@ int build_nlist_gpu(InputNlist &nlist,
   dim3 thread_grid(1, TPB);
   build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, rcut2,
                                            nloc, nall, mem_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   parallel_prefix_scan<TPB>
       <<<nloc, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   fill_nlist<<<block_grid, thread_grid>>>(firstneigh, temp_nlist, nei_order,
                                           mem_size, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int *numneigh_host = new int[nloc];
-  DPErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
-                        cudaMemcpyDeviceToHost));
+  DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
+                       gpuMemcpyDeviceToHost));
   int max_nei = 0;
   for (int ii = 0; ii < nloc; ii++) {
     if (numneigh_host[ii] > max_nei) {
@@ -231,14 +238,14 @@ void use_nlist_map(int *nlist,
                    const int *nlist_map,
                    const int nloc,
                    const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int nblock = (nnei + TPB - 1) / TPB;
   dim3 block_grid(nloc, nblock);
   dim3 thread_grid(1, TPB);
   map_nlist<<<block_grid, thread_grid>>>(nlist, nlist_map, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 void use_nei_info_gpu(int *nlist,
@@ -250,13 +257,13 @@ void use_nei_info_gpu(int *nlist,
                       const int nnei,
                       const int ntypes,
                       const bool b_nlist_map) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int nblock = (nnei + TPB - 1) / TPB;
   dim3 block_grid(nloc, nblock);
   dim3 thread_grid(1, TPB);
-  DPErrcheck(cudaMemset(ntype, 0, sizeof(int) * nloc * nnei));
-  DPErrcheck(cudaMemset(nmask, 0, sizeof(bool) * nloc * nnei));
+  DPErrcheck(gpuMemset(ntype, 0, sizeof(int) * nloc * nnei));
+  DPErrcheck(gpuMemset(nmask, 0, sizeof(bool) * nloc * nnei));
   if (b_nlist_map) {
     map_nei_info<<<block_grid, thread_grid>>>(nlist, ntype, nmask, type,
                                               nlist_map, nloc, nnei, ntypes);
@@ -264,8 +271,8 @@ void use_nei_info_gpu(int *nlist,
     map_nei_info_noconvert<<<block_grid, thread_grid>>>(
         nlist, ntype, nmask, type, nloc, nnei, ntypes);
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template int build_nlist_gpu<float>(InputNlist &nlist,
@@ -295,12 +302,12 @@ __global__ void map_filter_ftype(int *ftype_out,
 }
 
 void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int nblock = (nloc + TPB - 1) / TPB;
   map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/gpu/prod_env_mat.cu
similarity index 93%
rename from source/lib/src/cuda/prod_env_mat.cu
rename to source/lib/src/gpu/prod_env_mat.cu
index e603b25db7..a69e014272 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/gpu/prod_env_mat.cu
@@ -1,6 +1,13 @@
+#if GOOGLE_CUDA
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_radix_sort.cuh>
 #include <cub/block/block_store.cuh>
+#elif TENSORFLOW_USE_ROCM
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
+#error "should not touch here"
+#endif
 
 #include "device.h"
 #include "fmt_nlist.h"
@@ -83,7 +90,13 @@ __device__ inline uint_64 encoding_nbor_info(const int type,
   // the index of nbor atom(including ghost region) must be smaller than
   // 16777216(1 << 24)
   if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
+#if GOOGLE_CUDA
     asm("trap;");
+#elif TENSORFLOW_USE_ROCM
+    __builtin_trap();
+#else
+#error "should not touch here"
+#endif
   }
   return ((uint_64)type << 57) +
          (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) +
@@ -222,16 +235,16 @@ void format_nbor_list_256(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 4;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -250,16 +263,16 @@ void format_nbor_list_512(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 4;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -278,16 +291,16 @@ void format_nbor_list_1024(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -306,16 +319,16 @@ void format_nbor_list_2048(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -334,16 +347,16 @@ void format_nbor_list_4096(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 16;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE, int THREADS_PER_BLOCK>
@@ -376,9 +389,9 @@ __global__ void compute_env_mat_a(FPTYPE* em,
     const int idx_value = ii * 4;   // 4 components
     const int idx_deriv = ii * 12;  // 4 components time 3 directions
     if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3] = {0};
-      FPTYPE dd[4] = {0};
-      FPTYPE vv[12] = {0};
+      FPTYPE rr[3] = {(FPTYPE)0.};
+      FPTYPE dd[4] = {(FPTYPE)0.};
+      FPTYPE vv[12] = {(FPTYPE)0.};
       const int j_idx = row_nlist[ii];
       for (int kk = 0; kk < 3; kk++) {
         rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
@@ -569,8 +582,8 @@ void format_nbor_list_gpu(int* nlist,
                           const int nall,
                           const float rcut,
                           const std::vector<int> sec) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int LEN = 256;
   const int nnei = sec.back();
   const int nblock = (nloc + LEN - 1) / LEN;
@@ -581,15 +594,15 @@ void format_nbor_list_gpu(int* nlist,
   assert(max_nbor_size == 256 || max_nbor_size == 512 ||
          max_nbor_size == 1024 || max_nbor_size == 2048 ||
          max_nbor_size == 4096);
-  DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
-  DPErrcheck(cudaMemset(key, 0xffffffff,
-                        sizeof(uint_64) * int_64(nloc) * max_nbor_size));
-  DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
-                        cudaMemcpyHostToDevice));
+  DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
+  DPErrcheck(gpuMemset(key, 0xffffffff,
+                       sizeof(uint_64) * int_64(nloc) * max_nbor_size));
+  DPErrcheck(gpuMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
+                       gpuMemcpyHostToDevice));
 
   get_i_idx<<<nblock, LEN>>>(i_idx, nloc, gpu_inlist.ilist);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   if (max_nbor_size == 256) {
     format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
@@ -608,8 +621,8 @@ void format_nbor_list_gpu(int* nlist,
 
   format_nlist_fill_b<<<dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN>>>(
       nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -631,27 +644,27 @@ void prod_env_mat_a_gpu(FPTYPE* em,
                         const float rcut_smth,
                         const std::vector<int> sec,
                         const int* f_type) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   if (f_type == NULL) {
     f_type = type;
   }
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
   DPErrcheck(
-      cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
+      gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
                        array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(cudaGetLastError());
-  nborErrcheck(cudaDeviceSynchronize());
+  nborErrcheck(gpuGetLastError());
+  nborErrcheck(gpuDeviceSynchronize());
 
   compute_env_mat_a<FPTYPE, TPB><<<nloc, TPB>>>(
       em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -672,24 +685,24 @@ void prod_env_mat_r_gpu(FPTYPE* em,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
   DPErrcheck(
-      cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
+      gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
                        array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(cudaGetLastError());
-  nborErrcheck(cudaDeviceSynchronize());
+  nborErrcheck(gpuGetLastError());
+  nborErrcheck(gpuDeviceSynchronize());
 
   compute_env_mat_r<FPTYPE, TPB><<<nloc, TPB>>>(
       em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -703,8 +716,8 @@ void test_encoding_decoding_nbor_info_gpu(uint_64* key,
   const int nblock = (size_of_array + TPB - 1) / TPB;
   encoding_decoding_nbor_info<<<nblock, TPB>>>(
       key, out_type, out_index, in_type, in_dist, in_index, size_of_array);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_env_mat_a_gpu<float>(float* em,
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/gpu/prod_force.cu
similarity index 91%
rename from source/lib/src/cuda/prod_force.cu
rename to source/lib/src/gpu/prod_force.cu
index d85de26394..7b1359b3b0 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/gpu/prod_force.cu
@@ -12,7 +12,7 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force,
   int_64 bid = blockIdx.x;
   unsigned int tid = threadIdx.x;
   for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
-    data[ii] = 0.f;
+    data[ii] = (FPTYPE)0.;
   }
   for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) {
     for (int jj = 0; jj < 3; jj++) {
@@ -64,7 +64,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
   if (j_idx < 0) {
     return;
   }
-  FPTYPE force_tmp = 0.f;
+  FPTYPE force_tmp = (FPTYPE)0.;
   for (int idw = 0; idw < 4; ++idw) {
     force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
@@ -110,15 +110,15 @@ void prod_force_a_gpu(FPTYPE* force,
                       const int nall,
                       const int nnei,
                       const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
+  DPErrcheck(gpuMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB><<<nframes * nloc, TPB>>>(
       force, net_deriv, in_deriv, ndescrpt, nloc, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
@@ -126,8 +126,8 @@ void prod_force_a_gpu(FPTYPE* force,
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -139,15 +139,15 @@ void prod_force_r_gpu(FPTYPE* force,
                       const int nall,
                       const int nnei,
                       const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
+  DPErrcheck(gpuMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB><<<nframes * nloc, TPB>>>(
       force, net_deriv, in_deriv, ndescrpt, nloc, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
@@ -155,8 +155,8 @@ void prod_force_r_gpu(FPTYPE* force,
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_force_a_gpu<float>(float* force,
diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/gpu/prod_force_grad.cu
similarity index 91%
rename from source/lib/src/cuda/prod_force_grad.cu
rename to source/lib/src/gpu/prod_force_grad.cu
index b54676586c..c784d6ba65 100644
--- a/source/lib/src/cuda/prod_force_grad.cu
+++ b/source/lib/src/gpu/prod_force_grad.cu
@@ -88,18 +88,18 @@ void prod_force_grad_a_gpu(FPTYPE* grad_net,
                            const int nloc,
                            const int nnei,
                            const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 4;
   DPErrcheck(
-      cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
+      gpuMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
   const int nblock = (ndescrpt + TPB - 1) / TPB;
   dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(TPB, 1);
   force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
                                                           env_deriv, ndescrpt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 128;
   const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
@@ -107,8 +107,8 @@ void prod_force_grad_a_gpu(FPTYPE* grad_net,
   dim3 thread_grid_(LEN, 4);
   force_grad_wrt_neighbors_a<<<block_grid_, thread_grid_>>>(
       grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -119,18 +119,18 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net,
                            const int nloc,
                            const int nnei,
                            const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 1;
   DPErrcheck(
-      cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
+      gpuMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
   const int nblock = (ndescrpt + TPB - 1) / TPB;
   dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(TPB, 1);
   force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
                                                           env_deriv, ndescrpt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 128;
   const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
@@ -138,8 +138,8 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net,
   dim3 thread_grid_(LEN, 1);
   force_grad_wrt_neighbors_r<<<block_grid_, thread_grid_>>>(
       grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_force_grad_a_gpu<float>(float* grad_net,
diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/gpu/prod_virial.cu
similarity index 91%
rename from source/lib/src/cuda/prod_virial.cu
rename to source/lib/src/gpu/prod_virial.cu
index e96bacf1d3..ab9c5326e3 100644
--- a/source/lib/src/cuda/prod_virial.cu
+++ b/source/lib/src/gpu/prod_virial.cu
@@ -113,10 +113,10 @@ void prod_virial_a_gpu(FPTYPE* virial,
                        const int nloc,
                        const int nall,
                        const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(gpuMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN - 1) / LEN;
@@ -125,12 +125,12 @@ void prod_virial_a_gpu(FPTYPE* virial,
   // compute virial of a frame
   virial_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   // reduction atom_virial to virial
   atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -143,10 +143,10 @@ void prod_virial_r_gpu(FPTYPE* virial,
                        const int nloc,
                        const int nall,
                        const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(gpuMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN - 1) / LEN;
@@ -155,12 +155,12 @@ void prod_virial_r_gpu(FPTYPE* virial,
   // compute virial of a frame
   virial_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   // reduction atom_virial to virial
   atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_virial_a_gpu<float>(float* virial,
diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/gpu/prod_virial_grad.cu
similarity index 92%
rename from source/lib/src/cuda/prod_virial_grad.cu
rename to source/lib/src/gpu/prod_virial_grad.cu
index 047d8ae17f..dac5b20ba8 100644
--- a/source/lib/src/cuda/prod_virial_grad.cu
+++ b/source/lib/src/gpu/prod_virial_grad.cu
@@ -92,18 +92,18 @@ void prod_virial_grad_a_gpu(FPTYPE* grad_net,
                             const int* nlist,
                             const int nloc,
                             const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(gpuMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
   const int LEN = 128;
   const int nblock = (nloc + LEN - 1) / LEN;
   dim3 block_grid(nblock, nnei);
   dim3 thread_grid(LEN, 4);
   virial_grad_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -114,18 +114,18 @@ void prod_virial_grad_r_gpu(FPTYPE* grad_net,
                             const int* nlist,
                             const int nloc,
                             const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei;
-  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(gpuMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
   const int LEN = 128;
   const int nblock = (nloc + LEN - 1) / LEN;
   dim3 block_grid(nblock, nnei);
   dim3 thread_grid(LEN, 1);
   virial_grad_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_virial_grad_a_gpu<float>(float* grad_net,
diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/gpu/region.cu
similarity index 83%
rename from source/lib/src/cuda/region.cu
rename to source/lib/src/gpu/region.cu
index eb8d191a8c..849eecfc3e 100644
--- a/source/lib/src/cuda/region.cu
+++ b/source/lib/src/gpu/region.cu
@@ -27,31 +27,31 @@ template <typename FPTYPE>
 void convert_to_inter_gpu(FPTYPE *ri,
                           const Region<FPTYPE> &region,
                           const FPTYPE *rp) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
 void convert_to_phys_gpu(FPTYPE *rp,
                          const Region<FPTYPE> &region,
                          const FPTYPE *ri) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   _inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
 void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   _compute_volume<<<1, 1>>>(volume, region.boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void convert_to_inter_gpu<float>(float *ri,
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/gpu/tabulate.cu
similarity index 86%
rename from source/lib/src/cuda/tabulate.cu
rename to source/lib/src/gpu/tabulate.cu
index 30695a6e05..f424006940 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/gpu/tabulate.cu
@@ -4,9 +4,49 @@
 #define MM 4
 #define KK 4
 #define TPB 256
+#if GOOGLE_CUDA
 #define WARP_SIZE 32
+#elif TENSORFLOW_USE_ROCM
+// See https://github.com/pytorch/pytorch/pull/64302
+#define WARP_SIZE warpSize  // = 64 or 32 (Defined in hip_runtime.h)
+#else
+#error "should not touch here"
+#endif
 #define FULL_MASK 0xffffffff
 
+#if GOOGLE_CUDA
+#define GPU_DYNAMIC_SHARED_MEM_DECL(TYPE, NAME) extern __shared__ TYPE NAME[]
+#elif TENSORFLOW_USE_ROCM
+#define GPU_DYNAMIC_SHARED_MEM_DECL(TYPE, NAME) HIP_DYNAMIC_SHARED(TYPE, NAME)
+#else
+#error "should not touch here"
+#endif
+
+// Copyright 2017 The TensorFlow Authors.
+// Licensed under the Apache License, Version 2.0
+template <typename T>
+__device__ T
+GpuShuffleSync(unsigned mask, T value, int src_lane, int width = warpSize) {
+#if GOOGLE_CUDA
+  return __shfl_sync(mask, value, src_lane, width);
+#elif TENSORFLOW_USE_ROCM
+  return __shfl(value, src_lane, width);
+#else
+#error "should not touch here"
+#endif
+}
+
+__device__ void GpuSyncThreads() {
+#if GOOGLE_CUDA
+  __syncwarp();
+#elif TENSORFLOW_USE_ROCM
+  //__syncwarp();->syncwrap
+  __syncthreads();
+#else
+#error "should not touch here"
+#endif
+}
+
 template <typename FPTYPE>
 __forceinline__ __device__ void locate_xx_se_a(FPTYPE& xx,
                                                int& table_idx,
@@ -110,8 +150,14 @@ __forceinline__ __device__ FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) {
 
 template <typename FPTYPE>
 __forceinline__ __device__ void warp_reduce(FPTYPE& val) {
-  for (int offset = 16; offset > 0; offset >>= 1) {
+  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+#if GOOGLE_CUDA
     val += __shfl_down_sync(FULL_MASK, val, offset);
+#elif TENSORFLOW_USE_ROCM
+    val += __shfl_down(val, offset);  // ########????
+#else
+#error "should not touch here"
+#endif
   }
 }
 
@@ -131,13 +177,25 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     const int last_layer_size,
     const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
+#if TENSORFLOW_USE_ROCM
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data)
+#endif
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
+  FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
-
+#if GOOGLE_CUDA
   FPTYPE sum[MTILE] = {(FPTYPE)0.};
+#elif TENSORFLOW_USE_ROCM
+  FPTYPE* iteratorC = (FPTYPE*)&_data[0];
+  for (int kk = 0; kk < MTILE; kk++) {
+    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
+  }
+  __syncthreads();
+#else
+#error "should not touch here"
+#endif
   int mark_table_idx = -1;
   FPTYPE var[6];
   for (int ii = 0; ii < nnei; ii++) {
@@ -163,8 +221,15 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     }
 
     for (int kk = 0; kk < MTILE; kk++) {
-      sum[kk] += (nnei - breakpoint) *
-                 em[block_idx * nnei * MTILE + ii * MTILE + kk] * res;
+#if GOOGLE_CUDA
+      sum[kk]
+#elif TENSORFLOW_USE_ROCM
+      iteratorC[kk * last_layer_size + thread_idx]
+#else
+#error "should not touch here"
+#endif
+          += (nnei - breakpoint) *
+             em[block_idx * nnei * MTILE + ii * MTILE + kk] * res;
     }
     if (unloop) {
       break;
@@ -173,7 +238,14 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
   }
   for (int ii = 0; ii < MTILE; ii++) {
     out[block_idx * MTILE * last_layer_size + ii * last_layer_size +
-        thread_idx] = sum[ii];
+        thread_idx] =
+#if GOOGLE_CUDA
+        sum[ii];
+#elif TENSORFLOW_USE_ROCM
+        iteratorC[ii * last_layer_size + thread_idx];
+#else
+#error "should not touch here"
+#endif
   }
 }
 
@@ -195,10 +267,10 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     const int last_layer_size,
     const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
-  extern __shared__ int _data[];
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data);
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
+  int warp_idx = GpuShuffleSync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
   int lane_idx = threadIdx.x % WARP_SIZE;
   int breakpoint = nnei - 1;
   bool unloop = false;
@@ -210,7 +282,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     }
   }
   __syncthreads();
-  FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
+  FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   for (int ii = warp_idx; ii < nnei; ii += KTILE) {
     FPTYPE xx = em_x[block_idx * nnei + ii];
     if (ago == xx && is_sorted) {
@@ -252,12 +324,14 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
       res += reg_em[3] * iteratorA[3 * last_layer_size + jj];
       Csub +=
           (nnei - breakpoint) *
-          (var[1] + (2 * var[2] +
-                     (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
+          (var[1] + ((FPTYPE)2. * var[2] +
+                     ((FPTYPE)3. * var[3] +
+                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                         xx) *
                         xx) *
           (enable_se_atten ? res * t + res : res);
     }
-    __syncwarp();
+    GpuSyncThreads();
     for (int kk = 0; kk < MTILE; kk++) {
       warp_reduce(sum[kk]);
     }
@@ -290,10 +364,10 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     const int nnei,
     const int last_layer_size,
     const bool is_sorted) {
-  extern __shared__ int _data[];
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data);
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
+  FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
   FPTYPE* iteratorC = (FPTYPE*)&_data[0];
@@ -323,9 +397,11 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
         (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
             xx;
     FPTYPE res_grad =
-        var[1] +
-        (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
-            xx;
+        var[1] + ((FPTYPE)2. * var[2] +
+                  ((FPTYPE)3. * var[3] +
+                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                      xx) *
+                     xx;
 
     for (int kk = 0; kk < MTILE; kk++) {
       int em_index = block_idx * nnei * MTILE + ii * MTILE + kk;
@@ -403,10 +479,10 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
     const int nnei_i,
     const int nnei_j,
     const int last_layer_size) {
-  extern __shared__ int _data[];
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data);
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
+  int warp_idx = GpuShuffleSync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
   int lane_idx = threadIdx.x % WARP_SIZE;
   FPTYPE* iteratorA = (FPTYPE*)&_data[0];  // dy
   for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) {
@@ -440,7 +516,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
                            xx) *
                           xx);
       }
-      __syncwarp();
+      GpuSyncThreads();
       warp_reduce(sum);
       warp_reduce(Csub);
       if (lane_idx == 0) {
@@ -551,10 +627,9 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     const FPTYPE stride1,
     const int nnei,
     const int last_layer_size) {
-  extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl_sync(0xffffffff, thread_idx / WARP_SIZE, 0);
+  int warp_idx = GpuShuffleSync(0xffffffff, thread_idx / WARP_SIZE, 0);
   int lane_idx = thread_idx % WARP_SIZE;
   __syncthreads();
   for (int ii = warp_idx; ii < nnei; ii += KTILE) {
@@ -568,12 +643,14 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
       load_polynomial_params(var, table, table_idx, jj, last_layer_size);
       Csub +=
-          (var[1] + (2 * var[2] +
-                     (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
+          (var[1] + ((FPTYPE)2. * var[2] +
+                     ((FPTYPE)3. * var[3] +
+                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                         xx) *
                         xx) *
           dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj];
     }
-    __syncwarp();
+    GpuSyncThreads();
 
     warp_reduce(Csub);
     if (lane_idx == 0) {
@@ -595,10 +672,13 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
     const FPTYPE stride1,
     const int nnei,
     const int last_layer_size) {
-  extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
 
+#if TENSORFLOW_USE_ROCM
+  __syncthreads();
+#endif
+
   int mark_table_idx = -1;
   FPTYPE var[6];
   for (int ii = 0; ii < nnei; ii++) {
@@ -610,9 +690,11 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
                              last_layer_size);
     }
     FPTYPE res_grad =
-        var[1] +
-        (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
-            xx;
+        var[1] + ((FPTYPE)2. * var[2] +
+                  ((FPTYPE)3. * var[3] +
+                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                      xx) *
+                     xx;
     mark_table_idx = table_idx;
     dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size +
           thread_idx] = dz_dy_dem[block_idx * nnei + ii] * res_grad;
@@ -634,15 +716,21 @@ void tabulate_fusion_se_a_gpu(FPTYPE* out,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
-      <<<nloc, last_layer_size>>>(out, table, em_x, em, two_embed,
-                                  table_info[0], table_info[1], table_info[2],
-                                  table_info[3], table_info[4], nnei,
-                                  last_layer_size, is_sorted);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+#if GOOGLE_CUDA
+      <<<nloc, last_layer_size>>>
+#elif TENSORFLOW_USE_ROCM
+      <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>
+#else
+#error "should not touch here"
+#endif
+      (out, table, em_x, em, two_embed, table_info[0], table_info[1],
+       table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
+       is_sorted);
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -661,18 +749,18 @@ void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
-  DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
+  DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));
 
   tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0],
           table_info[1], table_info[2], table_info[3], table_info[4], nnei,
           last_layer_size, is_sorted);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -690,16 +778,16 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
   tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0],
           table_info[1], table_info[2], table_info[3], table_info[4], nnei,
           last_layer_size, is_sorted);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -715,14 +803,14 @@ void tabulate_fusion_se_t_gpu(FPTYPE* out,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size>>>(
           out, table, em_x, em, table_info[0], table_info[1], table_info[2],
           table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -740,18 +828,18 @@ void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
-  DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+  DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
 
   tabulate_fusion_se_t_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size>>>(
           dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1],
           table_info[2], table_info[3], table_info[4], nnei_i, nnei_j,
           last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -769,17 +857,17 @@ void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));
 
   tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size>>>(dz_dy, table, em_x, em, dz_dy_dem_x,
                                   dz_dy_dem, table_info[0], table_info[1],
                                   table_info[2], table_info[3], table_info[4],
                                   nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -793,14 +881,14 @@ void tabulate_fusion_se_r_gpu(FPTYPE* out,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size>>>(out, table, em, table_info[0], table_info[1],
                                   table_info[2], table_info[3], table_info[4],
                                   nnei, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -815,16 +903,16 @@ void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));
 
   tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dy_dem, table, em, dy, table_info[0], table_info[1], table_info[2],
           table_info[3], table_info[4], nnei, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -839,16 +927,16 @@ void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   DPErrcheck(
-      cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
+      gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
   tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dz_dy, table, em, dz_dy_dem, table_info[0], table_info[1],
           table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void tabulate_fusion_se_a_gpu<float>(float* out,
diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt
deleted file mode 100644
index 1b093977b6..0000000000
--- a/source/lib/src/rocm/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-# required cmake version
-cmake_minimum_required(VERSION 3.21)
-# project name
-project(deepmd_op_rocm)
-set(CMAKE_LINK_WHAT_YOU_USE TRUE)
-
-# set c++ version c++11
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_HIP_STANDARD 14)
-add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
-add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
-
-message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR})
-
-set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS}
-)# --amdgpu-target=gfx906
-if(HIP_VERSION VERSION_LESS 3.5.1)
-  set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS})
-endif()
-
-file(GLOB SOURCE_FILES "*.hip.cu")
-
-hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES})
-target_include_directories(
-  deepmd_op_rocm
-  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/>
-         $<INSTALL_INTERFACE:include>)
-target_precompile_headers(deepmd_op_rocm PUBLIC [["device.h"]])
-
-install(TARGETS deepmd_op_rocm DESTINATION lib/)
-if(BUILD_CPP_IF)
-  install(
-    TARGETS deepmd_op_rocm
-    EXPORT ${CMAKE_PROJECT_NAME}Targets
-    DESTINATION lib/)
-endif(BUILD_CPP_IF)
-if(BUILD_PY_IF)
-  install(TARGETS deepmd_op_rocm DESTINATION deepmd/lib/)
-endif(BUILD_PY_IF)
diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu
deleted file mode 100644
index 5416022575..0000000000
--- a/source/lib/src/rocm/coord.hip.cu
+++ /dev/null
@@ -1,444 +0,0 @@
-#include "coord.h"
-#include "device.h"
-#include "region.cuh"
-
-__device__ inline int collapse_index(const int *idx, const int *size) {
-  return (idx[0] * size[1] + idx[1]) * size[2] + idx[2];
-}
-__device__ inline void index_recover(const int in_idx,
-                                     const int *size,
-                                     int *idx) {
-  idx[2] = in_idx % size[2];
-  idx[1] = int(in_idx / size[2]) % size[1];
-  idx[0] = int(int(in_idx / size[2]) / size[1]);
-}
-__device__ inline void idx_addshift(int *idx, const int *shift) {
-  for (int dd = 0; dd < 3; dd++) {
-    idx[dd] += shift[dd];
-  }
-}
-__device__ inline void idx_unshift(int *idx, const int *shift) {
-  for (int dd = 0; dd < 3; dd++) {
-    idx[dd] -= shift[dd];
-  }
-}
-__device__ inline int compute_pbc_shift(int idx, int ncell) {
-  int shift = 0;
-  if (idx < 0) {
-    shift = 1;
-    while (idx + shift * ncell < 0) {
-      shift++;
-    }
-  } else if (idx >= ncell) {
-    shift = -1;
-    while (idx + shift * ncell >= ncell) {
-      shift--;
-    }
-  }
-  return shift;
-}
-
-__device__ inline double _fmod(double x, double y) { return fmod(x, y); }
-__device__ inline float _fmod(float x, float y) { return fmodf(x, y); }
-
-template <typename FPTYPE>
-__global__ void normalize_one(FPTYPE *out_c,
-                              const FPTYPE *boxt,
-                              const FPTYPE *rec_boxt,
-                              const int nall) {
-  // <<<nall/TPB, TPB>>>
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy >= nall) {
-    return;
-  }
-  FPTYPE inter[3];
-  phys2Inter(inter, out_c + idy * 3, rec_boxt);
-  for (int dd = 0; dd < 3; ++dd) {
-    inter[dd] = _fmod(inter[dd], (FPTYPE)1.);
-    if (inter[dd] < (FPTYPE)0.) {
-      inter[dd] += (FPTYPE)1.;
-    }
-  }
-  inter2Phys(out_c + idy * 3, inter, boxt);
-}
-
-template <typename FPTYPE>
-__global__ void _fill_idx_cellmap(int *idx_cellmap,
-                                  int *idx_cellmap_noshift,
-                                  const FPTYPE *in_c,
-                                  const FPTYPE *rec_boxt,
-                                  const int *nat_stt,
-                                  const int *nat_end,
-                                  const int *ext_stt,
-                                  const int *ext_end,
-                                  const int nloc) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  int ext_ncell[3];
-  int global_grid[3];
-  int idx_orig_shift[3];
-  FPTYPE cell_size[3];
-  FPTYPE nat_orig[3];
-  for (int dd = 0; dd < 3; ++dd) {
-    ext_ncell[dd] = ext_end[dd] - ext_stt[dd];
-    global_grid[dd] = nat_end[dd] - nat_stt[dd];
-    idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd];
-    cell_size[dd] = (FPTYPE)1. / global_grid[dd];
-    nat_orig[dd] = nat_stt[dd] * cell_size[dd];
-  }
-  if (idy < nloc) {
-    int idx_noshift[3];
-    int idx[3];
-    FPTYPE inter[3];
-    phys2Inter(inter, in_c + idy * 3, rec_boxt);
-    for (int dd = 0; dd < 3; ++dd) {
-      idx_noshift[dd] = (inter[dd] - nat_orig[dd]) / cell_size[dd];
-      if (inter[dd] - nat_orig[dd] < 0.) {
-        idx_noshift[dd]--;
-      }
-      if (idx_noshift[dd] < nat_stt[dd]) {
-        idx_noshift[dd] = nat_stt[dd];
-      } else if (idx_noshift[dd] >= nat_end[dd]) {
-        idx_noshift[dd] = nat_end[dd] - 1;
-      }
-      idx[dd] = idx_noshift[dd] + idx_orig_shift[dd];
-    }
-    idx_cellmap_noshift[idy] = collapse_index(idx_noshift, global_grid);
-    idx_cellmap[idy] = collapse_index(idx, ext_ncell);
-  }
-}
-
-__global__ void _fill_loc_cellnum_map(int *temp_idx_order,
-                                      int *loc_cellnum_map,
-                                      const int *idx_cellmap_noshift,
-                                      const int nloc,
-                                      const int loc_cellnum) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy < loc_cellnum) {
-    int num = 0;
-    for (int ii = 0; ii < nloc; ii++) {
-      if (idx_cellmap_noshift[ii] == idy) {
-        temp_idx_order[ii] = num;
-        num++;
-      }
-    }
-    loc_cellnum_map[idy] = num;
-  }
-}
-
-__global__ void _fill_total_cellnum_map(int *total_cellnum_map,
-                                        int *mask_cellnum_map,
-                                        int *cell_map,
-                                        int *cell_shift_map,
-                                        const int *nat_stt,
-                                        const int *nat_end,
-                                        const int *ext_stt,
-                                        const int *ext_end,
-                                        const int *loc_cellnum_map,
-                                        const int total_cellnum) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  int ext_ncell[3];
-  int global_grid[3];
-  int idx_orig_shift[3];
-  for (int dd = 0; dd < 3; ++dd) {
-    ext_ncell[dd] = ext_end[dd] - ext_stt[dd];
-    global_grid[dd] = nat_end[dd] - nat_stt[dd];
-    idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd];
-  }
-  if (idy < total_cellnum) {
-    int *shift = cell_shift_map + idy * 3;
-    int idx[3];
-    index_recover(idy, ext_ncell, idx);
-    idx_unshift(idx, idx_orig_shift);
-    shift[0] = compute_pbc_shift(idx[0], global_grid[0]);
-    shift[1] = compute_pbc_shift(idx[1], global_grid[1]);
-    shift[2] = compute_pbc_shift(idx[2], global_grid[2]);
-    bool loc = false;
-    if (shift[0] == 0 && shift[1] == 0 && shift[2] == 0) {
-      loc = true;
-    }
-    for (int dd = 0; dd < 3; dd++) {
-      idx[dd] += shift[dd] * global_grid[dd];
-    }
-    int orig_idy = collapse_index(idx, global_grid);
-    mask_cellnum_map[idy] = loc_cellnum_map[orig_idy];
-    total_cellnum_map[idy] = mask_cellnum_map[idy];
-    if (loc) {
-      mask_cellnum_map[idy] = 0;
-    }
-    cell_map[idy] = orig_idy;
-  }
-}
-
-__global__ void _build_loc_clist(int *clist,
-                                 const int *idx_cellmap,
-                                 const int *idx_order,
-                                 const int *sec_num_map,
-                                 const int nloc) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy >= nloc) {
-    return;
-  }
-  int cell_idx = idx_cellmap[idy];
-  int *clist_row = clist + sec_num_map[cell_idx];
-  clist_row[idx_order[idy]] = idy;
-}
-
-template <typename FPTYPE>
-__global__ void _copy_coord(FPTYPE *out_c,
-                            int *out_t,
-                            int *mapping,
-                            const FPTYPE *in_c,
-                            const int *in_t,
-                            const int *cell_map,
-                            const int *cell_shift_map,
-                            const int *sec_loc_cellnum_map,
-                            const int *sec_total_cellnum_map,
-                            const int *loc_clist,
-                            const int nloc,
-                            const int nall,
-                            const int total_cellnum,
-                            const FPTYPE *boxt,
-                            const FPTYPE *rec_boxt) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy >= nall) {
-    return;
-  }
-  if (idy < nloc) {
-    mapping[idy] = idy;
-    out_t[idy] = in_t[idy];
-    for (int dd = 0; dd < 3; dd++) {
-      out_c[idy * 3 + dd] = in_c[idy * 3 + dd];
-    }
-  } else {
-    int cell_idx = 0;
-    int atom_idx = 0;
-    int orig_cell_idx = 0;
-    int orig_idy = 0;
-    int shift[3];
-    FPTYPE d_shift[3];
-    for (int ii = 0; ii < total_cellnum; ii++) {
-      if (idy >= sec_total_cellnum_map[ii + 1]) {
-        cell_idx++;
-      } else {
-        break;
-      }
-    }
-    for (int dd = 0; dd < 3; dd++) {
-      shift[dd] = cell_shift_map[cell_idx * 3 + dd];
-      d_shift[dd] = shift[dd];
-    }
-    atom_idx = idy - sec_total_cellnum_map[cell_idx];
-    orig_cell_idx = cell_map[cell_idx];
-    orig_idy = loc_clist[sec_loc_cellnum_map[orig_cell_idx] + atom_idx];
-    mapping[idy] = orig_idy;
-    out_t[idy] = in_t[orig_idy];
-    FPTYPE shift_v[3];
-    inter2Phys(shift_v, d_shift, boxt);
-    for (int dd = 0; dd < 3; dd++) {
-      out_c[idy * 3 + dd] = in_c[orig_idy * 3 + dd] - shift_v[dd];
-    }
-  }
-}
-
-template <typename FPTYPE>
-void compute_int_data(int *int_data,
-                      const FPTYPE *in_c,
-                      const int *cell_info,
-                      const deepmd::Region<FPTYPE> &region,
-                      const int nloc,
-                      const int loc_cellnum,
-                      const int total_cellnum) {
-  int *idx_cellmap = int_data;
-  int *idx_cellmap_noshift = idx_cellmap + nloc;
-  int *temp_idx_order = idx_cellmap_noshift + nloc;
-  int *loc_cellnum_map = temp_idx_order + nloc;
-  int *total_cellnum_map = loc_cellnum_map + loc_cellnum;
-  int *mask_cellnum_map = total_cellnum_map + total_cellnum;
-  int *cell_map = mask_cellnum_map + total_cellnum;
-  int *cell_shift_map = cell_map + total_cellnum;
-  const int *nat_stt = cell_info;
-  const int *nat_end = cell_info + 3;
-  const int *ext_stt = cell_info + 6;
-  const int *ext_end = cell_info + 9;
-  const FPTYPE *rec_boxt = region.rec_boxt;
-
-  const int nblock_loc = (nloc + TPB - 1) / TPB;
-  hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap,
-                     idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end,
-                     ext_stt, ext_end, nloc);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int nblock_loc_cellnum = (loc_cellnum + TPB - 1) / TPB;
-  hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0,
-                     temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc,
-                     loc_cellnum);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int nblock_total_cellnum = (total_cellnum + TPB - 1) / TPB;
-  hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0,
-                     total_cellnum_map, mask_cellnum_map, cell_map,
-                     cell_shift_map, nat_stt, nat_end, ext_stt, ext_end,
-                     loc_cellnum_map, total_cellnum);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-void build_loc_clist(int *int_data,
-                     const int nloc,
-                     const int loc_cellnum,
-                     const int total_cellnum) {
-  const int nblock = (nloc + TPB - 1) / TPB;
-  const int *idx_cellmap_noshift = int_data + nloc;
-  const int *temp_idx_order = idx_cellmap_noshift + nloc;
-  const int *sec_loc_cellnum_map = temp_idx_order + nloc + loc_cellnum +
-                                   2 * total_cellnum + total_cellnum +
-                                   3 * total_cellnum;
-  int *loc_clist = int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                   total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1;
-  hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist,
-                     idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map,
-                     nloc);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void copy_coord(FPTYPE *out_c,
-                int *out_t,
-                int *mapping,
-                const int *int_data,
-                const FPTYPE *in_c,
-                const int *in_t,
-                const int nloc,
-                const int nall,
-                const int loc_cellnum,
-                const int total_cellnum,
-                const deepmd::Region<FPTYPE> &region) {
-  const int nblock = (nall + TPB - 1) / TPB;
-  const int *cell_map = int_data + 3 * nloc + loc_cellnum + 2 * total_cellnum;
-  const int *cell_shift_map = cell_map + total_cellnum;
-  const int *sec_loc_cellnum_map = cell_shift_map + 3 * total_cellnum;
-  const int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1;
-  const int *loc_clist = sec_total_cellnum_map + total_cellnum + 1;
-
-  const FPTYPE *boxt = region.boxt;
-  const FPTYPE *rec_boxt = region.rec_boxt;
-  hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping,
-                     in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map,
-                     sec_total_cellnum_map, loc_clist, nloc, nall,
-                     total_cellnum, boxt, rec_boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void normalize_coord_gpu(FPTYPE *coord,
-                         const int natom,
-                         const Region<FPTYPE> &region) {
-  const FPTYPE *boxt = region.boxt;
-  const FPTYPE *rec_boxt = region.rec_boxt;
-  const int nblock = (natom + TPB - 1) / TPB;
-  hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt,
-                     natom);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-int copy_coord_gpu(FPTYPE *out_c,
-                   int *out_t,
-                   int *mapping,
-                   int *nall,
-                   int *int_data,
-                   const FPTYPE *in_c,
-                   const int *in_t,
-                   const int &nloc,
-                   const int &mem_nall,
-                   const int &loc_cellnum,
-                   const int &total_cellnum,
-                   const int *cell_info,
-                   const Region<FPTYPE> &region) {
-  compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
-                   total_cellnum);
-  int *int_data_cpu = new int
-      [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum +
-       1];  // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map
-  DPErrcheck(hipMemcpy(int_data_cpu, int_data + 3 * nloc,
-                       sizeof(int) * (loc_cellnum + 2 * total_cellnum),
-                       hipMemcpyDeviceToHost));
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  int *loc_cellnum_map = int_data_cpu;
-  int *total_cellnum_map = loc_cellnum_map + loc_cellnum;
-  int *mask_cellnum_map = total_cellnum_map + total_cellnum;
-  int *sec_loc_cellnum_map = mask_cellnum_map + total_cellnum;
-  int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1;
-  sec_loc_cellnum_map[0] = 0;
-  sec_total_cellnum_map[0] = nloc;
-  int max_cell = 0;
-  for (int iii = 0; iii < total_cellnum; iii++) {
-    if (max_cell < total_cellnum_map[iii]) {
-      max_cell = total_cellnum_map[iii];
-    }
-    if (iii < loc_cellnum) {
-      sec_loc_cellnum_map[iii + 1] =
-          sec_loc_cellnum_map[iii] + loc_cellnum_map[iii];
-    }
-    sec_total_cellnum_map[iii + 1] =
-        sec_total_cellnum_map[iii] + mask_cellnum_map[iii];
-  }
-  *nall = sec_total_cellnum_map[total_cellnum];
-  if (*nall > mem_nall) {
-    delete[] int_data_cpu;
-    // size of the output arrays is not large enough
-    return 1;
-  } else {
-    DPErrcheck(hipMemcpy(int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                             total_cellnum * 3,
-                         sec_loc_cellnum_map,
-                         sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1),
-                         hipMemcpyHostToDevice));
-    delete[] int_data_cpu;
-    build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum);
-    copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall,
-               loc_cellnum, total_cellnum, region);
-  }
-  return 0;
-}
-
-template void normalize_coord_gpu<float>(float *coord,
-                                         const int natom,
-                                         const Region<float> &region);
-template void normalize_coord_gpu<double>(double *coord,
-                                          const int natom,
-                                          const Region<double> &region);
-template int copy_coord_gpu<float>(float *out_c,
-                                   int *out_t,
-                                   int *mapping,
-                                   int *nall,
-                                   int *int_data,
-                                   const float *in_c,
-                                   const int *in_t,
-                                   const int &nloc,
-                                   const int &mem_nall,
-                                   const int &loc_cellnum,
-                                   const int &total_cellnum,
-                                   const int *cell_info,
-                                   const Region<float> &region);
-template int copy_coord_gpu<double>(double *out_c,
-                                    int *out_t,
-                                    int *mapping,
-                                    int *nall,
-                                    int *int_data,
-                                    const double *in_c,
-                                    const int *in_t,
-                                    const int &nloc,
-                                    const int &mem_nall,
-                                    const int &loc_cellnum,
-                                    const int &total_cellnum,
-                                    const int *cell_info,
-                                    const Region<double> &region);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu
deleted file mode 100644
index 76657eea52..0000000000
--- a/source/lib/src/rocm/gelu.hip.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-#include "device.h"
-#include "gelu.h"
-
-__device__ inline double _tanh(double x) { return tanh(x); }
-__device__ inline float _tanh(float x) { return tanhf(x); }
-
-template <typename FPTYPE>
-__global__ void gelu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
-  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx >= size) {
-    return;
-  }
-  out[idx] = xx[idx] * (FPTYPE)0.5 *
-             ((FPTYPE)1.0 +
-              _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] *
-                                                       xx[idx] * xx[idx])));
-}
-
-template <typename FPTYPE>
-__global__ void gelu_grad(FPTYPE* out,
-                          const FPTYPE* xx,
-                          const FPTYPE* dy,
-                          const int_64 size) {
-  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx >= size) {
-    return;
-  }
-  // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 *
-  // xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var =
-      _tanh((FPTYPE)SQRT_2_PI *
-            (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx]));
-  out[idx] =
-      dy[idx] *
-      ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) *
-           ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) +
-       (FPTYPE)0.5 * var + (FPTYPE)0.5);
-}
-
-template <typename FPTYPE>
-__global__ void gelu_grad_grad(FPTYPE* out,
-                               const FPTYPE* xx,
-                               const FPTYPE* dy,
-                               const FPTYPE* dy_2,
-                               const int_64 size) {
-  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx >= size) {
-    return;
-  }
-  // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 *
-  // xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var1 =
-      _tanh((FPTYPE)SQRT_2_PI *
-            (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx]));
-  const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) *
-                      ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.);
-  out[idx] = dy[idx] * dy_2[idx] *
-             ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] *
-                  ((FPTYPE)1. - var1 * var1) -
-              (FPTYPE)SQRT_2_PI * xx[idx] * var2 *
-                  ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 +
-              var2);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
-  if (size <= 0) {
-    return;
-  }
-  const int THREAD_ITEMS = 1024;
-  const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
-
-  hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void gelu_grad_gpu(FPTYPE* out,
-                   const FPTYPE* xx,
-                   const FPTYPE* dy,
-                   const int_64 size) {
-  if (size <= 0) {
-    return;
-  }
-  const int THREAD_ITEMS = 1024;
-  const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
-
-  hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy,
-                     size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void gelu_grad_grad_gpu(FPTYPE* out,
-                        const FPTYPE* xx,
-                        const FPTYPE* dy,
-                        const FPTYPE* dy_2,
-                        const int_64 size) {
-  if (size <= 0) {
-    return;
-  }
-  const int THREAD_ITEMS = 1024;
-  const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
-
-  hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx,
-                     dy, dy_2, size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void gelu_gpu<float>(float* out, const float* x, const int_64 size);
-template void gelu_gpu<double>(double* out, const double* x, const int_64 size);
-template void gelu_grad_gpu<float>(float* out,
-                                   const float* x,
-                                   const float* dy,
-                                   const int_64 size);
-template void gelu_grad_gpu<double>(double* out,
-                                    const double* x,
-                                    const double* dy,
-                                    const int_64 size);
-template void gelu_grad_grad_gpu<float>(float* out,
-                                        const float* x,
-                                        const float* dy,
-                                        const float* dy_2,
-                                        const int_64 size);
-template void gelu_grad_grad_gpu<double>(double* out,
-                                         const double* x,
-                                         const double* dy,
-                                         const double* dy_2,
-                                         const int_64 size);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu
deleted file mode 100644
index 736f2f9e9a..0000000000
--- a/source/lib/src/rocm/neighbor_list.hip.cu
+++ /dev/null
@@ -1,296 +0,0 @@
-#include "device.h"
-#include "hipcub/hipcub.hpp"
-#include "neighbor_list.h"
-// A stateful callback functor that maintains a running prefix to be applied
-// during consecutive scan operations.
-struct parallel_prefix_scan_op {
-  // Running prefix
-  int running_total;
-  // Constructor
-  __device__ parallel_prefix_scan_op(int running_total)
-      : running_total(running_total) {}
-  // Callback operator to be entered by the first warp of threads in the block.
-  // Thread-0 is responsible for returning a value for seeding the block-wide
-  // scan.
-  __device__ int operator()(int block_aggregate) {
-    int old_prefix = running_total;
-    running_total += block_aggregate;
-    return old_prefix;
-  }
-};
-
-template <int THREADS_PER_BLOCK>
-__global__ void parallel_prefix_scan(int *numneigh,
-                                     int *nei_order,
-                                     const int *temp_nlist,
-                                     const int mem_size,
-                                     const int nloc,
-                                     const int nall) {
-  // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128
-  // threads, 4 ints per thread
-  typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScan;
-  // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-  __shared__ typename BlockScan::TempStorage temp_storage;
-
-  // Initialize running total
-  parallel_prefix_scan_op prefix_op(0);
-
-  // Have the block iterate over segments of items
-  for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) {
-    int block_offset = blockIdx.x * mem_size;
-    // Load a segment of consecutive items that are blocked across threads
-    int i_data = temp_nlist[block_offset + ii];
-    int o_data = i_data == -1 ? 0 : 1;
-
-    // Collectively compute the block-wide exclusive prefix sum
-    BlockScan(temp_storage).ExclusiveSum(o_data, o_data, prefix_op);
-
-    __syncthreads();
-    // Store scanned items to output segment
-    if (i_data != -1) {
-      nei_order[block_offset + ii] = o_data;
-    }
-    // Store numneigh into the output array
-    if (ii == nall - 1) {
-      o_data += i_data == -1 ? 0 : 1;
-      numneigh[blockIdx.x] = o_data;
-    }
-  }
-}
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot(FPTYPE *arr1, FPTYPE *arr2) {
-  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
-}
-
-template <typename FPTYPE>
-__global__ void build_nlist(int *ilist,
-                            int *temp_nlist,
-                            const FPTYPE *c_cpy,
-                            const FPTYPE rcut2,
-                            const int nloc,
-                            const int nall,
-                            const int mem_size) {
-  const unsigned int atom_idx = blockIdx.x;
-  const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (neighbor_idx < nall) {
-    int *neighbor_row = temp_nlist + atom_idx * mem_size;
-    if (neighbor_idx == atom_idx) {
-      ilist[atom_idx] = atom_idx;
-    } else {
-      const FPTYPE *ccoord = c_cpy + atom_idx * 3;
-      const FPTYPE *ncoord = c_cpy + neighbor_idx * 3;
-      FPTYPE diff[3];
-      for (int kk = 0; kk < 3; kk++) {
-        diff[kk] = ccoord[kk] - ncoord[kk];
-      }
-      FPTYPE r2 = dev_dot(diff, diff);
-      if (r2 < rcut2) {
-        neighbor_row[neighbor_idx] = neighbor_idx;
-      }
-    }
-  }
-}
-
-__global__ void fill_nlist(int **firstneigh,
-                           const int *temp_nlist,
-                           const int *nei_order,
-                           const int mem_size,
-                           const int nall) {
-  const unsigned int atom_idx = blockIdx.x;
-  const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (neighbor_idx < nall) {
-    const int *in_row = temp_nlist + atom_idx * mem_size;
-    int *out_row = firstneigh[atom_idx];
-    int nei = in_row[neighbor_idx];
-    if (nei != -1) {
-      out_row[nei_order[atom_idx * mem_size + neighbor_idx]] = nei;
-    }
-  }
-}
-
-__global__ void map_nlist(int *nlist,
-                          const int *nlist_map,
-                          const int nloc,
-                          const int nnei) {
-  int atom_idx = blockIdx.x;
-  int nei_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (nei_idx >= nnei) {
-    return;
-  }
-  int nlist_idx = atom_idx * nnei + nei_idx;
-  int nlist_item = nlist[nlist_idx];
-  if (nlist_item != -1) {
-    nlist[nlist_idx] = nlist_map[nlist_item];
-  }
-}
-
-__global__ void map_nei_info(int *nlist,
-                             int *ntype,
-                             bool *nmask,
-                             const int *type,
-                             const int *nlist_map,
-                             const int nloc,
-                             const int nnei,
-                             const int ntypes) {
-  int atom_idx = blockIdx.x;
-  int nei_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (nei_idx >= nnei) {
-    return;
-  }
-  int nlist_idx = atom_idx * nnei + nei_idx;
-  int nlist_item = nlist[nlist_idx];
-  int temp = 0;
-  if (nlist_item != -1) {
-    temp = nlist_map[nlist_item];
-    nlist[nlist_idx] = temp;
-    ntype[nlist_idx] = type[temp];
-    nmask[nlist_idx] = true;
-  } else {
-    ntype[nlist_idx] = ntypes;
-  }
-}
-
-__global__ void map_nei_info_noconvert(int *nlist,
-                                       int *ntype,
-                                       bool *nmask,
-                                       const int *type,
-                                       const int nloc,
-                                       const int nnei,
-                                       const int ntypes) {
-  int atom_idx = blockIdx.x;
-  int nei_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (nei_idx >= nnei) {
-    return;
-  }
-  int nlist_idx = atom_idx * nnei + nei_idx;
-  int nlist_item = nlist[nlist_idx];
-  if (nlist_item != -1) {
-    ntype[nlist_idx] = type[nlist_item];
-    nmask[nlist_idx] = true;
-  } else {
-    ntype[nlist_idx] = ntypes;
-  }
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-int build_nlist_gpu(InputNlist &nlist,
-                    int *max_list_size,
-                    int *nlist_data,
-                    const FPTYPE *c_cpy,
-                    const int &nloc,
-                    const int &nall,
-                    const int &mem_size,
-                    const float &rcut) {
-  if (mem_size < nall) {
-    return 1;
-  }
-  const int nblock = (nall + TPB - 1) / TPB;
-  int *ilist = nlist.ilist;
-  int *numneigh = nlist.numneigh;
-  int **firstneigh = nlist.firstneigh;
-  DPErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
-  int *temp_nlist = nlist_data;  // nloc*mem_size
-  int *nei_order = temp_nlist + nloc * mem_size;
-  nlist.inum = nloc;
-  FPTYPE rcut2 = rcut * rcut;
-
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, TPB);
-  hipLaunchKernelGGL(build_nlist, block_grid, thread_grid, 0, 0, ilist,
-                     temp_nlist, c_cpy, rcut2, nloc, nall, mem_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(parallel_prefix_scan<TPB>), nloc, TPB, 0,
-                     0, numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, firstneigh,
-                     temp_nlist, nei_order, mem_size, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  int *numneigh_host = new int[nloc];
-  DPErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
-                       hipMemcpyDeviceToHost));
-  int max_nei = 0;
-  for (int ii = 0; ii < nloc; ii++) {
-    if (numneigh_host[ii] > max_nei) {
-      max_nei = numneigh_host[ii];
-    }
-  }
-  *max_list_size = max_nei;
-  delete[] numneigh_host;
-  return 0;
-}
-
-void use_nlist_map(int *nlist,
-                   const int *nlist_map,
-                   const int nloc,
-                   const int nnei) {
-  int nblock = (nnei + TPB - 1) / TPB;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, TPB);
-  hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map,
-                     nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-void use_nei_info_gpu(int *nlist,
-                      int *ntype,
-                      bool *nmask,
-                      const int *type,
-                      const int *nlist_map,
-                      const int nloc,
-                      const int nnei,
-                      const int ntypes,
-                      const bool b_nlist_map) {
-  int nblock = (nnei + TPB - 1) / TPB;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, TPB);
-  DPErrcheck(hipMemset(ntype, 0, sizeof(int) * nloc * nnei));
-  DPErrcheck(hipMemset(nmask, 0, sizeof(bool) * nloc * nnei));
-  if (b_nlist_map) {
-    hipLaunchKernelGGL(map_nei_info, block_grid, thread_grid, 0, 0, nlist,
-                       ntype, nmask, type, nlist_map, nloc, nnei, ntypes);
-  } else {
-    hipLaunchKernelGGL(map_nei_info_noconvert, block_grid, thread_grid, 0, 0,
-                       nlist, ntype, nmask, type, nloc, nnei, ntypes);
-  }
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template int build_nlist_gpu<float>(InputNlist &nlist,
-                                    int *max_list_size,
-                                    int *nlist_data,
-                                    const float *c_cpy,
-                                    const int &nloc,
-                                    const int &nall,
-                                    const int &mem_size,
-                                    const float &rcut);
-template int build_nlist_gpu<double>(InputNlist &nlist,
-                                     int *max_list_size,
-                                     int *nlist_data,
-                                     const double *c_cpy,
-                                     const int &nloc,
-                                     const int &nall,
-                                     const int &mem_size,
-                                     const float &rcut);
-__global__ void map_filter_ftype(int *ftype_out,
-                                 const int *ftype_in,
-                                 const int nloc) {
-  int ii = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ii < nloc) {
-    ftype_out[ii] = ftype_in[ii] >= 0 ? 0 : -1;
-  }
-}
-
-void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) {
-  int nblock = (nloc + TPB - 1) / TPB;
-  map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu
deleted file mode 100644
index 23e8ce1d0e..0000000000
--- a/source/lib/src/rocm/prod_env_mat.hip.cu
+++ /dev/null
@@ -1,821 +0,0 @@
-#include "device.h"
-#include "fmt_nlist.h"
-#include "hipcub/hipcub.hpp"
-#include "prod_env_mat.h"
-
-__device__ inline double _sqrt(double x) { return sqrt(x); }
-__device__ inline float _sqrt(float x) { return sqrtf(x); }
-__device__ inline double _rsqrt(double x) { return rsqrt(x); }
-__device__ inline float _rsqrt(float x) { return rsqrtf(x); }
-
-// common part of prod_env_mat
-template <typename Key, int BLOCK_THREADS, int ITEMS_PER_THREAD>
-__launch_bounds__(BLOCK_THREADS) __global__
-    void BlockSortKernel(Key* d_in,
-                         Key* d_out)  // Tile of output
-{
-  enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
-  // Specialize BlockLoad type for our thread block (uses warp-striped loads for
-  // coalescing, then transposes in shared memory to a blocked arrangement)
-  typedef hipcub::BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD,
-                            hipcub::BLOCK_LOAD_WARP_TRANSPOSE>
-      BlockLoadT;
-  // Specialize BlockRadixSort type for our thread block
-  typedef hipcub::BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD>
-      BlockRadixSortT;
-  // Shared memory
-  __shared__ union TempStorage {
-    typename BlockLoadT::TempStorage load;
-    typename BlockRadixSortT::TempStorage sort;
-  } temp_storage;
-  // Per-thread tile items
-  Key items[ITEMS_PER_THREAD];
-  // Our current block's offset
-  int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE;
-  // Load items into a blocked arrangement
-  BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-  // Barrier for smem reuse
-  __syncthreads();
-  // Sort keys
-  BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
-  // Store output in striped fashion
-  hipcub::StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset,
-                                            items);
-}
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) {
-  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
-}
-
-template <typename FPTYPE>
-__device__ inline void spline5_switch(
-    FPTYPE& vv, FPTYPE& dd, FPTYPE& xx, const float& rmin, const float& rmax) {
-  if (xx < rmin) {
-    dd = (FPTYPE)0.;
-    vv = (FPTYPE)1.;
-  } else if (xx < rmax) {
-    FPTYPE uu = (xx - rmin) / (rmax - rmin);
-    FPTYPE du = (FPTYPE)1. / (rmax - rmin);
-    vv = uu * uu * uu *
-             ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) +
-         (FPTYPE)1.;
-    dd = ((FPTYPE)3. * uu * uu *
-              ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) +
-          uu * uu * uu * ((FPTYPE)-12. * uu + (FPTYPE)15.)) *
-         du;
-  } else {
-    dd = (FPTYPE)0.;
-    vv = (FPTYPE)0.;
-  }
-}
-
-template <typename FPTYPE>
-__device__ inline uint_64 encoding_nbor_info(const int type,
-                                             const FPTYPE dist,
-                                             const int index) {
-  // nbor info checking:
-  // the type of nbor atom must be smaller than 128
-  // the distance of center atom between nbor atom must be smaller than 128
-  // the index of nbor atom(including ghost region) must be smaller than
-  // 16777216(1 << 24)
-  if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
-    __builtin_trap();
-  }
-  return ((uint_64)type << 57) +
-         (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) +
-         index;
-}
-
-__device__ inline void decoding_nbor_info(int& type,
-                                          int& index,
-                                          const uint_64 key) {
-  type = key >> 57;
-  index = key & 0xFFFFFF;
-}
-
-template <typename FPTYPE>
-__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) {
-  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= nloc) {
-    return;
-  }
-  i_idx[ilist[idx]] = idx;
-}
-
-template <typename FPTYPE>
-__global__ void format_nlist_fill_a(uint_64* key,
-                                    const FPTYPE* coord,
-                                    const int* type,
-                                    const int* numneigh,
-                                    int** firstneigh,
-                                    const float rcut,
-                                    int* i_idx,
-                                    const int MAX_NBOR_SIZE) {
-  // <<<nloc, MAX_NBOR_SIZE>>>
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
-
-  const int nsize = numneigh[i_idx[idx]];
-  if (idy >= nsize) {
-    return;
-  }
-
-  const int* nei_idx = firstneigh[i_idx[idx]];
-  // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
-  uint_64* key_in = key + idx * MAX_NBOR_SIZE;
-  FPTYPE diff[3];
-  const int& j_idx = nei_idx[idy];
-  if (type[j_idx] < 0) {
-    return;
-  }
-  for (int dd = 0; dd < 3; dd++) {
-    diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
-  }
-  FPTYPE rr = _sqrt(dev_dot(diff, diff));
-  if (rr <= rcut) {
-    key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
-  }
-}
-
-template <typename FPTYPE>
-__global__ void fill_nei_iter(int* nei_iter_dev,
-                              const FPTYPE* key,
-                              const int nloc,
-                              const int max_nbor_size,
-                              const int sec_size) {
-  int_64 row = blockIdx.x;
-  int col = blockIdx.y * blockDim.x + threadIdx.x;
-  const FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size;
-  int nei_type_cur = -1, nbor_idx_cur = 0;
-  int nei_type_pre = -1, nbor_idx_pre = 0;
-  if (col < max_nbor_size && key_out[col] != key_out[max_nbor_size - 1]) {
-    if (col >= 1) {
-      decoding_nbor_info(nei_type_pre, nbor_idx_pre, key_out[col - 1]);
-    }
-    decoding_nbor_info(nei_type_cur, nbor_idx_cur, key_out[col]);
-  }
-  if (nei_type_cur != nei_type_pre) {
-    nei_iter_dev[row * sec_size + nei_type_cur] = col;
-  }
-}
-
-template <typename FPTYPE>
-__global__ void format_nlist_fill_b(int* nlist,
-                                    const int nlist_size,
-                                    const int nloc,
-                                    FPTYPE* key,
-                                    const int* sec,
-                                    const int sec_size,
-                                    int* nei_iter_dev,
-                                    const int max_nbor_size) {
-  int_64 row = blockIdx.x;
-  int col = blockIdx.y * blockDim.x + threadIdx.x;
-  int* nei_iter = nei_iter_dev + row * sec_size;
-  FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size;
-  int* row_nlist = nlist + row * nlist_size;
-  if (col < max_nbor_size) {
-    if (key_out[col] != key_out[max_nbor_size - 1]) {
-      int nei_type = 0, nbor_idx = 0;
-      decoding_nbor_info(nei_type, nbor_idx, key_out[col]);
-      int out_indx = col - nei_iter[nei_type] + sec[nei_type];
-      if (out_indx < sec[nei_type + 1]) {
-        row_nlist[out_indx] = nbor_idx;
-      }
-    }
-  }
-}
-
-template <typename FPTYPE>
-__global__ void encoding_decoding_nbor_info(uint_64* key,
-                                            int* out_type,
-                                            int* out_index,
-                                            const int* in_type,
-                                            const FPTYPE* in_dist,
-                                            const int* in_index,
-                                            const int size_of_array) {
-  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= size_of_array) {
-    return;
-  }
-
-  key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]);
-  decoding_nbor_info(out_type[idx], out_index[idx], key[idx]);
-}
-
-template <typename FPTYPE>
-void format_nbor_list_256(uint_64* key,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          const int& nloc,
-                          const float& rcut,
-                          int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 256;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 4;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_512(uint_64* key,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          const int& nloc,
-                          const float& rcut,
-                          int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 512;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 4;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_1024(uint_64* key,
-                           const FPTYPE* coord,
-                           const int* type,
-                           const deepmd::InputNlist& gpu_inlist,
-                           const int& nloc,
-                           const float& rcut,
-                           int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 1024;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 8;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_2048(uint_64* key,
-                           const FPTYPE* coord,
-                           const int* type,
-                           const deepmd::InputNlist& gpu_inlist,
-                           const int& nloc,
-                           const float& rcut,
-                           int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 2048;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 8;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_4096(uint_64* key,
-                           const FPTYPE* coord,
-                           const int* type,
-                           const deepmd::InputNlist& gpu_inlist,
-                           const int& nloc,
-                           const float& rcut,
-                           int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 4096;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 16;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void compute_env_mat_a(FPTYPE* em,
-                                  FPTYPE* em_deriv,
-                                  FPTYPE* rij,
-                                  const FPTYPE* coord,
-                                  const FPTYPE* avg,
-                                  const FPTYPE* std,
-                                  const int* type,
-                                  const int* nlist,
-                                  const int nnei,
-                                  const float rmin,
-                                  const float rmax) {
-  // <<<nloc, TPB>>>
-  const int_64 bid = blockIdx.x;
-  const unsigned int tid = threadIdx.x;
-  if (type[bid] < 0) {
-    return;
-  }
-  if (tid >= nnei) {
-    return;
-  }
-  const int ndescrpt = nnei * 4;
-  const int* row_nlist = nlist + bid * nnei;
-  FPTYPE* row_rij = rij + bid * nnei * 3;
-  FPTYPE* row_descript = em + bid * nnei * 4;
-  FPTYPE* row_descript_deriv = em_deriv + bid * nnei * 12;
-  for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) {
-    const int idx_value = ii * 4;   // 4 components
-    const int idx_deriv = ii * 12;  // 4 components time 3 directions
-    if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3] = {0};
-      FPTYPE dd[4] = {0};
-      FPTYPE vv[12] = {0};
-      const int j_idx = row_nlist[ii];
-      for (int kk = 0; kk < 3; kk++) {
-        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
-        row_rij[ii * 3 + kk] = rr[kk];
-      }
-      // const FPTYPE * rr = &row_rij[ii * 3];
-      FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = _rsqrt(nr2);
-      FPTYPE nr = nr2 * inr;
-      FPTYPE inr2 = inr * inr;
-      FPTYPE inr4 = inr2 * inr2;
-      FPTYPE inr3 = inr4 * nr;
-      FPTYPE sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd[0] = ((FPTYPE)1. / nr);  //* sw;
-      dd[1] = (rr[0] / nr2);      //* sw;
-      dd[2] = (rr[1] / nr2);      //* sw;
-      dd[3] = (rr[2] / nr2);      //* sw;
-      vv[0] = (rr[0] * inr3 * sw -
-               dd[0] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
-      vv[1] = (rr[1] * inr3 * sw -
-               dd[0] * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
-      vv[2] = (rr[2] * inr3 * sw -
-               dd[0] * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
-      // ****deriv of component x/r2
-      vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw -
-               dd[1] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
-      vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4) * sw -
-               dd[1] * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
-      vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4) * sw -
-               dd[1] * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
-      // ***deriv of component y/r2
-      vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4) * sw -
-               dd[2] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
-      vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw -
-               dd[2] * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
-      vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4) * sw -
-               dd[2] * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
-      // ***deriv of component z/r2
-      vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4) * sw -
-               dd[3] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
-      vv[10] =
-          (((FPTYPE)2. * rr[2] * rr[1] * inr4) * sw -
-           dd[3] * dsw * rr[1] *
-               inr);  // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt
-                      // + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
-      vv[11] =
-          (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw -
-           dd[3] * dsw * rr[2] *
-               inr);  // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt
-                      // + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
-      // 4 value components
-      dd[0] *= sw;  // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt +
-                    // idx_value + 0];
-      dd[1] *= sw;  // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt +
-                    // idx_value + 1];
-      dd[2] *= sw;  // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt +
-                    // idx_value + 2];
-      dd[3] *= sw;  // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt +
-                    // idx_value + 3];
-      for (int ii = 0; ii < 12; ii++) {
-        row_descript_deriv[idx_deriv + ii] =
-            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];
-      }
-      for (int ii = 0; ii < 4; ii++) {
-        row_descript[idx_value + ii] =
-            (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) /
-            std[type[bid] * ndescrpt + idx_value + ii];
-      }
-    } else {
-      // TODO: move it to the memset.
-      row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
-                                 std[type[bid] * ndescrpt + idx_value];
-    }
-  }
-}
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void compute_env_mat_r(FPTYPE* em,
-                                  FPTYPE* em_deriv,
-                                  FPTYPE* rij,
-                                  const FPTYPE* coord,
-                                  const FPTYPE* avg,
-                                  const FPTYPE* std,
-                                  const int* type,
-                                  const int* nlist,
-                                  const int nnei,
-                                  const float rmin,
-                                  const float rmax) {
-  // <<<nloc, TPB>>>
-  const int_64 bid = blockIdx.x;
-  const unsigned int tid = threadIdx.x;
-  if (tid >= nnei) {
-    return;
-  }
-  const int ndescrpt = nnei;
-  const int* row_nlist = nlist + bid * nnei;
-  FPTYPE* row_rij = rij + bid * nnei * 3;
-  FPTYPE* row_em = em + bid * nnei;
-  FPTYPE* row_em_deriv = em_deriv + bid * nnei * 3;
-  for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) {
-    const int idx_value = ii;      // 4 components
-    const int idx_deriv = ii * 3;  // 4 components time 3 directions
-    if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3] = {(FPTYPE)0.};
-      FPTYPE vv[3] = {(FPTYPE)0.};
-      FPTYPE dd = (FPTYPE)0.;
-      const int& j_idx = row_nlist[ii];
-      for (int kk = 0; kk < 3; kk++) {
-        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
-        row_rij[ii * 3 + kk] = rr[kk];
-      }
-      // const FPTYPE * rr = &row_rij[ii * 3];
-      FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = _rsqrt(nr2);
-      FPTYPE nr = nr2 * inr;
-      FPTYPE inr2 = inr * inr;
-      FPTYPE inr4 = inr2 * inr2;
-      FPTYPE inr3 = inr4 * nr;
-      FPTYPE sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd = ((FPTYPE)1. / nr);  //* sw;
-      vv[0] = (rr[0] * inr3 * sw -
-               dd * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
-      vv[1] = (rr[1] * inr3 * sw -
-               dd * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
-      vv[2] = (rr[2] * inr3 * sw -
-               dd * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
-
-      // 4 value components
-      dd *= sw;  // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] *
-                 // ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt +
-                 // idx_value + 0];
-      for (int ii = 0; ii < 3; ii++) {
-        row_em_deriv[idx_deriv + ii] =
-            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];
-      }
-      row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) /
-                          std[type[bid] * ndescrpt + idx_value];
-    } else {
-      // TODO: move it to the memset.
-      row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
-                           std[type[bid] * ndescrpt + idx_value];
-    }
-  }
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void format_nbor_list_gpu(int* nlist,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          int* array_int,
-                          uint_64* array_longlong,
-                          const int max_nbor_size,
-                          const int nloc,
-                          const int nall,
-                          const float rcut,
-                          const std::vector<int> sec) {
-  const int LEN = 256;
-  const int nnei = sec.back();
-  const int nblock = (nloc + LEN - 1) / LEN;
-  int* sec_dev = array_int;
-  int* nei_iter = array_int + sec.size();  // = new int[sec_size];
-  int* i_idx = array_int + sec.size() + nloc * sec.size();
-  uint_64* key = array_longlong;
-  assert(max_nbor_size == 256 || max_nbor_size == 512 || 1024 ||
-         max_nbor_size == 2048 || max_nbor_size == 4096);
-  DPErrcheck(hipMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
-  DPErrcheck(hipMemset(key, 0xffffffff,
-                       sizeof(uint_64) * int_64(nloc) * max_nbor_size));
-  DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
-                       hipMemcpyHostToDevice));
-
-  hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, i_idx, nloc,
-                     gpu_inlist.ilist);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  if (max_nbor_size == 256) {
-    format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 512) {
-    format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 1024) {
-    format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 2048) {
-    format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 4096) {
-    format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  }
-
-  hipLaunchKernelGGL(fill_nei_iter, dim3(nloc, (max_nbor_size + LEN - 1) / LEN),
-                     LEN, 0, 0, nei_iter, key, nloc, max_nbor_size, sec.size());
-
-  hipLaunchKernelGGL(
-      format_nlist_fill_b, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN, 0,
-      0, nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_env_mat_a_gpu(FPTYPE* em,
-                        FPTYPE* em_deriv,
-                        FPTYPE* rij,
-                        int* nlist,
-                        const FPTYPE* coord,
-                        const int* type,
-                        const InputNlist& gpu_inlist,
-                        int* array_int,
-                        uint_64* array_longlong,
-                        const int max_nbor_size,
-                        const FPTYPE* avg,
-                        const FPTYPE* std,
-                        const int nloc,
-                        const int nall,
-                        const float rcut,
-                        const float rcut_smth,
-                        const std::vector<int> sec,
-                        const int* f_type) {
-  if (f_type == NULL) {
-    f_type = type;
-  }
-  const int nnei = sec.back();
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
-  DPErrcheck(
-      hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
-
-  format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
-                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(hipGetLastError());
-  nborErrcheck(hipDeviceSynchronize());
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a<FPTYPE, TPB>), nloc, TPB,
-                     0, 0, em, em_deriv, rij, coord, avg, std, type, nlist,
-                     nnei, rcut_smth, rcut);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_env_mat_r_gpu(FPTYPE* em,
-                        FPTYPE* em_deriv,
-                        FPTYPE* rij,
-                        int* nlist,
-                        const FPTYPE* coord,
-                        const int* type,
-                        const InputNlist& gpu_inlist,
-                        int* array_int,
-                        uint_64* array_longlong,
-                        const int max_nbor_size,
-                        const FPTYPE* avg,
-                        const FPTYPE* std,
-                        const int nloc,
-                        const int nall,
-                        const float rcut,
-                        const float rcut_smth,
-                        const std::vector<int> sec) {
-  const int nnei = sec.back();
-  const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
-  DPErrcheck(
-      hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
-
-  format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
-                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(hipGetLastError());
-  nborErrcheck(hipDeviceSynchronize());
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r<FPTYPE, TPB>), nloc, TPB,
-                     0, 0, em, em_deriv, rij, coord, avg, std, type, nlist,
-                     nnei, rcut_smth, rcut);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                          int* out_type,
-                                          int* out_index,
-                                          const int* in_type,
-                                          const FPTYPE* in_dist,
-                                          const int* in_index,
-                                          const int size_of_array) {
-  const int nblock = (size_of_array + TPB - 1) / TPB;
-  hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key,
-                     out_type, out_index, in_type, in_dist, in_index,
-                     size_of_array);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_env_mat_a_gpu<float>(float* em,
-                                        float* em_deriv,
-                                        float* rij,
-                                        int* nlist,
-                                        const float* coord,
-                                        const int* type,
-                                        const InputNlist& gpu_inlist,
-                                        int* array_int,
-                                        unsigned long long* array_longlong,
-                                        const int max_nbor_size,
-                                        const float* avg,
-                                        const float* std,
-                                        const int nloc,
-                                        const int nall,
-                                        const float rcut,
-                                        const float rcut_smth,
-                                        const std::vector<int> sec,
-                                        const int* f_type);
-template void prod_env_mat_a_gpu<double>(double* em,
-                                         double* em_deriv,
-                                         double* rij,
-                                         int* nlist,
-                                         const double* coord,
-                                         const int* type,
-                                         const InputNlist& gpu_inlist,
-                                         int* array_int,
-                                         unsigned long long* array_longlong,
-                                         const int max_nbor_size,
-                                         const double* avg,
-                                         const double* std,
-                                         const int nloc,
-                                         const int nall,
-                                         const float rcut,
-                                         const float rcut_smth,
-                                         const std::vector<int> sec,
-                                         const int* f_type);
-template void prod_env_mat_r_gpu<float>(float* em,
-                                        float* em_deriv,
-                                        float* rij,
-                                        int* nlist,
-                                        const float* coord,
-                                        const int* type,
-                                        const InputNlist& gpu_inlist,
-                                        int* array_int,
-                                        unsigned long long* array_longlong,
-                                        const int max_nbor_size,
-                                        const float* avg,
-                                        const float* std,
-                                        const int nloc,
-                                        const int nall,
-                                        const float rcut,
-                                        const float rcut_smth,
-                                        const std::vector<int> sec);
-template void prod_env_mat_r_gpu<double>(double* em,
-                                         double* em_deriv,
-                                         double* rij,
-                                         int* nlist,
-                                         const double* coord,
-                                         const int* type,
-                                         const InputNlist& gpu_inlist,
-                                         int* array_int,
-                                         unsigned long long* array_longlong,
-                                         const int max_nbor_size,
-                                         const double* avg,
-                                         const double* std,
-                                         const int nloc,
-                                         const int nall,
-                                         const float rcut,
-                                         const float rcut_smth,
-                                         const std::vector<int> sec);
-template void format_nbor_list_gpu<float>(int* nlist,
-                                          const float* coord,
-                                          const int* type,
-                                          const deepmd::InputNlist& gpu_inlist,
-                                          int* array_int,
-                                          uint_64* array_longlong,
-                                          const int max_nbor_size,
-                                          const int nloc,
-                                          const int nall,
-                                          const float rcut,
-                                          const std::vector<int> sec);
-template void format_nbor_list_gpu<double>(int* nlist,
-                                           const double* coord,
-                                           const int* type,
-                                           const deepmd::InputNlist& gpu_inlist,
-                                           int* array_int,
-                                           uint_64* array_longlong,
-                                           const int max_nbor_size,
-                                           const int nloc,
-                                           const int nall,
-                                           const float rcut,
-                                           const std::vector<int> sec);
-template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                                   int* out_type,
-                                                   int* out_index,
-                                                   const int* in_type,
-                                                   const float* in_dist,
-                                                   const int* in_index,
-                                                   const int size_of_array);
-template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                                   int* out_type,
-                                                   int* out_index,
-                                                   const int* in_type,
-                                                   const double* in_dist,
-                                                   const int* in_index,
-                                                   const int size_of_array);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
deleted file mode 100644
index 5b1f91dd49..0000000000
--- a/source/lib/src/rocm/prod_force.hip.cu
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "device.h"
-#include "prod_force.h"
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void force_deriv_wrt_center_atom(FPTYPE* force,
-                                            const FPTYPE* net_deriv,
-                                            const FPTYPE* in_deriv,
-                                            const int ndescrpt,
-                                            const int nloc,
-                                            const int nall) {
-  __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];
-  int_64 bid = blockIdx.x;
-  unsigned int tid = threadIdx.x;
-  for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
-    data[ii] = (FPTYPE)0.;
-  }
-  for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) {
-    for (int jj = 0; jj < 3; jj++) {
-      data[jj * THREADS_PER_BLOCK + tid] +=
-          net_deriv[bid * ndescrpt + ii] *
-          in_deriv[bid * ndescrpt * 3 + ii * 3 + jj];
-    }
-  }
-  __syncthreads();
-  // do reduction in shared memory
-  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
-    if (tid < ii) {
-      for (int jj = 0; jj < 3; jj++) {
-        data[jj * THREADS_PER_BLOCK + tid] +=
-            data[jj * THREADS_PER_BLOCK + tid + ii];
-      }
-    }
-    __syncthreads();
-  }
-  // write result for this block to global memory
-  const int_64 kk = bid / nloc;  // frame index
-  const int_64 ll = bid % nloc;  // atom index
-  const int_64 i_idx_nall = kk * nall + ll;
-  if (tid == 0) {
-    force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
-    force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
-    force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
-  }
-}
-
-template <typename FPTYPE>
-__global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
-                                            const FPTYPE* net_deriv,
-                                            const FPTYPE* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 4;
-  if (idy >= nnei) {
-    return;
-  }
-  // deriv wrt neighbors
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE force_tmp = (FPTYPE)0.;
-  for (int idw = 0; idw < 4; ++idw) {
-    force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
-                 in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
-  }
-  const int_64 kk = idx / nloc;  // frame index
-  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp);
-}
-
-template <typename FPTYPE>
-__global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
-                                            const FPTYPE* net_deriv,
-                                            const FPTYPE* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 1;
-  if (idy >= nnei) {
-    return;
-  }
-  // deriv wrt neighbors
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  const int_64 kk = idx / nloc;  // frame index
-  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz,
-            net_deriv[idx * ndescrpt + idy] *
-                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_force_a_gpu(FPTYPE* force,
-                      const FPTYPE* net_deriv,
-                      const FPTYPE* in_deriv,
-                      const int* nlist,
-                      const int nloc,
-                      const int nall,
-                      const int nnei,
-                      const int nframes) {
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>),
-                     nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv,
-                     ndescrpt, nloc, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int LEN = 64;
-  const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(LEN, 3);
-  hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0,
-                     force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_force_r_gpu(FPTYPE* force,
-                      const FPTYPE* net_deriv,
-                      const FPTYPE* in_deriv,
-                      const int* nlist,
-                      const int nloc,
-                      const int nall,
-                      const int nnei,
-                      const int nframes) {
-  const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>),
-                     nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv,
-                     ndescrpt, nloc, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int LEN = 64;
-  const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(LEN, 3);
-  hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0,
-                     force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_force_a_gpu<float>(float* force,
-                                      const float* net_deriv,
-                                      const float* in_deriv,
-                                      const int* nlist,
-                                      const int nloc,
-                                      const int nall,
-                                      const int nnei,
-                                      const int nframes);
-template void prod_force_a_gpu<double>(double* force,
-                                       const double* net_deriv,
-                                       const double* in_deriv,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei,
-                                       const int nframes);
-template void prod_force_r_gpu<float>(float* force,
-                                      const float* net_deriv,
-                                      const float* in_deriv,
-                                      const int* nlist,
-                                      const int nloc,
-                                      const int nall,
-                                      const int nnei,
-                                      const int nframes);
-template void prod_force_r_gpu<double>(double* force,
-                                       const double* net_deriv,
-                                       const double* in_deriv,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei,
-                                       const int nframes);
-
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu
deleted file mode 100644
index 2cb7c4f1d6..0000000000
--- a/source/lib/src/rocm/prod_force_grad.hip.cu
+++ /dev/null
@@ -1,168 +0,0 @@
-#include "device.h"
-#include "prod_force_grad.h"
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) {
-  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
-}
-
-template <typename FPTYPE>
-__global__ void force_grad_wrt_center_atom(FPTYPE* grad_net,
-                                           const FPTYPE* grad,
-                                           const FPTYPE* env_deriv,
-                                           const int ndescrpt) {
-  __shared__ FPTYPE grad_one[3];
-  int_64 center_idx = blockIdx.x;
-  unsigned int tid = threadIdx.x;
-  if (tid < 3) {
-    grad_one[tid] = grad[center_idx * 3 + tid];
-  }
-  __syncthreads();
-  unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid;
-  if (descrpt_idx < ndescrpt) {
-    grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot(
-        grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3);
-  }
-}
-
-template <typename FPTYPE>
-__global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net,
-                                           const FPTYPE* grad,
-                                           const FPTYPE* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
-  const unsigned int idy = blockIdx.y;
-  const unsigned int idw = threadIdx.y;
-  if (idx >= nframes * nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  if (j_idx >= nloc) {
-    j_idx = j_idx % nloc;
-  }
-  const int kk = idx / nloc;  // frame index
-  grad_net[idx * nnei * 4 + idy * 4 + idw] +=
-      dev_dot(grad + kk * nloc * 3 + j_idx * 3,
-              env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
-}
-
-template <typename FPTYPE>
-__global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
-                                           const FPTYPE* grad,
-                                           const FPTYPE* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
-  const unsigned int idy = blockIdx.y;
-  if (idx >= nframes * nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  if (j_idx >= nloc) {
-    j_idx = j_idx % nloc;
-  }
-  const int kk = idx / nloc;  // frame index
-  grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3,
-                                        env_deriv + idx * nnei * 3 + idy * 3);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_force_grad_a_gpu(FPTYPE* grad_net,
-                           const FPTYPE* grad,
-                           const FPTYPE* env_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nnei,
-                           const int nframes) {
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(
-      hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
-  const int nblock = (ndescrpt + TPB - 1) / TPB;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(TPB, 1);
-  hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, ndescrpt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int LEN = 128;
-  const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
-  dim3 block_grid_(nblock_, nnei);
-  dim3 thread_grid_(LEN, 4);
-  hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0,
-                     0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_force_grad_r_gpu(FPTYPE* grad_net,
-                           const FPTYPE* grad,
-                           const FPTYPE* env_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nnei,
-                           const int nframes) {
-  const int ndescrpt = nnei * 1;
-  DPErrcheck(
-      hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
-  const int nblock = (ndescrpt + TPB - 1) / TPB;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(TPB, 1);
-  hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, ndescrpt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int LEN = 128;
-  const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
-  dim3 block_grid_(nblock_, nnei);
-  dim3 thread_grid_(LEN, 1);
-  hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0,
-                     0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_force_grad_a_gpu<float>(float* grad_net,
-                                           const float* grad,
-                                           const float* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_grad_a_gpu<double>(double* grad_net,
-                                            const double* grad,
-                                            const double* env_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei,
-                                            const int nframes);
-template void prod_force_grad_r_gpu<float>(float* grad_net,
-                                           const float* grad,
-                                           const float* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_grad_r_gpu<double>(double* grad_net,
-                                            const double* grad,
-                                            const double* env_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei,
-                                            const int nframes);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu
deleted file mode 100644
index ff29c07ffb..0000000000
--- a/source/lib/src/rocm/prod_virial.hip.cu
+++ /dev/null
@@ -1,197 +0,0 @@
-#include "device.h"
-#include "prod_virial.h"
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void atom_virial_reduction(FPTYPE* virial,
-                                      const FPTYPE* atom_virial,
-                                      const int nall) {
-  unsigned int bid = blockIdx.x;
-  unsigned int tid = threadIdx.x;
-  __shared__ FPTYPE data[THREADS_PER_BLOCK];
-  data[tid] = (FPTYPE)0.;
-  for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) {
-    data[tid] += atom_virial[ii * 9 + bid];
-  }
-  __syncthreads();
-  // do reduction in shared memory
-  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
-    if (tid < ii) {
-      data[tid] += data[tid + ii];
-    }
-    __syncthreads();
-  }
-  // write result for this block to global memory
-  if (tid == 0) {
-    virial[bid] = data[0];
-  }
-}
-
-template <typename FPTYPE>
-__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial,
-                                             FPTYPE* atom_virial,
-                                             const FPTYPE* net_deriv,
-                                             const FPTYPE* in_deriv,
-                                             const FPTYPE* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei) {
-  // idx -> nloc
-  // idy -> nnei
-  // idz = dd0 * 3 + dd1
-  // dd0 = idz / 3
-  // dd1 = idz % 3
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 4;
-  if (idy >= nnei) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE virial_tmp = (FPTYPE)0.;
-  for (int idw = 0; idw < 4; ++idw) {
-    virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
-                  rij[idx * nnei * 3 + idy * 3 + idz % 3] *
-                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
-  }
-  atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp);
-}
-
-template <typename FPTYPE>
-__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial,
-                                             FPTYPE* atom_virial,
-                                             const FPTYPE* net_deriv,
-                                             const FPTYPE* in_deriv,
-                                             const FPTYPE* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei) {
-  // idx -> nloc
-  // idy -> nnei
-  // idz = dd0 * 3 + dd1
-  // dd0 = idz / 3
-  // dd1 = idz % 3
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 1;
-
-  if (idy >= nnei) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  // atomicAdd(
-  //    virial + idz,
-  //    net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3
-  //    + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz %
-  //    3]);
-  atomicAdd(atom_virial + j_idx * 9 + idz,
-            net_deriv[idx * ndescrpt + idy] *
-                rij[idx * nnei * 3 + idy * 3 + idz % 3] *
-                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_virial_a_gpu(FPTYPE* virial,
-                       FPTYPE* atom_virial,
-                       const FPTYPE* net_deriv,
-                       const FPTYPE* in_deriv,
-                       const FPTYPE* rij,
-                       const int* nlist,
-                       const int nloc,
-                       const int nall,
-                       const int nnei) {
-  DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
-
-  const int LEN = 16;
-  int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(LEN, 9);
-  // compute virial of a frame
-  hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0,
-                     0, virial, atom_virial, net_deriv, in_deriv, rij, nlist,
-                     nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  // reduction atom_virial to virial
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction<FPTYPE, TPB>), 9,
-                     TPB, 0, 0, virial, atom_virial, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_virial_r_gpu(FPTYPE* virial,
-                       FPTYPE* atom_virial,
-                       const FPTYPE* net_deriv,
-                       const FPTYPE* in_deriv,
-                       const FPTYPE* rij,
-                       const int* nlist,
-                       const int nloc,
-                       const int nall,
-                       const int nnei) {
-  DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
-
-  const int LEN = 16;
-  int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(LEN, 9);
-  // compute virial of a frame
-  hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0,
-                     0, virial, atom_virial, net_deriv, in_deriv, rij, nlist,
-                     nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  // reduction atom_virial to virial
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction<FPTYPE, TPB>), 9,
-                     TPB, 0, 0, virial, atom_virial, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_virial_a_gpu<float>(float* virial,
-                                       float* atom_virial,
-                                       const float* net_deriv,
-                                       const float* in_deriv,
-                                       const float* rij,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei);
-template void prod_virial_a_gpu<double>(double* virial,
-                                        double* atom_virial,
-                                        const double* net_deriv,
-                                        const double* in_deriv,
-                                        const double* rij,
-                                        const int* nlist,
-                                        const int nloc,
-                                        const int nall,
-                                        const int nnei);
-template void prod_virial_r_gpu<float>(float* virial,
-                                       float* atom_virial,
-                                       const float* net_deriv,
-                                       const float* in_deriv,
-                                       const float* rij,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei);
-template void prod_virial_r_gpu<double>(double* virial,
-                                        double* atom_virial,
-                                        const double* net_deriv,
-                                        const double* in_deriv,
-                                        const double* rij,
-                                        const int* nlist,
-                                        const int nloc,
-                                        const int nall,
-                                        const int nnei);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu
deleted file mode 100644
index d41a1689ce..0000000000
--- a/source/lib/src/rocm/prod_virial_grad.hip.cu
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "device.h"
-#include "prod_virial_grad.h"
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) {
-  FPTYPE result = (FPTYPE)0.0;
-  for (int ii = 0; ii < 9; ii++) {
-    result += arr1[ii] * arr2[ii];
-  }
-  return result;
-}
-
-template <typename FPTYPE>
-__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net,
-                                            const FPTYPE* grad,
-                                            const FPTYPE* env_deriv,
-                                            const FPTYPE* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei) {
-  // idy -> nnei
-  const unsigned int tid = threadIdx.x;
-  const int_64 idx = blockIdx.x * blockDim.x + tid;
-  const unsigned int idy = blockIdx.y;
-  const unsigned int idw = threadIdx.y;
-  const int ndescrpt = nnei * 4;
-  __shared__ FPTYPE grad_one[9];
-  if (tid < 9) {
-    grad_one[tid] = grad[tid];
-  }
-  __syncthreads();
-  if (idx >= nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE tmp[9];
-  for (int dd0 = 0; dd0 < 3; ++dd0) {
-    for (int dd1 = 0; dd1 < 3; ++dd1) {
-      tmp[dd0 * 3 + dd1] =
-          rij[idx * nnei * 3 + idy * 3 + dd1] *
-          env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0];
-    }
-  }
-  grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp);
-}
-
-template <typename FPTYPE>
-__global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net,
-                                            const FPTYPE* grad,
-                                            const FPTYPE* env_deriv,
-                                            const FPTYPE* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei) {
-  // idy -> nnei
-  const unsigned int tid = threadIdx.x;
-  const int_64 idx = blockIdx.x * blockDim.x + tid;
-  const unsigned int idy = blockIdx.y;
-  const int ndescrpt = nnei;
-  __shared__ FPTYPE grad_one[9];
-  if (tid < 9) {
-    grad_one[tid] = grad[tid];
-  }
-  __syncthreads();
-  if (idx >= nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE tmp[9];
-  for (int dd0 = 0; dd0 < 3; ++dd0) {
-    for (int dd1 = 0; dd1 < 3; ++dd1) {
-      tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] *
-                           env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0];
-    }
-  }
-  grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_virial_grad_a_gpu(FPTYPE* grad_net,
-                            const FPTYPE* grad,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nnei) {
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  const int LEN = 128;
-  const int nblock = (nloc + LEN - 1) / LEN;
-  dim3 block_grid(nblock, nnei);
-  dim3 thread_grid(LEN, 4);
-  hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_virial_grad_r_gpu(FPTYPE* grad_net,
-                            const FPTYPE* grad,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nnei) {
-  const int ndescrpt = nnei;
-  DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  const int LEN = 128;
-  const int nblock = (nloc + LEN - 1) / LEN;
-  dim3 block_grid(nblock, nnei);
-  dim3 thread_grid(LEN, 1);
-  hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_virial_grad_a_gpu<float>(float* grad_net,
-                                            const float* grad,
-                                            const float* env_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei);
-template void prod_virial_grad_a_gpu<double>(double* grad_net,
-                                             const double* grad,
-                                             const double* env_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei);
-template void prod_virial_grad_r_gpu<float>(float* grad_net,
-                                            const float* grad,
-                                            const float* env_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei);
-template void prod_virial_grad_r_gpu<double>(double* grad_net,
-                                             const double* grad,
-                                             const double* env_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu
deleted file mode 100644
index de67ef648c..0000000000
--- a/source/lib/src/rocm/region.hip.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-#include "device.h"
-#include "region.cuh"
-#include "region.h"
-
-template <typename FPTYPE>
-__global__ void _phys2Inter(FPTYPE *inter,
-                            const FPTYPE *phys,
-                            const FPTYPE *rec_boxt) {
-  phys2Inter(inter, phys, rec_boxt);
-}
-
-template <typename FPTYPE>
-__global__ void _inter2Phys(FPTYPE *phys,
-                            const FPTYPE *inter,
-                            const FPTYPE *boxt) {
-  inter2Phys(phys, inter, boxt);
-}
-
-template <typename FPTYPE>
-__global__ void _compute_volume(FPTYPE *volume, const FPTYPE *boxt) {
-  volume[0] = compute_volume(boxt);
-}
-
-namespace deepmd {
-// only for unittest
-template <typename FPTYPE>
-void convert_to_inter_gpu(FPTYPE *ri,
-                          const Region<FPTYPE> &region,
-                          const FPTYPE *rp) {
-  hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void convert_to_phys_gpu(FPTYPE *rp,
-                         const Region<FPTYPE> &region,
-                         const FPTYPE *ri) {
-  hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
-  hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void convert_to_inter_gpu<float>(float *ri,
-                                          const Region<float> &region,
-                                          const float *rp);
-template void convert_to_inter_gpu<double>(double *ri,
-                                           const Region<double> &region,
-                                           const double *rp);
-template void convert_to_phys_gpu<float>(float *rp,
-                                         const Region<float> &region,
-                                         const float *ri);
-template void convert_to_phys_gpu<double>(double *rp,
-                                          const Region<double> &region,
-                                          const double *ri);
-template void volume_gpu<float>(float *volume, const Region<float> &region);
-template void volume_gpu<double>(double *volume, const Region<double> &region);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
deleted file mode 100644
index 88a1cbb574..0000000000
--- a/source/lib/src/rocm/tabulate.hip.cu
+++ /dev/null
@@ -1,1036 +0,0 @@
-#include "device.h"
-#include "tabulate.h"
-
-#define MM 4
-#define KK 4
-#define TPB 256
-#define WARP_SIZE 64
-#define FULL_MASK 0xffffffff
-
-template <typename FPTYPE>
-__forceinline__ __device__ void locate_xx(FPTYPE& xx,
-                                          int& table_idx,
-                                          const FPTYPE& lower,
-                                          const FPTYPE& upper,
-                                          const FPTYPE& max,
-                                          const FPTYPE& stride0,
-                                          const FPTYPE& stride1) {
-  if (xx < lower) {
-    table_idx = 0;
-    xx = (FPTYPE)0.;
-  } else if (xx < upper) {
-    table_idx = (int)((xx - lower) / stride0);
-    xx -= (table_idx * stride0 + lower);
-  } else if (xx < max) {
-    int first_stride = int((upper - lower) / stride0);
-    table_idx = first_stride + (int)((xx - upper) / stride1);
-    xx -= ((table_idx - first_stride) * stride1 + upper);
-  } else {
-    table_idx =
-        int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = (FPTYPE)0.;
-  }
-}
-
-template <typename FPTYPE>
-__forceinline__ __device__ void locate_xx_se_t(FPTYPE& xx,
-                                               int& table_idx,
-                                               const FPTYPE& lower,
-                                               const FPTYPE& upper,
-                                               const FPTYPE& min,
-                                               const FPTYPE& max,
-                                               const FPTYPE& stride0,
-                                               const FPTYPE& stride1) {
-  if (xx < min) {
-    table_idx = 0;
-    xx = (FPTYPE)0.;
-  } else if (xx < lower) {
-    table_idx = (int)((xx - min) / stride1);
-    xx -= (table_idx * stride1 + min);
-  } else if (xx < upper) {
-    int first_stride = int((lower - min) / stride1);
-    table_idx = first_stride + (int)((xx - lower) / stride0);
-    xx -= ((table_idx - first_stride) * stride0 + lower);
-  } else if (xx < max) {
-    int first_stride =
-        int((lower - min) / stride1) + int((upper - lower) / stride0);
-    table_idx = first_stride + (int)((xx - upper) / stride1);
-    xx -= ((table_idx - first_stride) * stride1 + upper);
-  } else {
-    table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) +
-                (int)((max - upper) / stride1) - 1;
-    xx = (FPTYPE)0.;
-  }
-}
-
-template <typename FPTYPE>
-__forceinline__ __device__ FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) {
-  return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3];
-}
-
-template <typename FPTYPE>
-__forceinline__ __device__ void warp_reduce(FPTYPE& val) {
-  for (int offset = 32; offset > 0; offset >>= 1) {
-    val += __shfl_down(val, offset);  // ########????
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_a_fifth_order_polynomial(
-    FPTYPE* out,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* two_embed,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted) {
-  bool enable_se_atten = two_embed != nullptr;
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
-  bool unloop = false;
-  int breakpoint = nnei - 1;
-  FPTYPE* iteratorC = (FPTYPE*)&_data[0];
-  for (int kk = 0; kk < MTILE; kk++) {
-    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
-  }
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em_x[block_idx * nnei + ii];
-    if (xx == ago && is_sorted) {
-      unloop = true;
-      breakpoint = ii;
-    }
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res =
-        var[0] +
-        (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-            xx;
-    if (enable_se_atten) {
-      FPTYPE t = two_embed[block_idx * nnei * last_layer_size +
-                           ii * last_layer_size + thread_idx];
-      res = res * t + res;
-    }
-
-    for (int kk = 0; kk < MTILE; kk++) {
-      iteratorC[kk * last_layer_size + thread_idx] +=
-          (nnei - breakpoint) * em[block_idx * nnei * MTILE + ii * MTILE + kk] *
-          res;
-    }
-    if (unloop) {
-      break;
-    }
-  }
-  for (int ii = 0; ii < MTILE; ii++) {
-    out[block_idx * MTILE * last_layer_size + ii * last_layer_size +
-        thread_idx] = iteratorC[ii * last_layer_size + thread_idx];
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
-    FPTYPE* dy_dem_x,
-    FPTYPE* dy_dem,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* two_embed,
-    const FPTYPE* dy,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted) {
-  bool enable_se_atten = two_embed != nullptr;
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl(threadIdx.x / 64, 0);
-  int lane_idx = threadIdx.x % 64;
-  int breakpoint = nnei - 1;
-  bool unloop = false;
-  FPTYPE* iteratorA = (FPTYPE*)&_data[0];  // dy
-  for (int ii = 0; ii < MTILE; ii++) {
-    for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) {
-      iteratorA[ii * last_layer_size + jj] =
-          dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj];
-    }
-  }
-  __syncthreads();
-  FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
-  for (int ii = 0; ii < nnei - warp_idx; ii += KTILE) {
-    FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx];
-    if (ago == xx && is_sorted) {
-      unloop = true;
-      breakpoint = ii + warp_idx;
-    }
-
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    FPTYPE sum[KTILE] = {(FPTYPE)0.};
-    FPTYPE Csub = (FPTYPE)0.;
-    for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
-      FPTYPE var[6];
-      // load iteratorB through table
-      var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5];
-      FPTYPE res =
-          var[0] +
-          (var[1] +
-           (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-              xx;
-      FPTYPE t;
-      if (enable_se_atten) {
-        t = two_embed[block_idx * nnei * last_layer_size +
-                      ii * last_layer_size + jj];
-        res = res * t + res;
-      }
-
-      for (int kk = 0; kk < KTILE; kk++) {
-        sum[kk] +=
-            (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res;
-      }
-      res = em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] *
-            iteratorA[0 * last_layer_size + jj];
-      res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] *
-             iteratorA[1 * last_layer_size + jj];
-      res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] *
-             iteratorA[2 * last_layer_size + jj];
-      res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] *
-             iteratorA[3 * last_layer_size + jj];
-      Csub +=
-          (nnei - breakpoint) *
-          (var[1] + ((FPTYPE)2. * var[2] +
-                     ((FPTYPE)3. * var[3] +
-                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                         xx) *
-                        xx) *
-          (enable_se_atten ? res * t + res : res);
-    }
-    //__syncwarp();->syncwrap
-    __syncthreads();
-    for (int kk = 0; kk < KTILE; kk++) {
-      warp_reduce(sum[kk]);
-    }
-    warp_reduce(Csub);
-    if (lane_idx == 0) {
-      for (int kk = 0; kk < KTILE; kk++) {
-        dy_dem[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk];
-      }
-      dy_dem_x[block_idx * nnei + ii + warp_idx] = Csub;
-    }
-    if (unloop) {
-      break;
-    }
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
-    FPTYPE* dz_dy,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* dz_dy_dem_x,
-    const FPTYPE* dz_dy_dem,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted) {
-  extern __shared__ int _data[];
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
-  bool unloop = false;
-  int breakpoint = nnei - 1;
-  FPTYPE* iteratorC = (FPTYPE*)&_data[0];
-  for (int kk = 0; kk < MTILE; kk++) {
-    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
-  }
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em_x[block_idx * nnei + ii];
-    FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii];
-    if (xx == ago && is_sorted) {
-      unloop = true;
-      breakpoint = ii;
-    }
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res =
-        var[0] +
-        (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-            xx;
-    FPTYPE res_grad =
-        var[1] + ((FPTYPE)2. * var[2] +
-                  ((FPTYPE)3. * var[3] +
-                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                      xx) *
-                     xx;
-
-    for (int kk = 0; kk < MTILE; kk++) {
-      int em_index = block_idx * nnei * MTILE + ii * MTILE + kk;
-      iteratorC[kk * last_layer_size + thread_idx] +=
-          (nnei - breakpoint) *
-          (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res);
-    }
-    if (unloop) {
-      break;
-    }
-  }
-  for (int ii = 0; ii < MTILE; ii++) {
-    dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size +
-          thread_idx] = iteratorC[ii * last_layer_size + thread_idx];
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_t_fifth_order_polynomial(
-    FPTYPE* out,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  FPTYPE sum = (FPTYPE)0.;
-  for (int ii = 0; ii < nnei_i; ii++) {
-    for (int jj = 0; jj < nnei_j; jj++) {
-      FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE tmp = xx;
-      int table_idx = 0;
-      locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      FPTYPE var[6];
-      var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-      FPTYPE res =
-          var[0] +
-          (var[1] +
-           (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-              xx;
-
-      sum += tmp * res;
-    }
-  }
-  out[block_idx * last_layer_size + thread_idx] = sum;
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
-    FPTYPE* dy_dem_x,
-    FPTYPE* dy_dem,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* dy,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl(threadIdx.x / 64, 0);
-  int lane_idx = threadIdx.x % 64;
-  FPTYPE* iteratorA = (FPTYPE*)&_data[0];  // dy
-  for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) {
-    iteratorA[ii] = dy[block_idx * last_layer_size + ii];
-  }
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei_i; ii++) {
-    FPTYPE ago =
-        __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
-    for (int jj = warp_idx; jj < nnei_j; jj += KTILE) {
-      FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE tmp = xx;
-      int table_idx = 0;
-      locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      FPTYPE sum = (FPTYPE)0.;
-      FPTYPE Csub = (FPTYPE)0.;
-      for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) {
-        FPTYPE var[6];
-        // load iteratorB through table
-        var[0] = table[table_idx * last_layer_size * 6 + 6 * kk + 0];
-        var[1] = table[table_idx * last_layer_size * 6 + 6 * kk + 1];
-        var[2] = table[table_idx * last_layer_size * 6 + 6 * kk + 2];
-        var[3] = table[table_idx * last_layer_size * 6 + 6 * kk + 3];
-        var[4] = table[table_idx * last_layer_size * 6 + 6 * kk + 4];
-        var[5] = table[table_idx * last_layer_size * 6 + 6 * kk + 5];
-        FPTYPE res =
-            var[0] +
-            (var[1] +
-             (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-                xx;
-
-        sum += iteratorA[kk] * res;
-        Csub +=
-            iteratorA[kk] * tmp *
-            (var[1] + ((FPTYPE)2. * var[2] +
-                       ((FPTYPE)3. * var[3] +
-                        ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                           xx) *
-                          xx);
-      }
-      __syncthreads();
-      warp_reduce(sum);
-      warp_reduce(Csub);
-      if (lane_idx == 0) {
-        dy_dem[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = sum;
-        dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = Csub;
-      }
-    }
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(
-    FPTYPE* dz_dy,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* dz_dy_dem_x,
-    const FPTYPE* dz_dy_dem,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size) {
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  FPTYPE sum = (FPTYPE)0.;
-  for (int ii = 0; ii < nnei_i; ii++) {
-    FPTYPE ago =
-        __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
-    for (int jj = 0; ii < nnei_j; jj++) {
-      FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE tmp = xx;
-      FPTYPE dz_xx =
-          dz_dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE dz_em = dz_dy_dem[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE var[6];
-
-      int table_idx = 0;
-      locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-      FPTYPE res =
-          var[0] +
-          (var[1] +
-           (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-              xx;
-      FPTYPE res_grad =
-          var[1] + ((FPTYPE)2. * var[2] +
-                    ((FPTYPE)3. * var[3] +
-                     ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                        xx) *
-                       xx;
-
-      sum += (tmp * res_grad * dz_xx + dz_em * res);
-    }
-  }
-  dz_dy[block_idx * last_layer_size + thread_idx] = sum;
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_r_fifth_order_polynomial(
-    FPTYPE* out,
-    const FPTYPE* table,
-    const FPTYPE* em,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em[block_idx * nnei + ii];
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    out[block_idx * nnei * last_layer_size + ii * last_layer_size +
-        thread_idx] =
-        var[0] +
-        (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-            xx;
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
-    FPTYPE* dy_dem,
-    const FPTYPE* table,
-    const FPTYPE* em,
-    const FPTYPE* dy,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl(threadIdx.x / 64, 0);
-  int lane_idx = threadIdx.x % 64;
-
-  for (int ii = 0; ii < nnei; ii += KTILE) {
-    FPTYPE xx = em[block_idx * nnei + ii + warp_idx];
-
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    FPTYPE Csub = 0.f;
-    for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
-      FPTYPE var[6];
-      // load iteratorB through table
-      var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5];
-      Csub +=
-          (var[1] + ((FPTYPE)2. * var[2] +
-                     ((FPTYPE)3. * var[3] +
-                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                         xx) *
-                        xx) *
-          dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj];
-    }
-    //__syncwarp();->syncwrap
-    __syncthreads();
-    warp_reduce(Csub);
-    if (lane_idx == 0) {
-      dy_dem[block_idx * nnei + ii + warp_idx] = Csub;
-    }
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
-    FPTYPE* dz_dy,
-    const FPTYPE* table,
-    const FPTYPE* em,
-    const FPTYPE* dz_dy_dem,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size) {
-  extern __shared__ int _data[];
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em[block_idx * nnei + ii];
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res_grad =
-        var[1] + ((FPTYPE)2. * var[2] +
-                  ((FPTYPE)3. * var[3] +
-                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                      xx) *
-                     xx;
-    dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size +
-          thread_idx] = dz_dy_dem[block_idx * nnei + ii] * res_grad;
-  }
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em_x,
-                              const FPTYPE* em,
-                              const FPTYPE* two_embed,
-                              const int nloc,
-                              const int nnei,
-                              const int last_layer_size,
-                              const bool is_sorted) {
-  if (nloc <= 0) {
-    return;
-  }
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out,
-      table, em_x, em, two_embed, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei, last_layer_size, is_sorted);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
-                                   FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* two_embed,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size,
-                                   const bool is_sorted) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
-  DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));
-
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x,
-      dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1],
-      table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
-      is_sorted);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem_x,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size,
-                                        const bool is_sorted) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM,
-                                                                KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy,
-      table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1],
-      table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
-      is_sorted);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em_x,
-                              const FPTYPE* em,
-                              const int nloc,
-                              const int nnei_i,
-                              const int nnei_j,
-                              const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, last_layer_size, 0, 0, out, table, em_x, em, table_info[0],
-      table_info[1], table_info[2], table_info[3], table_info[4], nnei_i,
-      nnei_j, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
-                                   FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei_i,
-                                   const int nnei_j,
-                                   const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
-  DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
-
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_t_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size, 0, dy_dem_x,
-      dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem_x,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei_i,
-                                        const int nnei_j,
-                                        const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM,
-                                                                KK>),
-      nloc, last_layer_size, 0, 0, dz_dy, table, em_x, em, dz_dy_dem_x,
-      dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3],
-      table_info[4], nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em,
-                              const int nloc,
-                              const int nnei,
-                              const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out,
-      table, em, table_info[0], table_info[1], table_info[2], table_info[3],
-      table_info[4], nnei, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));
-
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem,
-      table, em, dy, table_info[0], table_info[1], table_info[2], table_info[3],
-      table_info[4], nnei, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(
-      hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM,
-                                                                KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy,
-      table, em, dz_dy_dem, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void tabulate_fusion_se_a_gpu<float>(float* out,
-                                              const float* table,
-                                              const float* table_info,
-                                              const float* em_x,
-                                              const float* em,
-                                              const float* two_embed,
-                                              const int nloc,
-                                              const int nnei,
-                                              const int last_layer_size,
-                                              const bool is_sorted);
-template void tabulate_fusion_se_a_gpu<double>(double* out,
-                                               const double* table,
-                                               const double* table_info,
-                                               const double* em_x,
-                                               const double* em,
-                                               const double* two_embed,
-                                               const int nloc,
-                                               const int nnei,
-                                               const int last_layer_size,
-                                               const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu<float>(float* dy_dem_x,
-                                                   float* dy_dem,
-                                                   const float* table,
-                                                   const float* table_info,
-                                                   const float* em_x,
-                                                   const float* em,
-                                                   const float* two_embed,
-                                                   const float* dy,
-                                                   const int nloc,
-                                                   const int nnei,
-                                                   const int last_layer_size,
-                                                   const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu<double>(double* dy_dem_x,
-                                                    double* dy_dem,
-                                                    const double* table,
-                                                    const double* table_info,
-                                                    const double* em_x,
-                                                    const double* em,
-                                                    const double* two_embed,
-                                                    const double* dy,
-                                                    const int nloc,
-                                                    const int nnei,
-                                                    const int last_layer_size,
-                                                    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu<float>(
-    float* dz_dy,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* dz_dy_dem_x,
-    const float* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu<double>(
-    double* dz_dy,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* dz_dy_dem_x,
-    const double* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-
-template void tabulate_fusion_se_t_gpu<float>(float* out,
-                                              const float* table,
-                                              const float* table_info,
-                                              const float* em_x,
-                                              const float* em,
-                                              const int nloc,
-                                              const int nnei_i,
-                                              const int nnei_j,
-                                              const int last_layer_size);
-template void tabulate_fusion_se_t_gpu<double>(double* out,
-                                               const double* table,
-                                               const double* table_info,
-                                               const double* em_x,
-                                               const double* em,
-                                               const int nloc,
-                                               const int nnei_i,
-                                               const int nnei_j,
-                                               const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu<float>(float* dy_dem_x,
-                                                   float* dy_dem,
-                                                   const float* table,
-                                                   const float* table_info,
-                                                   const float* em_x,
-                                                   const float* em,
-                                                   const float* dy,
-                                                   const int nloc,
-                                                   const int nnei_i,
-                                                   const int nnei_j,
-                                                   const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu<double>(double* dy_dem_x,
-                                                    double* dy_dem,
-                                                    const double* table,
-                                                    const double* table_info,
-                                                    const double* em_x,
-                                                    const double* em,
-                                                    const double* dy,
-                                                    const int nloc,
-                                                    const int nnei_i,
-                                                    const int nnei_j,
-                                                    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu<float>(
-    float* dz_dy,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* dz_dy_dem_x,
-    const float* dz_dy_dem,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu<double>(
-    double* dz_dy,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* dz_dy_dem_x,
-    const double* dz_dy_dem,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-
-template void tabulate_fusion_se_r_gpu<float>(float* out,
-                                              const float* table,
-                                              const float* table_info,
-                                              const float* em,
-                                              const int nloc,
-                                              const int nnei,
-                                              const int last_layer_size);
-template void tabulate_fusion_se_r_gpu<double>(double* out,
-                                               const double* table,
-                                               const double* table_info,
-                                               const double* em,
-                                               const int nloc,
-                                               const int nnei,
-                                               const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu<float>(float* dy_dem,
-                                                   const float* table,
-                                                   const float* table_info,
-                                                   const float* em,
-                                                   const float* dy,
-                                                   const int nloc,
-                                                   const int nnei,
-                                                   const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu<double>(double* dy_dem,
-                                                    const double* table,
-                                                    const double* table_info,
-                                                    const double* em,
-                                                    const double* dy,
-                                                    const int nloc,
-                                                    const int nnei,
-                                                    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu<float>(
-    float* dz_dy,
-    const float* table,
-    const float* table_info,
-    const float* em,
-    const float* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu<double>(
-    double* dz_dy,
-    const double* table,
-    const double* table_info,
-    const double* em,
-    const double* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-
-}  // namespace deepmd