From 57cb80c7836eb08cb7700c76022ec3e3a5ed23ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Fri, 21 May 2021 17:17:42 +0800 Subject: [PATCH 01/11] PR --- source/CMakeLists.txt | 35 +- source/api_cc/CMakeLists.txt | 5 + source/api_cc/include/custom_op.h | 4 +- source/api_cc/src/DeepPot.cc | 41 ++ source/api_cc/tests/CMakeLists.txt | 41 +- source/lib/CMakeLists.txt | 7 + source/lib/include/coord.h | 44 ++ source/lib/include/device.h | 6 +- source/lib/include/fmt_nlist.h | 28 + source/lib/include/gelu.h | 23 + source/lib/include/gpu_rocm.h | 115 ++++ source/lib/include/neighbor_list.h | 42 ++ source/lib/include/prod_env_mat.h | 50 ++ source/lib/include/prod_force.h | 22 + source/lib/include/prod_force_grad.h | 19 + source/lib/include/prod_virial.h | 27 + source/lib/include/prod_virial_grad.h | 22 + source/lib/include/region.h | 22 + source/lib/include/tabulate.h | 25 + source/lib/src/neighbor_list.cc | 33 ++ source/lib/src/prod_env_mat.cc | 65 +++ source/lib/src/rocm/CMakeLists.txt | 45 ++ source/lib/src/rocm/coord.hip.cu | 414 ++++++++++++++ source/lib/src/rocm/gelu.hip.cu | 98 ++++ source/lib/src/rocm/neighbor_list.hip.cu | 184 +++++++ source/lib/src/rocm/prod_env_mat.hip.cu | 579 ++++++++++++++++++++ source/lib/src/rocm/prod_force.hip.cu | 161 ++++++ source/lib/src/rocm/prod_force_grad.hip.cu | 143 +++++ source/lib/src/rocm/prod_virial.hip.cu | 176 ++++++ source/lib/src/rocm/prod_virial_grad.hip.cu | 141 +++++ source/lib/src/rocm/region.hip.cu | 69 +++ source/lib/src/rocm/tabulate.hip.cu | 246 +++++++++ source/lib/tests/CMakeLists.txt | 46 ++ source/lib/tests/test_coord.cc | 396 ++++++++++++- source/lib/tests/test_env_mat_a.cc | 213 +++++++ source/lib/tests/test_env_mat_r.cc | 203 +++++++ source/lib/tests/test_fmt_nlist.cc | 155 +++++- source/lib/tests/test_gelu.cc | 70 +++ source/lib/tests/test_neighbor_list.cc | 113 +++- source/lib/tests/test_prod_force_a.cc | 30 + source/lib/tests/test_prod_force_grad_a.cc | 30 + 
source/lib/tests/test_prod_force_grad_r.cc | 30 + source/lib/tests/test_prod_force_r.cc | 30 + source/lib/tests/test_prod_virial_a.cc | 47 ++ source/lib/tests/test_prod_virial_grad_a.cc | 36 +- source/lib/tests/test_prod_virial_grad_r.cc | 33 ++ source/lib/tests/test_prod_virial_r.cc | 47 ++ source/lib/tests/test_simulation_region.cc | 74 ++- source/lib/tests/test_tabulate.cc | 60 ++ source/lmp/env.sh.in | 2 +- source/op/CMakeLists.txt | 37 +- source/op/custom_op.h | 4 +- source/op/gelu_multi_device.cc | 22 +- source/op/prod_env_mat_multi_device.cc | 371 ++++++++++++- source/op/prod_force_grad_multi_device.cc | 16 +- source/op/prod_force_multi_device.cc | 17 +- source/op/prod_virial_grad_multi_device.cc | 16 +- source/op/prod_virial_multi_device.cc | 16 +- source/op/tabulate_multi_device.cc | 16 +- source/op/unaggregated_grad.cc | 16 +- 60 files changed, 5029 insertions(+), 49 deletions(-) create mode 100644 source/lib/include/gpu_rocm.h create mode 100644 source/lib/src/rocm/CMakeLists.txt create mode 100644 source/lib/src/rocm/coord.hip.cu create mode 100644 source/lib/src/rocm/gelu.hip.cu create mode 100644 source/lib/src/rocm/neighbor_list.hip.cu create mode 100644 source/lib/src/rocm/prod_env_mat.hip.cu create mode 100644 source/lib/src/rocm/prod_force.hip.cu create mode 100644 source/lib/src/rocm/prod_force_grad.hip.cu create mode 100644 source/lib/src/rocm/prod_virial.hip.cu create mode 100644 source/lib/src/rocm/prod_virial_grad.hip.cu create mode 100644 source/lib/src/rocm/region.hip.cu create mode 100644 source/lib/src/rocm/tabulate.hip.cu diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 908edd2b0a..e0ab5ad564 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.7) project(DeePMD) set(CMAKE_LINK_WHAT_YOU_USE TRUE) +# note +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") # build cpp or python interfaces if (NOT DEFINED BUILD_CPP_IF) set(BUILD_CPP_IF TRUE) @@ -72,6 
+74,29 @@ if (USE_CUDA_TOOLKIT) add_definitions("-D GOOGLE_CUDA") endif() +#define USE_ROCM_TOOLKIT +if (DEFINED USE_ROCM_TOOLKIT) + if (USE_ROCM_TOOLKIT) + find_package(HIP REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) + else() + message(STATUS "Will not build nv GPU support") + endif() +else() + find_package(HIP QUIET) + if (HIP_FOUND) + set(USE_ROCM_TOOLKIT TRUE) + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") + else() + set(USE_ROCM_TOOLKIT FALSE) + message(STATUS "No rocm support found, will not build AMD GPU support") + endif() +endif() +if (USE_ROCM_TOOLKIT) + add_definitions("-D TENSORFLOW_USE_ROCM") +endif() + # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) @@ -195,9 +220,13 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_OP "deepmd_op") set (LIB_DEEPMD_CC "deepmd_cc") if (USE_CUDA_TOOLKIT) - set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda") - else () - set (LIB_DEEPMD_OP_CUDA "deepmd_op") + set (LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") + else() + if(USE_ROCM_TOOLKIT) + set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") + else () + set (LIB_DEEPMD_OP_DEVICE "deepmd_op") + endif() endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 855732b26b..f2019928c9 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -12,6 +12,7 @@ if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() + file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) @@ -21,6 +22,10 @@ if (USE_CUDA_TOOLKIT) target_link_libraries (${libname} ${CUDA_LIBRARIES}) endif() +if (USE_ROCM_TOOLKIT) + target_link_libraries(${libname} ${HIP_LIBRARIES}) +endif() + install(TARGETS ${libname} DESTINATION lib/) install( diff --git a/source/api_cc/include/custom_op.h 
b/source/api_cc/include/custom_op.h index 60e15234aa..bd27821fb3 100644 --- a/source/api_cc/include/custom_op.h +++ b/source/api_cc/include/custom_op.h @@ -23,12 +23,12 @@ struct DeviceFunctor { { device = "CPU"; } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()( std::string& device, const GPUDevice& d) { device = "GPU"; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; \ No newline at end of file diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index e88357e04a..9400b47691 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -19,6 +19,19 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } #endif +#if TENSORFLOW_USE_ROCM +#include + +#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } +inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) +{ + if (code != hipSuccess) + { + fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) exit(code); + } +} +#endif //TENSORFLOW_USE_ROCM static std::vector cum_sum (const std::vector & n_sel) { @@ -217,6 +230,20 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_ graph::SetDefaultDevice(str, &graph_def); } #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + hipGetDeviceCount(&gpu_num); // check current device environment + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); + std::string str = "/gpu:"; + str += std::to_string(gpu_rank % gpu_num); + graph::SetDefaultDevice(str, &graph_def); + } + #endif // TENSORFLOW_USE_ROCM + check_status (NewSession(options, &session)); check_status (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -524,6 +551,10 @@ 
init (const std::vector & models, const int & gpu_rank, const std:: cudaGetDeviceCount(&gpu_num); #endif // GOOGLE_CUDA + #if TENSORFLOW_USE_ROCM + hipGetDeviceCount(&gpu_num); + #endif //TENSORFLOW_USE_ROCM + SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); @@ -542,6 +573,16 @@ init (const std::vector & models, const int & gpu_rank, const std:: } #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); + } + #endif // TENSORFLOW_USE_ROCM + for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { std::string str = "/gpu:"; diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 8692f5427d..e0785298b8 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -48,8 +48,8 @@ find_package(Threads) # find openmp find_package(OpenMP) if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() # define USE_CUDA_TOOLKIT @@ -76,6 +76,41 @@ if (USE_CUDA_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) endif() +# define USE_ROCM_TOOLKIT +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +if (DEFINED USE_ROCM_TOOLKIT) + if (USE_ROCM_TOOLKIT) + 
find_package(HIP REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) + else() + message(STATUS "Will not build AMD GPU support") + endif() +else() + find_package(HIP QUIET) + if (HIP_FOUND) + set(USE_ROCM_TOOLKIT TRUE) + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") + else() + set(USE_ROCM_TOOLKIT FALSE) + message(STATUS "No rocm support found, will not build AMD GPU support") + endif() +endif() + +if (USE_ROCM_TOOLKIT) + add_definitions("-D TENSORFLOW_USE_ROCM") + include_directories(${ROCM_INCLUDE_DIRS}) + add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +endif() + + file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -93,6 +128,8 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) +elseif(USE_ROCM_TOOLKIT) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread deepmd_op_rocm ${TensorFlow_LIBRARY} coverage_config) else() target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) endif() diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 8d965edde9..8c15c1d64b 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -18,6 +18,13 @@ if (USE_CUDA_TOOLKIT) target_link_libraries (${libname} ${CUDA_LIBRARIES} ${EXTRA_LIBS}) endif() +if (USE_ROCM_TOOLKIT) + add_definitions("-D TENSORFLOW_USE_ROCM") + add_subdirectory(src/rocm) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) + target_link_libraries (${libname} ${HIP_LIBRARIES} ${EXTRA_LIBS}) +endif() + if(BUILD_PY_IF) install(TARGETS ${libname} DESTINATION deepmd/op/) endif(BUILD_PY_IF) diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h index d2beb1a6f2..a6beb6a013 100644 --- a/source/lib/include/coord.h +++ 
b/source/lib/include/coord.h @@ -91,4 +91,48 @@ copy_coord_gpu( const deepmd::Region & region); #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +// normalize coords +// output: +// coord +// input: +// natom, box_info: boxt, rec_boxt +template +void +normalize_coord_gpu_rocm( + FPTYPE * coord, + const int natom, + const deepmd::Region & region); + +// copy coordinates +// outputs: +// out_c, out_t, mapping, nall, +// int_data(temp cuda memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map, +// cell_map,cell_shift_map,sec_loc_cellnum_map,sec_total_cellnum_map,loc_clist +// inputs: +// in_c, in_t, nloc, mem_nall, loc_cellnum, total_cellnum, cell_info, box_info +// mem_nall is the size of allocated memory for out_c, out_t, mapping +// returns +// 0: succssful +// 1: the memory is not large enough to hold all copied coords and types. +// i.e. nall > mem_nall +template +int +copy_coord_gpu_rocm( + FPTYPE * out_c, + int * out_t, + int * mapping, + int * nall, + int * int_data, + const FPTYPE * in_c, + const int * in_t, + const int & nloc, + const int & mem_nall, + const int & loc_cellnum, + const int & total_cellnum, + const int * cell_info, + const deepmd::Region & region); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/device.h b/source/lib/include/device.h index f0e36ae73d..7fe6cc127c 100644 --- a/source/lib/include/device.h +++ b/source/lib/include/device.h @@ -10,4 +10,8 @@ typedef unsigned long long uint_64; #if GOOGLE_CUDA #include "gpu_cuda.h" -#endif \ No newline at end of file +#endif + +#if TENSORFLOW_USE_ROCM +#include "gpu_rocm.h" +#endif diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index 1c6f66059c..f893897f6b 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -42,9 +42,37 @@ void test_encoding_decoding_nbor_info_gpu_cuda( const int * in_index, const int size_of_array); #endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template 
+void format_nbor_list_gpu_rocm( + int * nlist, + const FPTYPE * coord, + const int * type, + const deepmd::InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); + +template +void test_encoding_decoding_nbor_info_gpu_rocm( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array); +#endif //TENSORFLOW_USE_ROCM + } + //////////////////////////////////////////////////////// // legacy code //////////////////////////////////////////////////////// diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h index 20f2d96de5..cf82720434 100644 --- a/source/lib/include/gelu.h +++ b/source/lib/include/gelu.h @@ -46,4 +46,27 @@ void gelu_grad_grad_gpu_cuda( const int size); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void gelu_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const int size); + +template +void gelu_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const int size); + +template +void gelu_grad_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const FPTYPE * dy_2, + const int size); + +#endif//TENSORFLOW_USE_ROCM } diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h new file mode 100644 index 0000000000..ee3e88ee9e --- /dev/null +++ b/source/lib/include/gpu_rocm.h @@ -0,0 +1,115 @@ +#pragma once +#include +#include +#include +#include +//#include +//#include + +#define GPU_MAX_NBOR_SIZE 4096 + +#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } +inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) { + if (code != hipSuccess) { + fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#define nborErrcheck(res) {nborAssert((res), __FILE__, 
__LINE__);} +inline void nborAssert(hipError_t code, const char *file, int line, bool abort=true) { + if (code != hipSuccess) { + fprintf(stderr,"hip assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); + if (abort) exit(code); + } +} + +namespace deepmd { +template +void memcpy_host_to_device( + FPTYPE * device, + std::vector &host) +{ + hipErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice)); +} + +template +void memcpy_host_to_device( + FPTYPE * device, + const FPTYPE * host, + const int size) +{ + hipErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); +} + +template +void memcpy_device_to_host( + FPTYPE * device, + std::vector &host) +{ + hipErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); +} +template +void memcpy_device_to_host( + const FPTYPE * device, + FPTYPE * host, + const int size) +{ + hipErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); +} + +template +void malloc_device_memory( + FPTYPE * &device, + std::vector &host) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); +} + +template +void malloc_device_memory( + FPTYPE * &device, + const int size) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); +} + +template +void malloc_device_memory_sync( + FPTYPE * &device, + std::vector &host) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + memcpy_host_to_device(device, host); +} +template +void malloc_device_memory_sync( + FPTYPE * &device, + const FPTYPE * host, + const int size) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + memcpy_host_to_device(device, host, size); +} + +template +void delete_device_memory( + FPTYPE * &device) +{ + if (device != NULL) { + hipErrcheck(hipFree(device)); + } +} + +template +void memset_device_memory( + FPTYPE * device, + const FPTYPE var, + const int size) + { + 
hipErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size)); + } +} + + + diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index afd0a674a0..bc717255b9 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -104,6 +104,48 @@ void use_nlist_map( #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +void convert_nlist_gpu_rocm( + InputNlist & gpu_nlist, + InputNlist & cpu_nlist, + int* & gpu_memory, + const int & max_nbor_size); + +void free_nlist_gpu_rocm( + InputNlist & gpu_nlist); + +// build neighbor list. +// outputs +// nlist, max_list_size +// max_list_size is the maximal size of jlist. +// inputs +// c_cpy, nloc, nall, mem_size, rcut, region +// mem_size is the size of allocated memory for jlist. +// returns +// 0: succssful +// 1: the memory is not large enough to hold all neighbors. +// i.e. max_list_size > mem_nall +template +int +build_nlist_gpu_rocm( + InputNlist & nlist, + int * max_list_size, + int * nlist_data, + const FPTYPE * c_cpy, + const int & nloc, + const int & nall, + const int & mem_size, + const float & rcut); + +void use_nlist_map( + int * nlist, + const int * nlist_map, + const int nloc, + const int nnei); + +#endif // TENSORFLOW_USE_ROCM + } // namespace deepmd diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index 0789bf75a6..58f1ae8485 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -91,5 +91,55 @@ void env_mat_nbor_update( const int size); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_env_mat_a_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + unsigned long long * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); 
+ +template +void prod_env_mat_r_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + unsigned long long * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +void env_mat_nbor_update( + InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int* &nbor_list_dev, + const int * mesh, + const int size); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h index 1667edb61c..49530a1952 100644 --- a/source/lib/include/prod_force.h +++ b/source/lib/include/prod_force.h @@ -44,4 +44,26 @@ void prod_force_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_force_a_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei); + +template +void prod_force_r_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h index f6ac58269f..ff0bbe8015 100644 --- a/source/lib/include/prod_force_grad.h +++ b/source/lib/include/prod_force_grad.h @@ -40,4 +40,23 @@ void prod_force_grad_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_force_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei); + +template +void prod_force_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const 
int nnei); +#endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h index 6655059e12..229ba968da 100644 --- a/source/lib/include/prod_virial.h +++ b/source/lib/include/prod_virial.h @@ -52,4 +52,31 @@ void prod_virial_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template +void prod_virial_a_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei); + +template +void prod_virial_r_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei); +#endif // TENSORFLOW_USE_ROCM + } //namespace deepmd diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h index 7a8c87c0dd..8ab8171e14 100644 --- a/source/lib/include/prod_virial_grad.h +++ b/source/lib/include/prod_virial_grad.h @@ -44,4 +44,26 @@ void prod_virial_grad_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_virial_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei); + +template +void prod_virial_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/region.h b/source/lib/include/region.h index 901299a241..b6428a1bf4 100644 --- a/source/lib/include/region.h +++ b/source/lib/include/region.h @@ -59,6 +59,28 @@ volume_gpu( const Region & region); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +//only for unittest +template +void +convert_to_inter_gpu_rocm( + FPTYPE * 
ri, + const Region & region, + const FPTYPE * rp); + +template +void +convert_to_phys_gpu_rocm( + FPTYPE * rp, + const Region & region, + const FPTYPE * ri); + +template +void +volume_gpu_rocm( + FPTYPE * volume, + const Region & region); +#endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index b684be3c4c..bce0913ebd 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -52,5 +52,30 @@ void tabulate_fusion_grad_gpu_cuda( const int last_layer_size); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void tabulate_fusion_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei, + const int last_layer_size); + +template +void tabulate_fusion_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei, + const int last_layer_size); +#endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc index 89e8552524..a41784bc29 100644 --- a/source/lib/src/neighbor_list.cc +++ b/source/lib/src/neighbor_list.cc @@ -875,3 +875,36 @@ void deepmd::free_nlist_gpu_cuda( delete_device_memory(gpu_nlist.firstneigh); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +void deepmd::convert_nlist_gpu_rocm( + InputNlist & gpu_nlist, + InputNlist & cpu_nlist, + int* & gpu_memory, + const int & max_nbor_size) +{ + const int inum = cpu_nlist.inum; + gpu_nlist.inum = inum; + malloc_device_memory(gpu_nlist.ilist, inum); + malloc_device_memory(gpu_nlist.numneigh, inum); + malloc_device_memory(gpu_nlist.firstneigh, inum); + memcpy_host_to_device(gpu_nlist.ilist, cpu_nlist.ilist, inum); + memcpy_host_to_device(gpu_nlist.numneigh, cpu_nlist.numneigh, inum); + int ** _firstneigh = NULL; + _firstneigh = 
(int**)malloc(sizeof(int*) * inum); + for (int ii = 0; ii < inum; ii++) { + memcpy_host_to_device(gpu_memory + ii * max_nbor_size, cpu_nlist.firstneigh[ii], cpu_nlist.numneigh[ii]); + _firstneigh[ii] = gpu_memory + ii * max_nbor_size; + } + memcpy_host_to_device(gpu_nlist.firstneigh, _firstneigh, inum); + free(_firstneigh); +} + +void deepmd::free_nlist_gpu_rocm( + InputNlist & gpu_nlist) +{ + delete_device_memory(gpu_nlist.ilist); + delete_device_memory(gpu_nlist.numneigh); + delete_device_memory(gpu_nlist.firstneigh); +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index c5e3223e9e..44f6dc516c 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -320,3 +320,68 @@ void deepmd::env_mat_nbor_update( delete [] mesh_host; } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +void deepmd::env_mat_nbor_update( + InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int* &nbor_list_dev, + const int * mesh, + const int size) +{ + int *mesh_host = new int[size]; + hipErrcheck(hipMemcpy(mesh_host, mesh, sizeof(int) * size, hipMemcpyDeviceToHost)); + memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); + memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); + memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); + const int ago = mesh_host[0]; + if (ago == 0) { + const int inum = inlist.inum; + if (gpu_inlist.inum < inum) { + delete_device_memory(gpu_inlist.ilist); + delete_device_memory(gpu_inlist.numneigh); + delete_device_memory(gpu_inlist.firstneigh); + malloc_device_memory(gpu_inlist.ilist, inum); + malloc_device_memory(gpu_inlist.numneigh, inum); + malloc_device_memory(gpu_inlist.firstneigh, inum); + } + memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); + memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); + int _max_nbor_size = max_numneigh(inlist); + if (_max_nbor_size <= 1024) { + _max_nbor_size = 1024; + } + else if 
(_max_nbor_size <= 2048) { + _max_nbor_size = 2048; + } + else { + _max_nbor_size = 4096; + } + if ( nbor_list_dev == NULL + || _max_nbor_size > max_nbor_size + || inum > gpu_inlist.inum) + { + delete_device_memory(nbor_list_dev); + malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); + } + // update info + gpu_inlist.inum = inum; + max_nbor_size = _max_nbor_size; + + // copy nbor list from host to the device + std::vector nbor_list_host(inum * max_nbor_size, 0); + int ** _firstneigh = (int**)malloc(sizeof(int*) * inum); + for (int ii = 0; ii < inum; ii++) { + _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; + for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { + nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; + } + } + memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], inum * max_nbor_size); + memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); + free(_firstneigh); + } + delete [] mesh_host; +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt new file mode 100644 index 0000000000..27c4d62f68 --- /dev/null +++ b/source/lib/src/rocm/CMakeLists.txt @@ -0,0 +1,45 @@ +# required cmake version +cmake_minimum_required(VERSION 3.15) +# project name +project(deepmd_op_rocm) +set(CMAKE_LINK_WHAT_YOU_USE TRUE) +#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +find_package(HIP REQUIRED) + +add_compile_definitions(__HIP_PLATFORM_HCC__) +link_directories(${HIP_ROOT_DIR}/lib) +add_definitions("-DTENSORFLOW_USE_ROCM") + +# set c++ version c++11 +#SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_HIP_STANDARD 11) + 
+message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) +message(STATUS "HIP major version is " ${HIP_TOOLKIT_ROOT_DIR}) + + +set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") + +set (SOURCE_FILES + prod_env_mat.hip.cu prod_force.hip.cu prod_virial.hip.cu gelu.hip.cu tabulate.hip.cu coord.hip.cu neighbor_list.hip.cu prod_force_grad.hip.cu prod_virial_grad.hip.cu region.hip.cu +) + +hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) +target_link_libraries(deepmd_op_rocm ${HIP_LIBRARIES}) + +#install(TARGETS deepmd_op_rocm DESTINATION lib/) +if (BUILD_CPP_IF) + install(TARGETS deepmd_op_rocm DESTINATION lib/) +endif (BUILD_CPP_IF) +if (BUILD_PY_IF) + install(TARGETS deepmd_op_rocm DESTINATION deepmd/op/) +endif (BUILD_PY_IF) diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu new file mode 100644 index 0000000000..b5516d3603 --- /dev/null +++ b/source/lib/src/rocm/coord.hip.cu @@ -0,0 +1,414 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "gpu_rocm.h" +#include "coord.h" +#include "region.cuh" + +__device__ inline int collapse_index( + const int * idx, + const int * size) +{ + return (idx[0] * size[1] + idx[1]) * size[2] + idx[2]; +} +__device__ inline void index_recover( + const int in_idx, + const int * size, + int * idx) +{ + idx[2]=in_idx%size[2]; + idx[1]=int(in_idx/size[2])%size[1]; + idx[0]=int(int(in_idx/size[2])/size[1]); +} +__device__ inline void idx_addshift( + int * idx, + const int * shift) +{ + for(int dd=0;dd<3;dd++) + { + idx[dd]+=shift[dd]; + } +} +__device__ inline void idx_unshift( + int * idx, + const int * shift) +{ + for(int dd=0;dd<3;dd++) + { + idx[dd]-=shift[dd]; + } +} +__device__ inline int compute_pbc_shift( + int idx, + int ncell) +{ + int shift = 0; + if (idx < 0) { + shift = 1; + while (idx + 
shift * ncell < 0) shift ++; + } + else if (idx >= ncell) { + shift = -1; + while (idx + shift * ncell >= ncell) shift --; + } + return shift; +} + +template +__global__ void normalize_one( + FPTYPE *out_c, + const FPTYPE *boxt, + const FPTYPE *rec_boxt, + const int nall) +{ + // <<>> + int idy=blockIdx.x*blockDim.x+threadIdx.x; + if (idy>=nall){return;} + FPTYPE inter[3]; + phys2Inter(inter,out_c+idy*3,rec_boxt); + for (int dd = 0; dd < 3; ++dd) { + while(inter[dd] >= 1.) inter[dd] -= 1.; + while(inter[dd] < 0.) inter[dd] += 1.; + } + inter2Phys(out_c+idy*3,inter,boxt); +} + +template +__global__ void _fill_idx_cellmap( + int * idx_cellmap, + int * idx_cellmap_noshift, + const FPTYPE *in_c, + const FPTYPE *rec_boxt, + const int *nat_stt, + const int *nat_end, + const int *ext_stt, + const int *ext_end, + const int nloc) +{ + int idy = blockIdx.x*blockDim.x+threadIdx.x; + int ext_ncell[3]; + int global_grid[3]; + int idx_orig_shift[3]; + FPTYPE cell_size[3]; + FPTYPE nat_orig[3]; + for (int dd = 0; dd < 3; ++dd) + { + ext_ncell[dd] = ext_end[dd] - ext_stt[dd]; + global_grid[dd] = nat_end[dd] - nat_stt[dd]; + idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; + cell_size[dd] = 1./global_grid[dd]; + nat_orig[dd] = nat_stt[dd] * cell_size[dd]; + } + if (idy= nat_end[dd]) + { + idx_noshift[dd] = nat_end[dd] - 1; + } + idx[dd] = idx_noshift[dd]+idx_orig_shift[dd]; + } + idx_cellmap_noshift[idy]=collapse_index(idx_noshift, global_grid); + idx_cellmap[idy]=collapse_index(idx, ext_ncell); + } +} + +__global__ void _fill_loc_cellnum_map( + int * temp_idx_order, + int * loc_cellnum_map, + const int * idx_cellmap_noshift, + const int nloc, + const int loc_cellnum) +{ + int idy = blockIdx.x*blockDim.x+threadIdx.x; + if (idy=nloc){return;} + int cell_idx=idx_cellmap[idy]; + int * clist_row = clist+sec_num_map[cell_idx]; + clist_row[idx_order[idy]]=idy; +} + +template +__global__ void _copy_coord( + FPTYPE * out_c, + int * out_t, + int * mapping, + const FPTYPE * in_c, + const int 
* in_t, + const int * cell_map, + const int * cell_shift_map, + const int * sec_loc_cellnum_map, + const int * sec_total_cellnum_map, + const int * loc_clist, + const int nloc, + const int nall, + const int total_cellnum, + const FPTYPE * boxt, + const FPTYPE * rec_boxt) +{ + int idy = blockIdx.x*blockDim.x+threadIdx.x; + if(idy>=nall){return;} + if(idy=sec_total_cellnum_map[ii+1])cell_idx++; + else break; + } + for(int dd=0;dd<3;dd++) + { + shift[dd]=cell_shift_map[cell_idx*3+dd]; + d_shift[dd]=shift[dd]; + } + atom_idx=idy-sec_total_cellnum_map[cell_idx]; + orig_cell_idx=cell_map[cell_idx]; + orig_idy=loc_clist[sec_loc_cellnum_map[orig_cell_idx]+atom_idx]; + mapping[idy]=orig_idy; + out_t[idy]=in_t[orig_idy]; + FPTYPE shift_v[3]; + inter2Phys(shift_v,d_shift,boxt); + for(int dd=0;dd<3;dd++) + { + out_c[idy*3+dd]=in_c[orig_idy*3+dd]-shift_v[dd]; + } + } +} + +template +void compute_int_data( + int * int_data, + const FPTYPE * in_c, + const int * cell_info, + const deepmd::Region & region, + const int nloc, + const int loc_cellnum, + const int total_cellnum) +{ + int * idx_cellmap=int_data; + int * idx_cellmap_noshift=idx_cellmap+nloc; + int * temp_idx_order=idx_cellmap_noshift+nloc; + int * loc_cellnum_map=temp_idx_order+nloc; + int * total_cellnum_map=loc_cellnum_map+loc_cellnum; + int * mask_cellnum_map=total_cellnum_map+total_cellnum; + int * cell_map=mask_cellnum_map+total_cellnum; + int * cell_shift_map=cell_map+total_cellnum; + const int * nat_stt=cell_info; + const int * nat_end=cell_info+3; + const int * ext_stt=cell_info+6; + const int * ext_end=cell_info+9; + const FPTYPE * rec_boxt = region.rec_boxt; + + const int nblock_loc=(nloc+TPB-1)/TPB; + hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, + nat_stt, nat_end, ext_stt, ext_end, nloc); + + const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB; + hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0, temp_idx_order, 
loc_cellnum_map, + idx_cellmap_noshift, nloc, loc_cellnum); + + const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB; + hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0, total_cellnum_map, mask_cellnum_map, cell_map, + cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); +} + +void build_loc_clist( + int * int_data, + const int nloc, + const int loc_cellnum, + const int total_cellnum) +{ + const int nblock=(nloc+TPB-1)/TPB; + const int * idx_cellmap_noshift=int_data+nloc; + const int * temp_idx_order=idx_cellmap_noshift+nloc; + const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum; + int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1; + hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); +} + +template +void copy_coord( + FPTYPE * out_c, + int * out_t, + int * mapping, + const int * int_data, + const FPTYPE * in_c, + const int * in_t, + const int nloc, + const int nall, + const int loc_cellnum, + const int total_cellnum, + const deepmd::Region & region) +{ + const int nblock=(nall+TPB-1)/TPB; + const int * cell_map=int_data+3*nloc+loc_cellnum+2*total_cellnum; + const int * cell_shift_map=cell_map+total_cellnum; + const int * sec_loc_cellnum_map=cell_shift_map+3*total_cellnum; + const int * sec_total_cellnum_map=sec_loc_cellnum_map+loc_cellnum+1; + const int * loc_clist=sec_total_cellnum_map+total_cellnum+1; + + const FPTYPE *boxt = region.boxt; + const FPTYPE *rec_boxt = region.rec_boxt; + hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, + sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt); +} + +namespace deepmd { +template +void +normalize_coord_gpu_rocm( + FPTYPE * coord, + const int natom, + const Region 
& region) +{ + const FPTYPE * boxt=region.boxt; + const FPTYPE * rec_boxt=region.rec_boxt; + const int nblock=(natom+TPB-1)/TPB; + hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt, natom); +} + +template +int +copy_coord_gpu_rocm( + FPTYPE * out_c, + int * out_t, + int * mapping, + int * nall, + int * int_data, + const FPTYPE * in_c, + const int * in_t, + const int & nloc, + const int & mem_nall, + const int & loc_cellnum, + const int & total_cellnum, + const int * cell_info, + const Region & region) +{ + compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); + int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map + hipErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost)); + int * loc_cellnum_map=int_data_cpu; + int * total_cellnum_map=loc_cellnum_map+loc_cellnum; + int * mask_cellnum_map=total_cellnum_map+total_cellnum; + int * sec_loc_cellnum_map=mask_cellnum_map+total_cellnum; + int * sec_total_cellnum_map=sec_loc_cellnum_map+loc_cellnum+1; + sec_loc_cellnum_map[0]=0; + sec_total_cellnum_map[0]=nloc; + int max_cell=0; + for(int iii=0;iii mem_nall){ + delete[] int_data_cpu; + // size of the output arrays is not large enough + return 1; + } + else{ + hipErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, + sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), hipMemcpyHostToDevice)); + delete[] int_data_cpu; + build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); + copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall, loc_cellnum, total_cellnum, region); + } + return 0; +} + +template void normalize_coord_gpu_rocm(float * coord, const int natom, const Region & region); +template void normalize_coord_gpu_rocm(double * coord, const int natom, const 
Region & region); +template int copy_coord_gpu_rocm(float * out_c, int * out_t, int * mapping, int * nall, int * int_data, const float * in_c, const int * in_t, const int & nloc, const int & mem_nall, const int & loc_cellnum, const int & total_cellnum, const int * cell_info, const Region & region); +template int copy_coord_gpu_rocm(double * out_c, int * out_t, int * mapping, int * nall, int * int_data, const double * in_c, const int * in_t, const int & nloc, const int & mem_nall, const int & loc_cellnum, const int & total_cellnum, const int * cell_info, const Region & region); +} \ No newline at end of file diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu new file mode 100644 index 0000000000..d8f02ae9fc --- /dev/null +++ b/source/lib/src/rocm/gelu.hip.cu @@ -0,0 +1,98 @@ +#include "hip/hip_runtime.h" +#include "gelu.h" +#include "device.h" + +template +__global__ void gelu( + FPTYPE * out, + const FPTYPE * xx, + int const size) +{ + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) { + return; + } + out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); +} + +template +__global__ void gelu_grad( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + int const size) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) { + return; + } + // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); + const FPTYPE var = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); + out[idx] = dy[idx] * (0.5 * SQRT_2_PI * xx[idx] * (1 - var * var) * (0.134145 * xx[idx] * xx[idx] + 1) + 0.5 * var + 0.5); +} + +template +__global__ void gelu_grad_grad( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const FPTYPE * dy_2, + int const size) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) { + return; + } + // out[idx] = xx[idx] * 0.5 * (1.0 + 
tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); + const FPTYPE var1 = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[idx] * xx[idx] + 1); + out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2); +} + +namespace deepmd { + template + void gelu_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const int size) + { + const int THREAD_ITEMS = 1024; + const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size); + } + + template + void gelu_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const int size) + { + const int THREAD_ITEMS = 1024; + const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, size); + } + + template + void gelu_grad_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const FPTYPE * dy_2, + const int size) + { + const int THREAD_ITEMS = 1024; + const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, dy_2, size); + } + + template void gelu_gpu_rocm(float * out, const float * x, const int size); + template void gelu_gpu_rocm(double * out, const double * x, const int size); + template void gelu_grad_gpu_rocm(float * out, const float * x, const float * dy, const int size); + template void gelu_grad_gpu_rocm(double * out, const double * x, const double * dy, const int size); + template void gelu_grad_grad_gpu_rocm(float * out, const float * x, const float * dy, const float * dy_2, const int size); + template void gelu_grad_grad_gpu_rocm(double * out, const double * x, const double * dy, const double * dy_2, const int 
size); +} \ No newline at end of file diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu new file mode 100644 index 0000000000..a0da866d12 --- /dev/null +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -0,0 +1,184 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "neighbor_list.h" + +template +__device__ inline FPTYPE dev_dot( + FPTYPE * arr1, + FPTYPE * arr2) +{ + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__global__ void build_nlist( + int * ilist, + int * temp_nlist, + const FPTYPE * c_cpy, + const FPTYPE rcut2, + const int nloc, + const int nall, + const int mem_size) +{ + const unsigned int atom_idx = blockIdx.x; + const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; + if(neighbor_idx=nnei){return;} + int nlist_idx=atom_idx*nnei+nei_idx; + int nlist_item=nlist[nlist_idx]; + if(nlist_item!=-1){ + nlist[nlist_idx]=nlist_map[nlist_item]; + } +} + +namespace deepmd { +template +int build_nlist_gpu_rocm( + InputNlist & nlist, + int * max_list_size, + int * nlist_data, + const FPTYPE * c_cpy, + const int & nloc, + const int & nall, + const int & mem_size, + const float & rcut) +{ + if(mem_size < nall){ + return 1; + } + const int nblock = (nall+TPB-1)/TPB; + int * ilist = nlist.ilist; + int * numneigh = nlist.numneigh; + int ** firstneigh = nlist.firstneigh; + hipErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + int * temp_nlist = nlist_data; //nloc*mem_size + int * nei_order = temp_nlist + nloc * mem_size; + nlist.inum = nloc; + FPTYPE rcut2 = rcut * rcut; + + + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, TPB); + hipLaunchKernelGGL(build_nlist, block_grid, thread_grid, 0, 0, + ilist, + temp_nlist, + c_cpy, + rcut2, + nloc, + nall, + mem_size); + const int nblock_ = (nloc+TPB-1)/TPB; + hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0, + numneigh, + nei_order, + temp_nlist, + mem_size, + nloc, + nall); + 
hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, + firstneigh, + temp_nlist, + nei_order, + mem_size, + nall + ); + int * numneigh_host = new int[nloc]; + hipErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost)); + int max_nei = 0; + for(int ii=0;iimax_nei)max_nei=numneigh_host[ii]; + } + *max_list_size = max_nei; + delete [] numneigh_host; + return 0; +} + +void use_nlist_map( + int * nlist, + const int * nlist_map, + const int nloc, + const int nnei) +{ + int nblock=(nnei+TPB-1)/TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, TPB); + hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map, nloc, nnei); +} + +template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); +template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const double * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); +} \ No newline at end of file diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu new file mode 100644 index 0000000000..61635aaf83 --- /dev/null +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -0,0 +1,579 @@ +#include "hip/hip_runtime.h" +#include "gpu_rocm.h" +#include "fmt_nlist.h" +#include "prod_env_mat.h" +#include "device.h" +#include "prod_env_mat.h" +#include "rocprim/rocprim.hpp" +#include "hipcub/hipcub.hpp" + +// common part of prod_env_mat +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__ (BLOCK_THREADS) +__global__ void BlockSortKernel( + Key * d_in, + Key * d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) + typedef 
hipcub::BlockLoad BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef hipcub::BlockRadixSort BlockRadixSortT; + // Shared memory + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + hipcub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); +} + +template +__device__ inline FPTYPE dev_dot( + FPTYPE * arr1, + FPTYPE * arr2) +{ + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__device__ inline void spline5_switch( + FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +template +__device__ inline uint_64 encoding_nbor_info( + const int type, + const FPTYPE dist, + const int index) +{ + // nbor info checking: + // the type of nbor atom must be smaller than 128 + // the distance of center atom between nbor atom must be smaller than 128 + // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24) + if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) { + __builtin_trap(); + } + return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index; +} + +__device__ inline void decoding_nbor_info( + int &type, + int &index, + const uint_64 key) +{ + type = key >> 57; + index = key & 0xFFFFFF; +} + +template +__global__ void get_i_idx( + FPTYPE * i_idx, + const int nloc, + const FPTYPE * ilist) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= nloc) { + return; + } + i_idx[ilist[idx]] = idx; +} + +template +__global__ void format_nlist_fill_a( + uint_64 * key, + const FPTYPE * coord, + const int * type, + const int * numneigh, + int ** firstneigh, + const float rcut, + int * i_idx, + const int MAX_NBOR_SIZE) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + + const int nsize = numneigh[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int * nei_idx = firstneigh[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + uint_64 * key_in = key + idx * MAX_NBOR_SIZE; + FPTYPE diff[3]; + const int & j_idx = nei_idx[idy]; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + FPTYPE rr = sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); + } +} + +template 
+__global__ void format_nlist_fill_b( + int * nlist, + const int nlist_size, + const int nloc, + FPTYPE * key, + const int * sec, + const int sec_size, + int * nei_iter_dev, + const int max_nbor_size) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= nloc) { + return; + } + + int * row_nlist = nlist + idx * nlist_size; + int * nei_iter = nei_iter_dev + idx * sec_size; + FPTYPE * key_out = key + nloc * max_nbor_size + idx * max_nbor_size; + for (int ii = 0; ii < sec_size; ii++) { + nei_iter[ii] = sec[ii]; + } + + int nei_type = 0, nbor_idx = 0; + for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) { + decoding_nbor_info(nei_type, nbor_idx, key_out[kk]); + if (nei_iter[nei_type] < sec[nei_type + 1]) { + row_nlist[nei_iter[nei_type]++] = nbor_idx; + } + } +} + +template +__global__ void encoding_decoding_nbor_info( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size_of_array) { + return; + } + + key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]); + decoding_nbor_info(out_type[idx], out_index[idx], key[idx]); +} + +template +void format_nbor_list_1024 ( + uint_64 * key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist & gpu_inlist, + const int& nloc, + const float& rcut, + int * i_idx) +{ + const int LEN = 256; + const int MAX_NBOR_SIZE = 1024; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, + key, + coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, 
BLOCK_THREADS, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, + key, + key + nloc * MAX_NBOR_SIZE); +} + +template +void format_nbor_list_2048 ( + uint_64 * key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist & gpu_inlist, + const int& nloc, + const float& rcut, + int * i_idx) +{ + const int LEN = 256; + const int MAX_NBOR_SIZE = 2048; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, + key, + coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, + key, + key + nloc * MAX_NBOR_SIZE); +} + +template +void format_nbor_list_4096 ( + uint_64 * key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist & gpu_inlist, + const int& nloc, + const float& rcut, + int * i_idx) +{ + const int LEN = 256; + const int MAX_NBOR_SIZE = 4096; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, + key, + coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, + key, + key + nloc * MAX_NBOR_SIZE); +} + +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_env_mat_a( + FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + const 
FPTYPE* coord, + const FPTYPE* avg, + const FPTYPE* std, + const int* type, + const int* nlist, + const int nnei, + const float rmin, + const float rmax) +{ + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + if (tid >= nnei) { + return; + } + const int ndescrpt = nnei * 4; + const int * row_nlist = nlist + bid * nnei; + FPTYPE * row_rij = rij + bid * nnei * 3; + FPTYPE * row_descript = em + bid * nnei * 4; + FPTYPE * row_descript_deriv = em_deriv + bid * nnei * 12; + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + const int idx_value = ii * 4; // 4 components + const int idx_deriv = ii * 12; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE dd[4] = {0}; + FPTYPE vv[12] = {0}; + const int j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd[0] = (1./nr) ;//* sw; + dd[1] = (rr[0] / nr2) ;//* sw; + dd[2] = (rr[1] / nr2) ;//* sw; + dd[3] = (rr[2] / nr2) ;//* sw; + vv[0] = (rr[0] * inr3 * sw - dd[0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + vv[3] = ((2. 
* rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + vv[6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = ((2. * rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + vv[9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10]= ((2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11]= ((2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; + dd[2] *= sw; // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; + dd[3] *= sw; // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / std[type[bid] * ndescrpt + idx_value + ii]; + } + } + else { + // TODO: move it to the memset. 
+ row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; + } + } +} + +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_env_mat_r( + FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + const FPTYPE* coord, + const FPTYPE* avg, + const FPTYPE* std, + const int* type, + const int* nlist, + const int nnei, + const float rmin, + const float rmax) +{ + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + if (tid >= nnei) { + return; + } + const int ndescrpt = nnei; + const int * row_nlist = nlist + bid * nnei; + FPTYPE * row_rij = rij + bid * nnei * 3; + FPTYPE * row_em = em + bid * nnei; + FPTYPE * row_em_deriv = em_deriv + bid * nnei * 3; + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + const int idx_value = ii; // 4 components + const int idx_deriv = ii * 3; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE vv[3] = {0}; + FPTYPE dd = 0; + const int & j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd = (1./nr) ;//* sw; + vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + + // 4 value components + dd *= sw; // * em[idx * 
ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + for (int ii = 0; ii < 3; ii++) { + row_em_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / std[type[bid] * ndescrpt + idx_value]; + } + else { + // TODO: move it to the memset. + row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; + } + } +} + +namespace deepmd { +template +void format_nbor_list_gpu_rocm( + int * nlist, + const FPTYPE * coord, + const int * type, + const deepmd::InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) +{ + const int LEN = 256; + const int nnei = sec.back(); + const int nblock = (nloc + LEN -1) / LEN; + int * sec_dev = array_int; + int * nei_iter = array_int + sec.size(); // = new int[sec_size]; + int * i_idx = array_int + sec.size() + nloc * sec.size(); + uint_64 * key = array_longlong; + assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); + hipErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei)); + hipErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + hipErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, + i_idx, + nloc, gpu_inlist.ilist); + + if (max_nbor_size == 1024) { + format_nbor_list_1024 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + else if (max_nbor_size == 2048) { + format_nbor_list_2048 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + else if (max_nbor_size == 4096) { + format_nbor_list_4096 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + + hipLaunchKernelGGL(format_nlist_fill_b, nblock, LEN, 0, 0, + nlist, + nnei, nloc, key, sec_dev, 
sec.size(), nei_iter, max_nbor_size); +} + +template +void prod_env_mat_a_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) +{ + const int nnei = sec.back(); + const int ndescrpt = nnei * 4; + hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + + format_nbor_list_gpu_rocm( + nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + nborErrcheck(hipGetLastError()); + nborErrcheck(hipDeviceSynchronize()); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a), nloc, TPB, 0, 0, + em, em_deriv, rij, + coord, avg, std, type, nlist, nnei, rcut_smth, rcut); +} + +template +void prod_env_mat_r_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) +{ + const int nnei = sec.back(); + const int ndescrpt = nnei * 1; + hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + + format_nbor_list_gpu_rocm( + nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + nborErrcheck(hipGetLastError()); + nborErrcheck(hipDeviceSynchronize()); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r), nloc, TPB, 0, 0, + em, em_deriv, rij, + coord, avg, std, type, nlist, nnei, 
rcut_smth, rcut); +} + +template +void test_encoding_decoding_nbor_info_gpu_rocm( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array) +{ + const int nblock = (size_of_array + TPB - 1) / TPB; + hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, + key, out_type, out_index, + in_type, in_dist, in_index, size_of_array); +} + +template void prod_env_mat_a_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_a_gpu_rocm(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_r_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_r_gpu_rocm(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void 
format_nbor_list_gpu_rocm(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); +template void format_nbor_list_gpu_rocm(int * nlist, const double * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); +template void test_encoding_decoding_nbor_info_gpu_rocm(uint_64 * key, int * out_type, int * out_index, const int * in_type, const float * in_dist, const int * in_index, const int size_of_array); +template void test_encoding_decoding_nbor_info_gpu_rocm(uint_64 * key, int * out_type, int * out_index, const int * in_type, const double * in_dist, const int * in_index, const int size_of_array); +} diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu new file mode 100644 index 0000000000..3c56b8155a --- /dev/null +++ b/source/lib/src/rocm/prod_force.hip.cu @@ -0,0 +1,161 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_force.h" +#include "rocprim/rocprim.hpp" + +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void force_deriv_wrt_center_atom( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int ndescrpt) +{ + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += net_deriv[bid * ndescrpt + ii] * in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) 
{ + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } +} + +template +__global__ void force_deriv_wrt_neighbors_a( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + const int ndescrpt = nnei * 4; + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd( + force + j_idx * 3 + idz, + net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); +} + +template +__global__ void force_deriv_wrt_neighbors_r( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd( + force + j_idx * 3 + idz, + net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +namespace deepmd { + template + void prod_force_a_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei) + { + const int ndescrpt = nnei * 4; 
+ hipErrcheck(hipMemset( + force, + 0.0, sizeof(FPTYPE) * nall * 3)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, + force, + net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3, 4); + hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, + force, + net_deriv, in_deriv, nlist, nloc, nnei); + } + + template + void prod_force_r_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei) + { + const int ndescrpt = nnei * 1; + hipErrcheck(hipMemset( + force, + 0.0, sizeof(FPTYPE) * nall * 3)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, + force, + net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3); + hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, + force, + net_deriv, in_deriv, nlist, nloc, nnei); + } + + template void prod_force_a_gpu_rocm(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + template void prod_force_a_gpu_rocm(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + template void prod_force_r_gpu_rocm(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + template void prod_force_r_gpu_rocm(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + +} diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu new file mode 100644 index 
0000000000..3a3fd8ea58 --- /dev/null +++ b/source/lib/src/rocm/prod_force_grad.hip.cu @@ -0,0 +1,143 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_force_grad.h" + +template +__device__ inline FPTYPE dev_dot( + const FPTYPE * arr1, + const FPTYPE * arr2) +{ + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__global__ void force_grad_wrt_center_atom( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int ndescrpt) +{ + __shared__ FPTYPE grad_one[3]; + unsigned int center_idx = blockIdx.x; + unsigned int tid = threadIdx.x; + if(tid < 3){ + grad_one[tid] = grad[center_idx * 3 + tid]; + } + __syncthreads(); + unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid; + if(descrpt_idx < ndescrpt){ + grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot(grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3); + } +} + +template +__global__ void force_grad_wrt_neighbors_a( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + if (j_idx >= nloc) j_idx = j_idx % nloc; + grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); +} + +template +__global__ void force_grad_wrt_neighbors_r( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + if (j_idx >= nloc) j_idx 
= j_idx % nloc; + grad_net[idx * nnei + idy] += dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3); +} + +namespace deepmd { +template +void prod_force_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei * 4; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int nblock = (ndescrpt + TPB - 1) / TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(TPB, 1); + hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, ndescrpt); + + const int LEN = 128; + const int nblock_ = (nloc + LEN -1) / LEN; + dim3 block_grid_(nblock_, nnei); + dim3 thread_grid_(LEN, 4); + hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, 0, + grad_net, + grad, env_deriv, nlist, nloc, nnei); +} + +template +void prod_force_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei * 1; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int nblock = (ndescrpt + TPB - 1) / TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(TPB, 1); + hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, ndescrpt); + + const int LEN = 128; + const int nblock_ = (nloc + LEN -1) / LEN; + dim3 block_grid_(nblock_, nnei); + dim3 thread_grid_(LEN, 1); + hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, 0, + grad_net, + grad, env_deriv, nlist, nloc, nnei); +} + +template void prod_force_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); +template void prod_force_grad_a_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const int 
* nlist, const int nloc, const int nnei); +template void prod_force_grad_r_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); +template void prod_force_grad_r_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const int * nlist, const int nloc, const int nnei); +} \ No newline at end of file diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu new file mode 100644 index 0000000000..a285a1789b --- /dev/null +++ b/source/lib/src/rocm/prod_virial.hip.cu @@ -0,0 +1,176 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_virial.h" +#include "rocprim/rocprim.hpp" + +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void atom_virial_reduction( + FPTYPE * virial, + const FPTYPE * atom_virial, + const int nall) +{ + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK]; + data[tid] = 0.f; + for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { + data[tid] += atom_virial[ii * 9 + bid]; + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) virial[bid] = data[0]; +} + +template +__global__ void virial_deriv_wrt_neighbors_a( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + const int ndescrpt = nnei * 4; + if (idx >= nloc) { + return; + } + int 
j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd( + atom_virial + j_idx * 9 + idz, + net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); +} + +template +__global__ void virial_deriv_wrt_neighbors_r( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd( + atom_virial + j_idx * 9 + idz, + net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); +} + +namespace deepmd { +template +void prod_virial_a_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei) +{ + hipErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + hipErrcheck(hipMemset( + atom_virial, + 0.0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9, 4); + // compute virial of a 
frame + hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nnei); + // reduction atom_virial to virial + hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, + virial, + atom_virial, nall); +} + +template +void prod_virial_r_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei) +{ + hipErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + hipErrcheck(hipMemset( + atom_virial, + 0.0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nnei); + // reduction atom_virial to virial + hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, + virial, + atom_virial, nall); +} + +template void prod_virial_a_gpu_rocm(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); +template void prod_virial_a_gpu_rocm(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei); +template void prod_virial_r_gpu_rocm(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); +template void prod_virial_r_gpu_rocm(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei); +} 
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu new file mode 100644 index 0000000000..c4d8a5c19a --- /dev/null +++ b/source/lib/src/rocm/prod_virial_grad.hip.cu @@ -0,0 +1,141 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_virial_grad.h" + +template +__device__ inline FPTYPE dev_dot9( + const FPTYPE * arr1, + const FPTYPE * arr2) +{ + FPTYPE result = 0.0; + for(int ii=0; ii<9; ii++){ + result += arr1[ii] * arr2[ii]; + } + return result; +} + +template +__global__ void virial_grad_wrt_neighbors_a( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int tid = threadIdx.x; + const unsigned int idx = blockIdx.x * blockDim.x + tid; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + const int ndescrpt = nnei * 4; + __shared__ FPTYPE grad_one[9]; + if(tid < 9){ + grad_one[tid] = grad[tid]; + } + __syncthreads(); + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE tmp[9]; + for (int dd0 = 0; dd0 < 3; ++dd0){ + for (int dd1 = 0; dd1 < 3; ++dd1){ + tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; + } + } + grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp); +} + +template +__global__ void virial_grad_wrt_neighbors_r( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int tid = threadIdx.x; + const unsigned int idx = blockIdx.x * blockDim.x + tid; + const unsigned int idy = blockIdx.y; + const int ndescrpt = nnei; + __shared__ FPTYPE grad_one[9]; + if(tid < 9){ + grad_one[tid] = grad[tid]; + } + __syncthreads(); + if (idx >= nloc) { + return; + 
} + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE tmp[9]; + for (int dd0 = 0; dd0 < 3; ++dd0){ + for (int dd1 = 0; dd1 < 3; ++dd1){ + tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0]; + } + } + grad_net[idx * ndescrpt + idy] -= -1.0 * dev_dot9(grad_one, tmp); +} + +namespace deepmd { +template +void prod_virial_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei * 4; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int LEN = 128; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 4); + hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, rij, nlist, nloc, nnei); +} + +template +void prod_virial_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int LEN = 128; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 1); + hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, rij, nlist, nloc, nnei); +} + +template void prod_virial_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei); +template void prod_virial_grad_a_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const double * rij, const int * nlist, const int nloc, const int nnei); +template void prod_virial_grad_r_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, 
const float * rij, const int * nlist, const int nloc, const int nnei); +template void prod_virial_grad_r_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const double * rij, const int * nlist, const int nloc, const int nnei); +} \ No newline at end of file diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu new file mode 100644 index 0000000000..7f883b14c3 --- /dev/null +++ b/source/lib/src/rocm/region.hip.cu @@ -0,0 +1,69 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "region.h" +#include "region.cuh" + +template +__global__ void _phys2Inter( + FPTYPE *inter, + const FPTYPE *phys, + const FPTYPE *rec_boxt) +{ + phys2Inter(inter, phys, rec_boxt); +} + +template +__global__ void _inter2Phys( + FPTYPE *phys, + const FPTYPE *inter, + const FPTYPE *boxt) +{ + inter2Phys(phys, inter, boxt); +} + +template +__global__ void _compute_volume( + FPTYPE * volume, + const FPTYPE * boxt) +{ + volume[0] = compute_volume(boxt); +} + +namespace deepmd { +//only for unittest +template +void +convert_to_inter_gpu_rocm( + FPTYPE * ri, + const Region & region, + const FPTYPE * rp) +{ + hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt); +} + +template +void +convert_to_phys_gpu_rocm( + FPTYPE * rp, + const Region & region, + const FPTYPE * ri) +{ + hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt); +} + +template +void +volume_gpu_rocm( + FPTYPE * volume, + const Region & region) +{ + hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt); +} + +template void convert_to_inter_gpu_rocm(float * ri, const Region & region, const float * rp); +template void convert_to_inter_gpu_rocm(double * ri, const Region & region, const double * rp); +template void convert_to_phys_gpu_rocm(float * rp, const Region & region, const float * ri); +template void convert_to_phys_gpu_rocm(double * rp, const Region & region, const double * ri); +template void volume_gpu_rocm(float * volume, 
const Region & region); +template void volume_gpu_rocm(double * volume, const Region & region); +} \ No newline at end of file diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu new file mode 100644 index 0000000000..80aec8af8f --- /dev/null +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -0,0 +1,246 @@ +#include "hip/hip_runtime.h" +#include "tabulate.h" +#include +#include +#include "device.h" + +#define MM 4 +#define KK 4 +#define TPB 256 +#define WARP_SIZE 64 +#define FULL_MASK 0xffffffff +#include "gpu_rocm.h" + +template +__forceinline__ __device__ +void locate_xx( + FPTYPE& xx, + int& table_idx, + const FPTYPE& lower, + const FPTYPE& upper, + const FPTYPE& max, + const FPTYPE& stride0, + const FPTYPE& stride1) +{ + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +__forceinline__ __device__ +FPTYPE dot( + FPTYPE ll[4], + FPTYPE rr[4]) +{ + return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3]; +} + +template +__forceinline__ +__device__ +void warp_reduce( + FPTYPE & val) +{ + for (int offset = 32; offset > 0; offset >>= 1) + val += __shfl_down( val, offset);//########???? 
+} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_fifth_order_polynomial( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int last_layer_size) +{ + HIP_DYNAMIC_SHARED( int, _data) + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = em_x[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * em[block_idx * nnei * MTILE + ii * MTILE + kk] * res; + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + out[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void 
tabulate_fusion_grad_fifth_order_polynomial( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int last_layer_size) +{ + HIP_DYNAMIC_SHARED( int, _data) + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl(threadIdx.x / 64, 0); + int lane_idx = threadIdx.x % 64; + int breakpoint = nnei - 1; + bool unloop = false; + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = 0; ii < MTILE; ii++) { + if (thread_idx < last_layer_size) { + iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + } + } + __syncthreads(); + FPTYPE ago = __shfl( em_x[block_idx * nnei + nnei - 1], 0); + for (int ii = 0; ii < nnei; ii += KTILE) { + FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx]; + if (ago == xx) { + unloop = true; + breakpoint = ii + warp_idx; + } + + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + FPTYPE sum[KTILE] = {0.f}; + FPTYPE Csub = 0.f; + for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + FPTYPE var[6]; + // load iteratorB through table + var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; + var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; + var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; + var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; + var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; + var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < KTILE; kk++) { + sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res; + } 
+ res = em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] * iteratorA[0 * last_layer_size + jj]; + res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; + res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; + res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; + Csub += (nnei - breakpoint) * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * res; + } + //__syncwarp();->syncwrap + __syncthreads(); + for (int kk = 0; kk < KTILE; kk++) { + warp_reduce(sum[kk]); + } + warp_reduce(Csub); + if (lane_idx == 0) { + for (int kk = 0; kk < KTILE; kk++) { + dy_dem[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; + } + dy_dem_x[block_idx * nnei + ii + warp_idx] = Csub; + } + if (unloop) break; + } +} + +namespace deepmd { + +template + void tabulate_fusion_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei, + const int last_layer_size) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, + out, + table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + } + + template + void tabulate_fusion_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei, + const int last_layer_size) + { + hipErrcheck(hipMemset( + dy_dem_x, + 0.0, sizeof(FPTYPE) * nloc * nnei)); + hipErrcheck(hipMemset( + dy_dem, + 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 
0, + dy_dem_x, dy_dem, + table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + } + + template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); + template void tabulate_fusion_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); + template void tabulate_fusion_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); + template void tabulate_fusion_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); + } diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 03242ba8dd..d6908a18ce 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -3,6 +3,8 @@ project(libdeepmd_test) enable_testing() +set(CMAKE_LINK_WHAT_YOU_USE TRUE) + set(libname "deepmd") set(LIB_BASE_DIR ${CMAKE_SOURCE_DIR}/../) @@ -40,6 +42,45 @@ if (USE_CUDA_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) endif() +#define USE_ROCM_TOOLKIT +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +if (DEFINED USE_ROCM_TOOLKIT) + if(USE_ROCM_TOOLKIT) + link_directories(${HIP_ROOT_DIR}/lib) + add_definitions("-DTENSORFLOW_USE_ROCM") + 
find_package(HIP REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) + else() + message(STATUS "Will not build rocm GPU support") + endif() +else() + find_package(HIP REQUIRED) + if (HIP_FOUND) + link_directories(${HIP_ROOT_DIR}/lib) + add_definitions("-DTENSORFLOW_USE_ROCM") + set(USE_ROCM_TOOLKIT TRUE) + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build ROCM GPU support") + else() + set(USE_ROCM_TOOLKIT FALSE) + message(STATUS "No ROCM support found, will not build ROCM GPU support") + endif() +endif() +if (USE_ROCM_TOOLKIT) + add_definitions("-DUSE_ROCM_TOOLKIT") + include_directories(${ROCM_INCLUDE_DIRS}) + add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +endif() + + file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -65,6 +106,11 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) +elseif (USE_ROCM_TOOLKIT) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) + message(STATUS "EXTRA_LIBS: ${EXTRA_LIBS}") + message(STATUS "HIP: ${HIP}") + target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc ${EXTRA_LIBS} coverage_config) +else() target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) endif() diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc index 8dee80745e..54c58c8a11 100644 --- a/source/lib/tests/test_coord.cc +++ b/source/lib/tests/test_coord.cc @@ -159,6 +159,96 @@ TEST_F(TestNormCoord, gpu_case2) #endif //GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +TEST_F(TestNormCoord, gpu_case0) +{ + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + double * 
box_info_dev=NULL; + double * out_c_dev=NULL; + std::vector out_c(r0); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(out_c_dev, out_c); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(out_c_dev); + for(int ii = 0; ii < posi.size(); ++ii){ + EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12); + } +} + +TEST_F(TestNormCoord, gpu_case1) +{ + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + double * box_info_dev=NULL; + double * out_c_dev=NULL; + std::vector out_c(r1); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(out_c_dev, out_c); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(out_c_dev); + for(int ii = 0; ii < posi.size(); ++ii){ + EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12); + } +} + +TEST_F(TestNormCoord, gpu_case2) +{ + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], 
region.rec_boxt, sizeof(double)*9); + double * box_info_dev=NULL; + double * out_c_dev=NULL; + std::vector out_c(r2); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(out_c_dev, out_c); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(out_c_dev); + for(int ii = 0; ii < posi.size(); ++ii){ + EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12); + } +} + +#endif //TENSORFLOW_USE_ROCM + typedef std::pair,std::vector> atom; static void @@ -474,6 +564,158 @@ TEST_F(TestCopyCoord, gpu_lessmem) } #endif //GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +TEST_F(TestCopyCoord, gpu) +{ + int mem_size = 1000; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + 
deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 0); + EXPECT_EQ(nall, expected_nall); + out_c.resize(nall*3); + out_t.resize(nall); + mapping.resize(nall); + + std::vector out_c_1(mem_size * 3); + std::vector out_t_1(mem_size); + std::vector mapping_1(mem_size); + sort_atoms(out_c_1, out_t_1, mapping_1, out_c, out_t, mapping, nloc, nall); + for(int ii = 0; ii < expected_nall; ++ii){ + for(int dd = 0; dd < 3; ++dd){ + EXPECT_LT(fabs(out_c_1[ii*3+dd] - expected_posi_cpy[ii*3+dd]), 1e-12); + } + EXPECT_EQ(out_t_1[ii], expected_atype_cpy[ii]); + EXPECT_EQ(mapping_1[ii], expected_mapping[ii]); + } +} + +TEST_F(TestCopyCoord, gpu_lessmem) +{ + int mem_size = 40; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + 
deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + 
deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 1); + // EXPECT_EQ(nall, expected_nall); + // std::cout << "---------------------" + // << nloc << " " + // << nall << std::endl; +} +#endif //TENSORFLOW_USE_ROCM + class TestCopyCoordMoreCell : public ::testing::Test { protected: @@ -745,4 +987,156 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) // << nloc << " " // << nall << std::endl; } -#endif //GOOGLE_CUDA \ No newline at end of file +#endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestCopyCoordMoreCell, gpu) +{ + int mem_size = 1000; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = 
deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 0); + EXPECT_EQ(nall, expected_nall); + out_c.resize(nall*3); + out_t.resize(nall); + mapping.resize(nall); + + std::vector out_c_1(mem_size * 3); + std::vector out_t_1(mem_size); + std::vector mapping_1(mem_size); + sort_atoms(out_c_1, out_t_1, mapping_1, out_c, out_t, mapping, nloc, nall); + for(int ii = 0; ii < expected_nall; ++ii){ + for(int dd = 0; dd < 3; ++dd){ + EXPECT_LT(fabs(out_c_1[ii*3+dd] - expected_posi_cpy[ii*3+dd]), 1e-12); + } + EXPECT_EQ(out_t_1[ii], expected_atype_cpy[ii]); + EXPECT_EQ(mapping_1[ii], expected_mapping[ii]); + } +} + +TEST_F(TestCopyCoordMoreCell, gpu_lessmem) +{ + int mem_size = 40; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int 
loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 1); + // EXPECT_EQ(nall, expected_nall); + // std::cout << "---------------------" + // << nloc << " " + // << nall << std::endl; +} +#endif //TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index f9e262cb2c..cfb70acfe9 100644 --- 
a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -723,3 +723,216 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) } } #endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestEnvMatA, prod_gpu_rocm) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * 
GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + } + } +} + + +TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * 
nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + 
deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_a_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 4); + EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*4+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*4*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + } + } +} +#endif //TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index c7b0db5e1d..1b232e28b5 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -555,3 +555,206 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) } } #endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestEnvMatR, prod_gpu_rocm) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 
0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_r_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + 
gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 1; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*1 + jj*1 + dd] - + expected_env[ii*nnei*1 + jj*1 + dd]) , + 1e-5); + } + } + } +} + +TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = 
NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_r_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, 
posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_r_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 1); + EXPECT_EQ(env_deriv_1.size(), nnei * 1 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*1+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*1*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } +} +#endif //TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index df2a68534f..d2de6e8855 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -525,4 +525,157 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) // deepmd::delete_device_memory(out_index_dev); // deepmd::delete_device_memory(key_dev); // } -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestFormatNlist, gpu_rocm) +{ + std::vector> nlist_a_0, nlist_r_0; + build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + // make a input nlist + int inum = nlist_a_0.size(); + std::vector ilist(inum); + std::vector numneigh(inum); + std::vector firstneigh(inum); + deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(in_nlist, 
nlist_a_0); + // allocate the mem for the result + std::vector nlist(inum * sec_a.back()); + EXPECT_EQ(nlist.size(), expect_nlist_cpy.size()); + + double * posi_cpy_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + for (int ii = 0; ii < inum; ii++) { + max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii]; + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + // format nlist + format_nbor_list_gpu_rocm( + nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + // validate + for(int ii = 0; ii < nlist.size(); ++ii){ + EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); + } +} + +TEST_F(TestFormatNlistShortSel, gpu_rocm) +{ + std::vector> nlist_a_0, nlist_r_0; + build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + // 
make a input nlist + int inum = nlist_a_0.size(); + std::vector ilist(inum); + std::vector numneigh(inum); + std::vector firstneigh(inum); + deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(in_nlist, nlist_a_0); + // mem + std::vector nlist(inum * sec_a.back()); + EXPECT_EQ(nlist.size(), expect_nlist_cpy.size()); + // format nlist + double * posi_cpy_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + for (int ii = 0; ii < inum; ii++) { + max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii]; + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + // format nlist + format_nbor_list_gpu_rocm( + nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + // validate + for(int ii = 0; ii < nlist.size(); ++ii){ + 
EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); + } +} + +TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_rocm) +{ + int * valid_type_dev = NULL, * valid_index_dev = NULL, * out_type_dev = NULL, * out_index_dev = NULL; + double * valid_dist_dev = NULL; + uint_64 * key_dev = NULL; + std::vector out_type(size_of_array, 0); + std::vector out_index(size_of_array, 0); + std::vector key(size_of_array, 0); + deepmd::malloc_device_memory_sync(valid_type_dev, valid_type); + deepmd::malloc_device_memory_sync(valid_dist_dev, valid_dist); + deepmd::malloc_device_memory_sync(valid_index_dev, valid_index); + deepmd::malloc_device_memory_sync(out_type_dev, out_type); + deepmd::malloc_device_memory_sync(out_index_dev, out_index); + deepmd::malloc_device_memory_sync(key_dev, key); + + deepmd::test_encoding_decoding_nbor_info_gpu_rocm( + key_dev, out_type_dev, out_index_dev, + valid_type_dev, valid_dist_dev, valid_index_dev, size_of_array + ); + + deepmd::memcpy_device_to_host(key_dev, key); + deepmd::memcpy_device_to_host(out_type_dev, out_type); + deepmd::memcpy_device_to_host(out_index_dev, out_index); + deepmd::delete_device_memory(valid_type_dev); + deepmd::delete_device_memory(valid_dist_dev); + deepmd::delete_device_memory(valid_index_dev); + deepmd::delete_device_memory(out_type_dev); + deepmd::delete_device_memory(out_index_dev); + deepmd::delete_device_memory(key_dev); + // validate + for(int ii = 0; ii < size_of_array; ii++) { + EXPECT_EQ(key[ii], expect_key[ii]); + EXPECT_EQ(out_type[ii], expect_type[ii]); + EXPECT_EQ(out_index[ii], expect_index[ii]); + } +} + + +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc index 4d85b2dd27..46539f7334 100644 --- a/source/lib/tests/test_gelu.cc +++ b/source/lib/tests/test_gelu.cc @@ -212,3 +212,73 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) } } #endif // GOOGLE_CUDA + + + +#if TENSORFLOW_USE_ROCM +TEST_F(TestGelu, gelu_gpu_rocm) +{ + std::vector 
gelu(nloc, 0.0); + + double * gelu_dev = NULL, * xx_dev = NULL; + deepmd::malloc_device_memory_sync(gelu_dev, gelu); + deepmd::malloc_device_memory_sync(xx_dev, xx); + deepmd::gelu_gpu_rocm (gelu_dev, xx_dev, nloc); + deepmd::memcpy_device_to_host(gelu_dev, gelu); + deepmd::delete_device_memory(gelu_dev); + deepmd::delete_device_memory(xx_dev); + + EXPECT_EQ(gelu.size(), nloc); + EXPECT_EQ(gelu.size(), expected_gelu.size()); + for (int jj = 0; jj < gelu.size(); ++jj){ + EXPECT_LT(fabs(gelu[jj] - expected_gelu[jj]) , 1e-5); + } +} + +TEST_F(TestGelu, gelu_grad_gpu_rocm) +{ + std::vector dy(100, 1.0); + std::vector gelu_grad(nloc, 0.0); + + double * gelu_grad_dev = NULL, * xx_dev = NULL, * dy_dev = NULL; + deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad); + deepmd::malloc_device_memory_sync(xx_dev, xx); + deepmd::malloc_device_memory_sync(dy_dev, dy); + deepmd::gelu_grad_gpu_rocm (gelu_grad_dev, xx_dev, dy_dev, nloc); + deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad); + deepmd::delete_device_memory(gelu_grad_dev); + deepmd::delete_device_memory(xx_dev); + deepmd::delete_device_memory(dy_dev); + + EXPECT_EQ(gelu_grad.size(), nloc); + EXPECT_EQ(gelu_grad.size(), expected_gelu_grad.size()); + for (int jj = 0; jj < gelu_grad.size(); ++jj){ + EXPECT_LT(fabs(gelu_grad[jj] - expected_gelu_grad[jj]) , 1e-5); + } +} + +TEST_F(TestGelu, gelu_grad_grad_gpu_rocm) +{ + std::vector dy(100, 1.0); + std::vector dy_2(100, 1.0); + std::vector gelu_grad_grad(nloc, 0.0); + + double * gelu_grad_grad_dev = NULL, * xx_dev = NULL, * dy_dev = NULL, * dy_2_dev = NULL; + deepmd::malloc_device_memory_sync(gelu_grad_grad_dev, gelu_grad_grad); + deepmd::malloc_device_memory_sync(xx_dev, xx); + deepmd::malloc_device_memory_sync(dy_dev, dy); + deepmd::malloc_device_memory_sync(dy_2_dev, dy_2); + deepmd::gelu_grad_grad_gpu_rocm (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc); + deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad); + 
deepmd::delete_device_memory(gelu_grad_grad_dev); + deepmd::delete_device_memory(xx_dev); + deepmd::delete_device_memory(dy_dev); + deepmd::delete_device_memory(dy_2_dev); + + EXPECT_EQ(gelu_grad_grad.size(), nloc); + EXPECT_EQ(gelu_grad_grad.size(), expected_gelu_grad_grad.size()); + for (int jj = 0; jj < gelu_grad_grad.size(); ++jj){ + EXPECT_LT(fabs(gelu_grad_grad[jj] - expected_gelu_grad_grad[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc index 5cf74f7382..193279070f 100644 --- a/source/lib/tests/test_neighbor_list.cc +++ b/source/lib/tests/test_neighbor_list.cc @@ -225,4 +225,115 @@ TEST_F(TestNeighborList, gpu_lessmem) deepmd::delete_device_memory(c_cpy_dev); } -#endif //GOOGLE_CUDA \ No newline at end of file +#endif //GOOGLE_CUDA + + +#if TENSORFLOW_USE_ROCM +TEST_F(TestNeighborList, gpu) +{ + int mem_size = 48; + + int * nlist_data_dev=NULL, * jlist_dev=NULL, * ilist_dev=NULL, * numneigh_dev=NULL; + int ** firstneigh_dev=NULL; + std::vector temp_firstneigh(nloc); + double * c_cpy_dev=NULL; + + deepmd::malloc_device_memory(nlist_data_dev, 2 * nloc * mem_size); + deepmd::malloc_device_memory(jlist_dev, nloc * mem_size); + deepmd::malloc_device_memory(ilist_dev, nloc); + deepmd::malloc_device_memory(numneigh_dev, nloc); + for(int ii = 0; ii < nloc; ++ii){ + temp_firstneigh[ii] = jlist_dev + ii * mem_size; + } + deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh); + deepmd::malloc_device_memory_sync(c_cpy_dev, posi_cpy); + deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev); + + int max_list_size; + int ret = deepmd::build_nlist_gpu_rocm( + nlist_dev, + &max_list_size, + nlist_data_dev, + c_cpy_dev, + nloc, + nall, + mem_size, + rc); + + EXPECT_EQ(ret, 0); + int * ilist = new int[nloc]; + int * numneigh = new int[nloc]; + int ** firstneigh = new int*[nloc]; + int * jlist = new int[nloc * mem_size]; 
+ deepmd::memcpy_device_to_host(jlist_dev, jlist, nloc * mem_size); + deepmd::memcpy_device_to_host(ilist_dev, ilist, nloc); + deepmd::memcpy_device_to_host(numneigh_dev, numneigh, nloc); + for(int ii = 0; ii < nloc; ++ii){ + firstneigh[ii] = jlist + ii * mem_size; + } + + deepmd::InputNlist nlist(nlist_dev.inum, ilist, numneigh, firstneigh); + EXPECT_EQ(nlist.inum, nloc); + EXPECT_EQ(max_list_size, 5); + for(int ii = 0; ii < nloc; ++ii){ + EXPECT_EQ(nlist.ilist[ii], ii); + EXPECT_EQ(nlist.numneigh[ii], expect_nlist_cpy[ii].size()); + std::sort(nlist.firstneigh[ii], nlist.firstneigh[ii] + nlist.numneigh[ii]); + for(int jj = 0; jj < nlist.numneigh[ii]; ++jj){ + EXPECT_EQ(nlist.firstneigh[ii][jj], expect_nlist_cpy[ii][jj]); + } + } + + delete[] ilist; + delete[] numneigh; + delete[] jlist; + delete[] firstneigh; + deepmd::delete_device_memory(nlist_data_dev); + deepmd::delete_device_memory(jlist_dev); + deepmd::delete_device_memory(ilist_dev); + deepmd::delete_device_memory(numneigh_dev); + deepmd::delete_device_memory(firstneigh_dev); + deepmd::delete_device_memory(c_cpy_dev); +} + +TEST_F(TestNeighborList, gpu_lessmem) +{ + int mem_size = 47; + + int * nlist_data_dev=NULL, * jlist_dev=NULL, * ilist_dev=NULL, * numneigh_dev=NULL; + int ** firstneigh_dev=NULL; + std::vector temp_firstneigh(nloc); + double * c_cpy_dev=NULL; + + deepmd::malloc_device_memory(nlist_data_dev, 2 * nloc * mem_size); + deepmd::malloc_device_memory(jlist_dev, nloc * mem_size); + deepmd::malloc_device_memory(ilist_dev, nloc); + deepmd::malloc_device_memory(numneigh_dev, nloc); + for(int ii = 0; ii < nloc; ++ii){ + temp_firstneigh[ii] = jlist_dev + ii * mem_size; + } + deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh); + deepmd::malloc_device_memory_sync(c_cpy_dev, posi_cpy); + deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev); + + int max_list_size; + int ret = deepmd::build_nlist_gpu_rocm( + nlist_dev, + &max_list_size, + nlist_data_dev, + c_cpy_dev, 
+ nloc, + nall, + mem_size, + rc); + + EXPECT_EQ(ret, 1); + deepmd::delete_device_memory(nlist_data_dev); + deepmd::delete_device_memory(jlist_dev); + deepmd::delete_device_memory(ilist_dev); + deepmd::delete_device_memory(numneigh_dev); + deepmd::delete_device_memory(firstneigh_dev); + deepmd::delete_device_memory(c_cpy_dev); +} + +#endif //TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc index d9c7c1319d..71685de2aa 100644 --- a/source/lib/tests/test_prod_force_a.cc +++ b/source/lib/tests/test_prod_force_a.cc @@ -127,3 +127,33 @@ TEST_F(TestProdForceA, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceA, gpu_rocm) +{ + std::vector force(nall * 3, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * force_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(force_dev, force); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + + deepmd::prod_force_a_gpu_rocm (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(force_dev, force); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(force_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + + EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), expected_force.size()); + for (int jj = 0; jj < force.size(); ++jj){ + EXPECT_LT(fabs(force[jj] - expected_force[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc index e456e2a8dd..45f576a667 100644 --- a/source/lib/tests/test_prod_force_grad_a.cc +++ b/source/lib/tests/test_prod_force_grad_a.cc @@ -126,3 +126,33 @@ 
TEST_F(TestProdForceGradA, gpu) // printf("\n"); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceGradA, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::prod_force_grad_a_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc index da4ac96d3b..5db405931d 100644 --- a/source/lib/tests/test_prod_force_grad_r.cc +++ b/source/lib/tests/test_prod_force_grad_r.cc @@ -126,3 +126,33 @@ TEST_F(TestProdForceGradR, gpu) // printf("\n"); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceGradR, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory(grad_net_dev, 
nloc * ndescrpt); + deepmd::prod_force_grad_r_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc index e77cafdace..6577d184a0 100644 --- a/source/lib/tests/test_prod_force_r.cc +++ b/source/lib/tests/test_prod_force_r.cc @@ -127,3 +127,33 @@ TEST_F(TestProdForceR, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceR, gpu_rocm) +{ + std::vector force(nall * 3, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * force_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(force_dev, force); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + + deepmd::prod_force_r_gpu_rocm (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(force_dev, force); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(force_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + + EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), expected_force.size()); + for (int jj = 0; jj < force.size(); ++jj){ + EXPECT_LT(fabs(force[jj] - 
expected_force[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc index f63fc00fb5..fcea02891d 100644 --- a/source/lib/tests/test_prod_virial_a.cc +++ b/source/lib/tests/test_prod_virial_a.cc @@ -159,3 +159,50 @@ TEST_F(TestProdVirialA, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialA, gpu_rocm) +{ + std::vector virial(9, 0.0); + std::vector atom_virial(nall * 9, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * virial_dev = NULL, *atom_virial_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(virial_dev, virial); + deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + + deepmd::prod_virial_a_gpu_rocm (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(virial_dev, virial); + deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(virial_dev); + deepmd::delete_device_memory(atom_virial_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + // virial are not calculated in gpu currently; + // for (int ii = 0; ii < 9; ii++) { + // virial[ii] = 0; + // } + // for (int ii = 0; ii < nall * 9; ii++) { + // virial[ii % 9] += atom_virial[ii]; + // } + EXPECT_EQ(virial.size(), 9); + EXPECT_EQ(virial.size(), expected_virial.size()); + EXPECT_EQ(atom_virial.size(), nall * 9); + EXPECT_EQ(atom_virial.size(), expected_atom_virial.size()); + for (int jj = 0; jj < virial.size(); ++jj){ + 
EXPECT_LT(fabs(virial[jj] - expected_virial[jj]) , 1e-5); + } + for (int jj = 0; jj < atom_virial.size(); ++jj){ + EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]) , 1e-5); + } +} +#endif //TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_prod_virial_grad_a.cc b/source/lib/tests/test_prod_virial_grad_a.cc index 461552c5a3..469c7d4027 100644 --- a/source/lib/tests/test_prod_virial_grad_a.cc +++ b/source/lib/tests/test_prod_virial_grad_a.cc @@ -132,4 +132,38 @@ TEST_F(TestProdVirialGradA, gpu) // } // printf("\n"); } -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA + + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialGradA, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int n_a_sel = nnei; + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::prod_virial_grad_a_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git 
a/source/lib/tests/test_prod_virial_grad_r.cc b/source/lib/tests/test_prod_virial_grad_r.cc index 3f12599232..c517cd60ef 100644 --- a/source/lib/tests/test_prod_virial_grad_r.cc +++ b/source/lib/tests/test_prod_virial_grad_r.cc @@ -133,3 +133,36 @@ TEST_F(TestProdVirialGradR, gpu) // printf("\n"); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialGradR, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int n_a_sel = nnei; + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::prod_virial_grad_r_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc index be7e865962..33cf9a9ed8 100644 --- a/source/lib/tests/test_prod_virial_r.cc +++ b/source/lib/tests/test_prod_virial_r.cc @@ -159,3 +159,50 @@ TEST_F(TestProdVirialR, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialR, gpu_rocm) +{ + std::vector 
virial(9, 0.0); + std::vector atom_virial(nall * 9, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * virial_dev = NULL, *atom_virial_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(virial_dev, virial); + deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + + deepmd::prod_virial_r_gpu_rocm (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(virial_dev, virial); + deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(virial_dev); + deepmd::delete_device_memory(atom_virial_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + // virial are not calculated in gpu currently; + // for (int ii = 0; ii < 9; ii++) { + // virial[ii] = 0; + // } + // for (int ii = 0; ii < nall * 9; ii++) { + // virial[ii % 9] += atom_virial[ii]; + // } + EXPECT_EQ(virial.size(), 9); + EXPECT_EQ(virial.size(), expected_virial.size()); + EXPECT_EQ(atom_virial.size(), nall * 9); + EXPECT_EQ(atom_virial.size(), expected_atom_virial.size()); + for (int jj = 0; jj < virial.size(); ++jj){ + EXPECT_LT(fabs(virial[jj] - expected_virial[jj]) , 1e-5); + } + for (int jj = 0; jj < atom_virial.size(); ++jj){ + EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc index 5c8951de0e..e90ec55783 100644 --- a/source/lib/tests/test_simulation_region.cc +++ 
b/source/lib/tests/test_simulation_region.cc @@ -158,4 +158,76 @@ TEST_F(TestRegion, gpu) // EXPECT_EQ (50.332, square_root (2533.310224)); // } - +#if TENSORFLOW_USE_ROCM +TEST_F(TestRegion, gpu) +{ + // check rec_box + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + double * boxt_dev = NULL, * rec_boxt_dev = NULL; + double * ref_rp_dev = NULL, * ref_ri_dev = NULL; + init_region_cpu(region, &ref_boxt[0]); + for(int ii = 0; ii < 9; ++ii){ + EXPECT_LT(fabs(region.rec_boxt[ii] - ref_rec_boxt[ii]), 1e-10); + } + deepmd::malloc_device_memory_sync(boxt_dev, region.boxt, 9); + deepmd::malloc_device_memory_sync(rec_boxt_dev, region.rec_boxt, 9); + deepmd::malloc_device_memory_sync(ref_rp_dev, ref_rp); + deepmd::malloc_device_memory_sync(ref_ri_dev, ref_ri); + region_dev.boxt = boxt_dev; + region_dev.rec_boxt = rec_boxt_dev; + // check volume + double vol[1]; + double * vol_dev = NULL; + deepmd::malloc_device_memory(vol_dev, 1); + deepmd::volume_gpu_rocm(vol_dev, region_dev); + deepmd::memcpy_device_to_host(vol_dev, vol, 1); + EXPECT_LT(fabs(vol[0] - expected_vol), 1e-10); + // check conversion between phys and inter coords. 
+ double ri[3]; + double * ri_dev = NULL; + deepmd::malloc_device_memory(ri_dev, 3); + deepmd::convert_to_inter_gpu_rocm(ri_dev, region_dev, ref_rp_dev); + deepmd::memcpy_device_to_host(ri_dev, ri, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(ri[ii] - ref_ri[ii]), 1e-10); + } + double rp2[3]; + double * rp2_dev = NULL; + deepmd::malloc_device_memory(rp2_dev, 3); + deepmd::convert_to_phys_gpu_rocm(rp2_dev, region_dev, ri_dev); + deepmd::memcpy_device_to_host(rp2_dev, rp2, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(rp2[ii] - ref_rp[ii]), 1e-10); + } + double rp[3]; + double * rp_dev = NULL; + deepmd::malloc_device_memory(rp_dev, 3); + deepmd::convert_to_phys_gpu_rocm(rp_dev, region_dev, ref_ri_dev); + deepmd::memcpy_device_to_host(rp_dev, rp, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(rp[ii] - ref_rp[ii]), 1e-10); + } + double ri2[3]; + double * ri2_dev = NULL; + deepmd::malloc_device_memory(ri2_dev, 3); + deepmd::convert_to_inter_gpu_rocm(ri2_dev, region_dev, rp_dev); + deepmd::memcpy_device_to_host(ri2_dev, ri2, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(ri2[ii] - ref_ri[ii]), 1e-10); + } + deepmd::delete_device_memory(boxt_dev); + deepmd::delete_device_memory(rec_boxt_dev); + deepmd::delete_device_memory(vol_dev); + deepmd::delete_device_memory(ref_rp_dev); + deepmd::delete_device_memory(ref_ri_dev); + deepmd::delete_device_memory(ri_dev); + deepmd::delete_device_memory(rp2_dev); + deepmd::delete_device_memory(rp_dev); + deepmd::delete_device_memory(ri2_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_tabulate.cc b/source/lib/tests/test_tabulate.cc index b22cca03d8..43c0ef798e 100644 --- a/source/lib/tests/test_tabulate.cc +++ b/source/lib/tests/test_tabulate.cc @@ -232,3 +232,63 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestTabulate, 
tabulate_fusion_gpu_rocm) +{ + std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); + + double * xyz_scatter_dev = NULL, * table_dev = NULL, * em_x_dev = NULL, * em_dev = NULL; + deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); + deepmd::malloc_device_memory_sync(table_dev, table); + deepmd::malloc_device_memory_sync(em_x_dev, em_x); + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::tabulate_fusion_gpu_rocm(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size); + deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); + deepmd::delete_device_memory(xyz_scatter_dev); + deepmd::delete_device_memory(table_dev); + deepmd::delete_device_memory(em_x_dev); + deepmd::delete_device_memory(em_dev); + + EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); + EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size()); + for (int jj = 0; jj < xyz_scatter.size(); ++jj){ + EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter[jj]) , 1e-5); + } +} + +TEST_F(TestTabulate, tabulate_fusion_grad_gpu_rocm) +{ + std::vector dy_dem_x(em_x.size(), 0.0); + std::vector dy_dem(em.size(), 0.0); + std::vector dy(nloc * nnei * last_layer_size, 1.0); + + double * dy_dem_x_dev = NULL, * dy_dem_dev = NULL, * table_dev = NULL, * em_x_dev = NULL, * em_dev = NULL, * dy_dev = NULL; + deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); + deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); + deepmd::malloc_device_memory_sync(table_dev, table); + deepmd::malloc_device_memory_sync(em_x_dev, em_x); + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(dy_dev, dy); + deepmd::tabulate_fusion_grad_gpu_rocm(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size); + deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); + deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); + deepmd::delete_device_memory(dy_dem_x_dev); + 
deepmd::delete_device_memory(dy_dem_dev); + deepmd::delete_device_memory(table_dev); + deepmd::delete_device_memory(em_x_dev); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(dy_dev); + + EXPECT_EQ(dy_dem_x.size(), nloc * nnei); + EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); + EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size()); + EXPECT_EQ(dy_dem.size(), expected_dy_dem.size()); + for (int jj = 0; jj < dy_dem_x.size(); ++jj){ + EXPECT_LT(fabs(dy_dem_x[jj] - expected_dy_dem_x[jj]) , 1e-5); + } + for (int jj = 0; jj < dy_dem.size(); ++jj){ + EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index a59ad84960..8157ddb8dd 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/ " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_CC@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_DEVICE@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_CC@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index c3dc7b6815..dbb469f6ef 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -5,18 +5,24 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc 
soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc) file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) +file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) if (BUILD_CPP_IF) - if (USE_CUDA_TOOLKIT) - add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) - find_package(CUDA REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) - target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) - else (USE_CUDA_TOOLKIT) - add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) - endif (USE_CUDA_TOOLKIT) + if (USE_CUDA_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) + elseif (USE_ROCM_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) + find_package(HIP REQUIRED) + include_directories(${ROCM_INCLUDE_DIRS}) + target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) + else() + add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) + endif() endif (BUILD_CPP_IF) if (BUILD_PY_IF) @@ -27,12 +33,20 @@ if (BUILD_PY_IF) add_library(op_grads SHARED ${OP_GRADS_SRC}) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) - target_link_libraries (op_abi ${LIB_DEEPMD_OP_CUDA}) - target_link_libraries (op_grads ${LIB_DEEPMD_OP_CUDA}) - else 
(USE_CUDA_TOOLKIT) + target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) + target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) + elseif (USE_ROCM_TOOLKIT) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) + add_library(op_grads SHARED ${OP_GRADS_SRC}) + find_package(HIP REQUIRED) + include_directories(${HIP_INCLUDE_DIRS}) + target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) + target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) + else() add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) endif(USE_CUDA_TOOLKIT) + message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries(op_abi ${LIB_DEEPMD}) target_link_libraries(op_grads ${LIB_DEEPMD}) @@ -62,3 +76,4 @@ if (BUILD_PY_IF) install(TARGETS op_grads DESTINATION deepmd/op/) install(FILES ${OP_PY} DESTINATION deepmd/op/) endif (BUILD_PY_IF) + diff --git a/source/op/custom_op.h b/source/op/custom_op.h index 6a8533f66f..e4f9211e61 100644 --- a/source/op/custom_op.h +++ b/source/op/custom_op.h @@ -18,12 +18,12 @@ struct DeviceFunctor { { device = "CPU"; } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()( std::string& device, const GPUDevice& d) { device = "GPU"; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; \ No newline at end of file diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index ece0d07ab3..953d89f55a 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -49,6 +49,12 @@ class GeluOp : public OpKernel { out, x, size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::gelu_gpu_rocm( + out, + x,size); + #endif//TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_cpu( @@ -92,6 +98,12 @@ class GeluGradOp : public OpKernel { out, x, dy, size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::gelu_grad_gpu_rocm( + out, + x, dy, size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_cpu( @@ -133,6
+145,12 @@ class GeluGradGradOp : public OpKernel { out, x, dy, dy_2, size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::gelu_grad_grad_gpu_rocm( + out, + x, dy, dy_2, size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_grad_cpu( @@ -157,7 +175,7 @@ REGISTER_KERNEL_BUILDER( \ REGISTER_CPU(float); REGISTER_CPU(double); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -170,4 +188,4 @@ REGISTER_KERNEL_BUILDER( \ GeluGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 1bdd5553f4..64d40acff4 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -180,6 +180,80 @@ _prepare_coord_nlist_gpu( #endif //GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +static int +_norm_copy_coord_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE * & coord_cpy, + int * & type_cpy, + int * & idx_mapping, + int & nall, + int & mem_cpy, + const FPTYPE * coord, + const FPTYPE * box, + const int * type, + const int &nloc, + const int &max_cpy_trial, + const float & rcut_r); + +template +static int +_build_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + int * &ilist, + int * &numneigh, + int ** &firstneigh, + int * &jlist, + int & max_nnei, + int & mem_nnei, + const FPTYPE *coord, + const int & nloc, + const int & new_nall, + const int & max_nnei_trial, + const float & rcut_r); + +static void +_map_nlist_gpu_rocm( + int * nlist, + const int * idx_mapping, + const int & nloc, + const int & nnei); + +template +static void +_prepare_coord_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE const ** coord, + FPTYPE * & coord_cpy, + int const** type, + int * 
& type_cpy, + int * & idx_mapping, + deepmd::InputNlist & inlist, + int * & ilist, + int * & numneigh, + int ** & firstneigh, + int * & jlist, + int * & nbor_list_dev, + int & new_nall, + int & mem_cpy, + int & mem_nnei, + int & max_nbor_size, + const FPTYPE * box, + const int * mesh_tensor_data, + const int mesh_tensor_size, + const int & nloc, + const int & nei_mode, + const float & rcut_r, + const int & max_cpy_trial, + const int & max_nnei_trial); + +#endif //TENSORFLOW_USE_ROCM + + template class ProdEnvMatAOp : public OpKernel { public: @@ -366,6 +440,44 @@ class ProdEnvMatAOp : public OpKernel { if(b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); deepmd::delete_device_memory(firstneigh); #endif //GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + int * idx_mapping = NULL; + int * ilist = NULL, * numneigh = NULL; + int ** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int * jlist = NULL; + FPTYPE * coord_cpy; + int * type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + std::vector tensor_list(7); + // prepare coord and nlist + _prepare_coord_nlist_gpu_rocm( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + // launch the gpu(nv) compute function + deepmd::prod_env_mat_a_gpu_rocm( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a); + if(b_nlist_map) _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + #endif //TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -588,6 +700,45 @@ class ProdEnvMatROp : public OpKernel { if(b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); deepmd::delete_device_memory(firstneigh); #endif //GOOGLE_CUDA + + + #if TENSORFLOW_USE_ROCM + int * idx_mapping = NULL; + int * ilist = NULL, * numneigh = NULL; + int ** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int * jlist = NULL; + FPTYPE * coord_cpy; + int * type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + std::vector tensor_list(7); + // prepare coord and nlist + _prepare_coord_nlist_gpu_rocm( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut, max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + nloc * sec.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + // launch the gpu(nv) compute function + deepmd::prod_env_mat_r_gpu_rocm( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut, rcut_smth, sec); + if(b_nlist_map) _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + #endif //TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -996,6 +1147,222 @@ _prepare_coord_nlist_gpu( } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template +static int +_norm_copy_coord_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE * & coord_cpy, + int * & type_cpy, + int * & idx_mapping, + int & nall, + int & mem_cpy, + const FPTYPE * coord, + const FPTYPE * box, + const int * type, + const int &nloc, + const int &max_cpy_trial, + const float & rcut_r) +{ + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall*3); + context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, tensor_list); + FPTYPE * tmp_coord = (*tensor_list).flat().data(); + hipErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); + + deepmd::Region region; + init_region_cpu(region, box); + FPTYPE box_info[18]; + std::copy(region.boxt, region.boxt+9, box_info); + std::copy(region.rec_boxt, region.rec_boxt+9, box_info+9); + int cell_info[23]; + deepmd::compute_cell_info(cell_info, rcut_r, region); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + 
//Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + context->allocate_temp(DataTypeToEnum::value, double_shape, tensor_list+1); + //Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(23+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + context, context->allocate_temp(DT_INT32, int_shape, tensor_list+2); + FPTYPE * box_info_dev = (*(tensor_list+1)).flat().data(); + int * cell_info_dev = (*(tensor_list+2)).flat().data(); + int * int_data_dev = cell_info_dev + 23; + deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); + deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); + deepmd::Region region_dev; + FPTYPE * new_boxt = region_dev.boxt; + FPTYPE * new_rec_boxt = region_dev.rec_boxt; + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(tmp_coord, nall, region_dev); + int tt; + for(tt = 0; tt < max_cpy_trial; ++tt){ + //Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy*3); + context->allocate_temp(DataTypeToEnum::value, cpy_shape, tensor_list+3); + //Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy*2); + context, context->allocate_temp(DT_INT32, t_shape, tensor_list+4); + coord_cpy = (*(tensor_list+3)).flat().data(); + type_cpy = (*(tensor_list+4)).flat().data(); + idx_mapping = type_cpy + mem_cpy; + int ret = deepmd::copy_coord_gpu_rocm( + coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, + tmp_coord, type, nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); + if(ret == 0){ + break; + } + else{ + mem_cpy *= 2; + } + } + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + return (tt != max_cpy_trial); +} + +template +static int +_build_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + int * &ilist, + int * &numneigh, + int ** &firstneigh, + int * &jlist, + int & max_nnei, + int & mem_nnei, + const FPTYPE *coord, + const int & 
nloc, + const int & new_nall, + const int & max_nnei_trial, + const float & rcut_r) +{ + //Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc*2); + context->allocate_temp(DT_INT32, nlist_shape, tensor_list); + ilist = (*tensor_list).flat().data(); + numneigh = ilist + nloc; + //Tensor jlist_temp; + int * ind_data = NULL; + + std::vector firstneigh_host(nloc); + int tt; + for(tt = 0; tt < max_nnei_trial; ++tt){ + TensorShape jlist_shape; + jlist_shape.AddDim(3*nloc*mem_nnei); + context->allocate_temp(DT_INT32, jlist_shape, tensor_list+1); + jlist = (*(tensor_list+1)).flat().data(); + ind_data = jlist + nloc * mem_nnei; + for(int ii = 0; ii < nloc; ++ii){ + firstneigh_host[ii] = jlist + ii * mem_nnei; + } + deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); + deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); + int ret = deepmd::build_nlist_gpu_rocm( + inlist, &max_nnei, ind_data, + coord, nloc, new_nall, mem_nnei, rcut_r); + if(ret == 0){ + break; + } + else{ + mem_nnei *= 2; + } + } + return (tt != max_nnei_trial); +} + +static void +_map_nlist_gpu_rocm( + int * nlist, + const int * idx_mapping, + const int & nloc, + const int & nnei) +{ + deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); +} + +template +static void +_prepare_coord_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE const ** coord, + FPTYPE * & coord_cpy, + int const** type, + int * & type_cpy, + int * & idx_mapping, + deepmd::InputNlist & inlist, + int * & ilist, + int * & numneigh, + int ** & firstneigh, + int * & jlist, + int * & nbor_list_dev, + int & new_nall, + int & mem_cpy, + int & mem_nnei, + int & max_nbor_size, + const FPTYPE * box, + const int * mesh_tensor_data, + const int mesh_tensor_size, + const int & nloc, + const int & nei_mode, + const float & rcut_r, + const int & max_cpy_trial, + const int & max_nnei_trial) +{ + if(nei_mode != 3){ + inlist.inum = nloc; + // build nlist by myself + // normalize and copy 
coord + if(nei_mode == 1){ + int copy_ok = _norm_copy_coord_gpu_rocm( + context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, + *coord, box, *type, nloc, max_cpy_trial, rcut_r); + OP_REQUIRES (context, copy_ok, errors::Aborted("cannot allocate mem for copied coords")); + *coord = coord_cpy; + *type = type_cpy; + } + //build nlist + int build_ok = _build_nlist_gpu_rocm( + context, tensor_list + 5, ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei, + *coord, nloc, new_nall, max_nnei_trial, rcut_r); + OP_REQUIRES (context, build_ok, errors::Aborted("cannot allocate mem for nlist")); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + inlist.ilist = ilist; + inlist.numneigh = numneigh; + inlist.firstneigh = firstneigh; + } + else{ + // update nbor list + deepmd::InputNlist inlist_temp; + inlist_temp.inum = nloc; + deepmd::env_mat_nbor_update( + inlist_temp, inlist, max_nbor_size, nbor_list_dev, + mesh_tensor_data, mesh_tensor_size); + OP_REQUIRES (context, (max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit.")); + } +} +#endif // TENSORFLOW_USE_ROCM + + // Register the CPU kernels. #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ @@ -1008,7 +1375,7 @@ REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ @@ -1018,4 +1385,4 @@ REGISTER_KERNEL_BUILDER( ProdEnvMatROp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 1bda63903b..2dae7c1a0b 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -113,6 +113,12 @@ class ProdForceSeAGradOp : public OpKernel { grad_net, grad, in_deriv, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_grad_a_gpu_rocm( + grad_net, + grad, in_deriv, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_a_cpu( @@ -215,6 +221,12 @@ class ProdForceSeRGradOp : public OpKernel grad_net, grad, in_deriv, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_grad_r_gpu_rocm( + grad_net, + grad, in_deriv, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_r_cpu( @@ -238,7 +250,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeAGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -248,4 +260,4 @@ REGISTER_KERNEL_BUILDER( ProdForceSeRGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index a49b9c1913..748971751e 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -88,6 +88,12 @@ class ProdForceSeAOp : public OpKernel { force, net_deriv, in_deriv, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_a_gpu_rocm( + force, + net_deriv, in_deriv, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_a_cpu( @@ -167,6 +173,12 @@ class ProdForceSeROp : public OpKernel { force, net_deriv, in_deriv, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_r_gpu_rocm( + force, + net_deriv, in_deriv, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_r_cpu( @@ -190,7 +202,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -200,4 +212,5 @@ REGISTER_KERNEL_BUILDER( ProdForceSeROp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc index ac74d1d141..996c8331b8 100644 --- a/source/op/prod_virial_grad_multi_device.cc +++ b/source/op/prod_virial_grad_multi_device.cc @@ -127,6 +127,12 @@ class ProdVirialSeAGradOp : public OpKernel grad_net, grad, in_deriv, rij, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_grad_a_gpu_rocm( + grad_net, + grad, in_deriv, rij, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_a_cpu( @@ -239,6 +245,12 @@ class ProdVirialSeRGradOp : public OpKernel grad_net, grad, in_deriv, rij, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_grad_r_gpu_rocm( + grad_net, + grad, in_deriv, rij, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_r_cpu( @@ -262,7 +274,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeAGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -272,4 +284,4 @@ REGISTER_KERNEL_BUILDER( ProdVirialSeRGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc index 14189082a4..00537179c9 100644 --- a/source/op/prod_virial_multi_device.cc +++ b/source/op/prod_virial_multi_device.cc @@ -97,6 +97,12 @@ class ProdVirialSeAOp : public OpKernel { virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_a_gpu_rocm( + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_a_cpu( @@ -183,6 +189,12 @@ class ProdVirialSeROp : public OpKernel { virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_r_gpu_rocm( + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_r_cpu( @@ -206,7 +218,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -216,4 +228,4 @@ REGISTER_KERNEL_BUILDER( ProdVirialSeROp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 8d8f9e82d6..9d54bd18a8 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -67,6 +67,12 @@ class TabulateFusionOp : public OpKernel { descriptor, table, table_info, em_x, em, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_gpu_rocm( + descriptor, + table, table_info, em_x, em, nloc, nnei, last_layer_size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_cpu( @@ -129,6 +135,12 @@ class TabulateFusionGradOp : public OpKernel { dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_grad_gpu_rocm( + dy_dem_x, dy_dem, + table, table_info, em_x, em, dy, nloc, nnei, last_layer_size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_grad_cpu( @@ -150,7 +162,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ @@ -160,4 +172,4 @@ REGISTER_KERNEL_BUILDER( TabulateFusionGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 9cc2b5ef2a..5f23d639a5 100644 --- a/source/op/unaggregated_grad.cc 
+++ b/source/op/unaggregated_grad.cc @@ -42,12 +42,12 @@ struct UnaggregatedDyDxSFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! @@ -70,12 +70,12 @@ struct UnaggregatedDyDxFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template @@ -89,12 +89,12 @@ struct UnaggregatedDy2DxSFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! @@ -122,12 +122,12 @@ struct UnaggregatedDy2DxFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { //Currently, Do nothing at all! 
return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template From 271f23e622435c6a8511deeeb5b3b0414bc1d494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 22 May 2021 16:38:29 +0800 Subject: [PATCH 02/11] modify CMakeLists.txt --- source/CMakeLists.txt | 29 ++++++++++++++++++++--------- source/api_cc/CMakeLists.txt | 7 ++++++- source/api_cc/tests/CMakeLists.txt | 13 ++++++++++--- source/lib/CMakeLists.txt | 5 +++++ source/lib/tests/CMakeLists.txt | 19 +++++++++++-------- source/op/CMakeLists.txt | 19 ++++++++++++++----- 6 files changed, 66 insertions(+), 26 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index e0ab5ad564..092f902beb 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,8 +2,6 @@ cmake_minimum_required(VERSION 3.7) project(DeePMD) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -# note -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") # build cpp or python interfaces if (NOT DEFINED BUILD_CPP_IF) set(BUILD_CPP_IF TRUE) @@ -75,6 +73,14 @@ if (USE_CUDA_TOOLKIT) endif() #define USE_ROCM_TOOLKIT +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if (DEFINED USE_ROCM_TOOLKIT) if (USE_ROCM_TOOLKIT) find_package(HIP REQUIRED) @@ -97,6 +103,11 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) @@ -221,13 +232,13 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_CC "deepmd_cc") if (USE_CUDA_TOOLKIT) set 
(LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") - else() - if(USE_ROCM_TOOLKIT) - set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") - else () - set (LIB_DEEPMD_OP_DEVICE "deepmd_op") - endif() - endif() + endif(USE_CUDA_TOOLKIT) + if (USE_ROCM_TOOLKIT) + set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") + endif(USE_ROCM_TOOLKIT) + if ((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + set (LIB_DEEPMD_OP_DEVICE "deepmd_op") + endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") set (LIB_DEEPMD_IPI "deepmd_ipi") diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index f2019928c9..cbb62df1b3 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -8,11 +8,15 @@ configure_file( @ONLY ) +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() - file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) @@ -26,6 +30,7 @@ if (USE_ROCM_TOOLKIT) target_link_libraries(${libname} ${HIP_LIBRARIES}) endif() + install(TARGETS ${libname} DESTINATION lib/) install( diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index e0785298b8..59e6bbc1cc 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -110,6 +110,10 @@ if (USE_ROCM_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -128,11 +132,14 @@ 
endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) -elseif(USE_ROCM_TOOLKIT) +endif(USE_CUDA_TOOLKIT) +if(USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread deepmd_op_rocm ${TensorFlow_LIBRARY} coverage_config) -else() +endif(USE_ROCM_TOOLKIT) +if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) -endif() +endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + add_test( runUnitTests runUnitTests ) find_package(GTest) diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 8c15c1d64b..c34543ab40 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -2,6 +2,11 @@ set (libname ${LIB_DEEPMD}) +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index d6908a18ce..36367797b4 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -62,7 +62,7 @@ if (DEFINED USE_ROCM_TOOLKIT) message(STATUS "Will not build rocm GPU support") endif() else() - find_package(HIP REQUIRED) + find_package(HIP QUIET) if (HIP_FOUND) link_directories(${HIP_ROOT_DIR}/lib) add_definitions("-DTENSORFLOW_USE_ROCM") @@ -80,6 +80,10 @@ if (USE_ROCM_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently 
supported") +endif() file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -106,14 +110,13 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) -elseif() - set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - message(status"EXTRA_LIBS:${EXTRA_LIBS}") - message(status"EXTRA_LIBS:${HIP}") - target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc ${EXTRA_LIBS} coverage_config) -else() +endif(USE_CUDA_TOOLKIT) +if (USE_ROCM_TOOLKIT) + target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc deepmd_op_rocm coverage_config) +endif(USE_ROCM_TOOLKIT) +if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) -endif() +endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) add_test( runUnitTests runUnitTests ) # include(GoogleTest) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index dbb469f6ef..eb456f9a97 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -9,18 +9,25 @@ file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + if (BUILD_CPP_IF) if (USE_CUDA_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) - elseif() + endif(USE_CUDA_TOOLKIT) + if(USE_ROCM_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) find_package(HIP REQUIRED) 
include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) - else() + endif(USE_ROCM_TOOLKIT) + if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) endif() endif (BUILD_CPP_IF) @@ -35,17 +42,19 @@ if (BUILD_PY_IF) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - elseif() + endif(USE_CUDA_TOOLKIT) + if(USE_ROCM_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) find_package(HIP REQUIRED) include_directories(${HIP_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - else() + endif(USE_ROCM_TOOLKIT) + if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) - endif(USE_CUDA_TOOLKIT) + endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries(op_abi ${LIB_DEEPMD}) From 5e36affdb0a83ed7af9bf29ce3d5ea4ca68cdc33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Thu, 27 May 2021 10:27:02 +0800 Subject: [PATCH 03/11] modify CMakeLists.txt Fixed logic issues, but the absolute path has not yet been modified, leaving multiple errors thrown in the CMakeList --- source/CMakeLists.txt | 52 ++++++---------------- source/api_cc/CMakeLists.txt | 4 +- source/lib/CMakeLists.txt | 5 +-- source/lib/src/rocm/CMakeLists.txt | 19 ++------ source/lib/tests/CMakeLists.txt | 60 ++++++-------------------- source/op/CMakeLists.txt | 3 +- source/op/prod_env_mat_multi_device.cc | 2 +- 7 files changed, 35 insertions(+), 110 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 092f902beb..851873b063 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -51,26 
+51,17 @@ file(READ ${PROJECT_SOURCE_DIR}/config/MODEL_VER MODEL_VERSION) string(REPLACE "\n" " " MODEL_VERSION ${MODEL_VERSION}) message(STATUS "Supported model version: ${MODEL_VERSION}") -# define USE_CUDA_TOOLKIT -if (DEFINED USE_CUDA_TOOLKIT) - if (USE_CUDA_TOOLKIT) - find_package(CUDA REQUIRED) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(CUDA QUIET) - if (CUDA_FOUND) - set(USE_CUDA_TOOLKIT TRUE) - message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") - else() - set(USE_CUDA_TOOLKIT FALSE) - message(STATUS "No cuda support found, will not build nv GPU support") - endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") endif() + +# define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) + find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") -endif() + message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") +endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if(NOT DEFINED HIP_PATH) @@ -81,32 +72,13 @@ if(NOT DEFINED HIP_PATH) endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) -if (DEFINED USE_ROCM_TOOLKIT) - if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(HIP QUIET) - if (HIP_FOUND) - set(USE_ROCM_TOOLKIT TRUE) - add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") - else() - set(USE_ROCM_TOOLKIT FALSE) - message(STATUS "No rocm support found, will not build AMD GPU support") - endif() -endif() if (USE_ROCM_TOOLKIT) + find_package(HIP REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") -endif() + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in 
${HIP_ROOT_DIR}, build AMD GPU support") +endif (USE_ROCM_TOOLKIT) -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index cbb62df1b3..9e77e9fa2e 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -1,5 +1,4 @@ # libmd - set (libname ${LIB_DEEPMD_CC}) configure_file( @@ -27,7 +26,8 @@ if (USE_CUDA_TOOLKIT) endif() if (USE_ROCM_TOOLKIT) - target_link_libraries(${libname} ${HIP_LIBRARIES}) + LINK_LIBRARIES(${HIP_ROOT_DIR}\lib) + target_link_libraries(${libname} hip_hcc) endif() diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index c34543ab40..8ffbe8c192 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -1,5 +1,4 @@ # libmd - set (libname ${LIB_DEEPMD}) # Devices that have both ROCM and CUDA are not currently supported @@ -15,7 +14,6 @@ file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) add_library(${libname} SHARED ${LIB_SRC}) - if (USE_CUDA_TOOLKIT) add_definitions("-D GOOGLE_CUDA") add_subdirectory(src/cuda) @@ -27,7 +25,8 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") add_subdirectory(src/rocm) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - target_link_libraries (${libname} ${HIP_LIBRARIES} ${EXTRA_LIBS}) + LINK_LIBRARIES(${HIP_ROOT_DIR}/lib) + target_link_libraries (${libname} ${EXTRA_LIBS} hip_hcc) endif() if(BUILD_PY_IF) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 27c4d62f68..bd46a9f432 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -1,29 +1,17 @@ # required cmake version -cmake_minimum_required(VERSION 3.15) 
+cmake_minimum_required(VERSION 3.5) # project name project(deepmd_op_rocm) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) -link_directories(${HIP_ROOT_DIR}/lib) add_definitions("-DTENSORFLOW_USE_ROCM") # set c++ version c++11 -#SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_HIP_STANDARD 11) message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) -message(STATUS "HIP major version is " ${HIP_TOOLKIT_ROOT_DIR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) @@ -34,9 +22,8 @@ set (SOURCE_FILES ) hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) -target_link_libraries(deepmd_op_rocm ${HIP_LIBRARIES}) -#install(TARGETS deepmd_op_rocm DESTINATION lib/) +install(TARGETS deepmd_op_rocm DESTINATION lib/) if (BUILD_CPP_IF) install(TARGETS deepmd_op_rocm DESTINATION lib/) endif (BUILD_CPP_IF) diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 36367797b4..67ad0a7c93 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -18,29 +18,19 @@ message(status "${CMAKE_SOURCE_DIR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -# define USE_CUDA_TOOLKIT -if (DEFINED USE_CUDA_TOOLKIT) - if (USE_CUDA_TOOLKIT) - find_package(CUDA REQUIRED) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(CUDA QUIET) - if (CUDA_FOUND) - set(USE_CUDA_TOOLKIT TRUE) - message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") - else() - 
set(USE_CUDA_TOOLKIT FALSE) - message(STATUS "No cuda support found, will not build nv GPU support") - endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") endif() +# define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) + find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") + message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) -endif() +endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if(NOT DEFINED HIP_PATH) @@ -51,39 +41,15 @@ if(NOT DEFINED HIP_PATH) endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - -if (DEFINED USE_ROCM_TOOLKIT) - if(USE_ROCM_TOOLKIT) - link_directories(${HIP_ROOT_DIR}/lib) - add_definitions("-DTENSORFLOW_USE_ROCM") - find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) - else() - message(STATUS "Will not build rocm GPU support") - endif() -else() - find_package(HIP QUIET) - if (HIP_FOUND) - link_directories(${HIP_ROOT_DIR}/lib) - add_definitions("-DTENSORFLOW_USE_ROCM") - set(USE_ROCM_TOOLKIT TRUE) - add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build ROCM GPU support") - else() - set(USE_ROCM_TOOLKIT FALSE) - message(STATUS "No ROCM support found, will not build ROCM GPU support") - endif() -endif() if (USE_ROCM_TOOLKIT) - add_definitions("-DUSE_ROCM_TOOLKIT") - include_directories(${ROCM_INCLUDE_DIRS}) + find_package(HIP REQUIRED) + link_directories(${HIP_ROOT_DIR}/lib) + add_definitions("-D TENSORFLOW_USE_ROCM") + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build AMD GPU support") add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) -endif() +endif (USE_ROCM_TOOLKIT) -# Devices that have both ROCM and 
CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index eb456f9a97..78c0a12803 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -25,7 +25,8 @@ if (BUILD_CPP_IF) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) find_package(HIP REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) - target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) + link_directories(${HIP_ROOT_DIR}/lib) + target_link_libraries (${LIB_DEEPMD_OP} hpi_hcc) endif(USE_ROCM_TOOLKIT) if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 64d40acff4..e4e12cac2b 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -441,7 +441,7 @@ class ProdEnvMatAOp : public OpKernel { deepmd::delete_device_memory(firstneigh); #endif //GOOGLE_CUDA - #if TENSORFLOW_USE_ROCM + #if TENSORFLOW_USE_ROCM int * idx_mapping = NULL; int * ilist = NULL, * numneigh = NULL; int ** firstneigh = NULL; From 29ee994ac3ac5d233e9bed892714aa3b92a00105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Thu, 27 May 2021 21:32:36 +0800 Subject: [PATCH 04/11] ADD FindROCM.cmake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1、ADD FindROCM.cmake 2、modify CMAKELISTS.TXT --- source/CMakeLists.txt | 12 +---- source/api_cc/CMakeLists.txt | 7 ++- source/api_cc/tests/CMakeLists.txt | 64 +++++------------------ source/cmake/FindROCM.cmake | 83 ++++++++++++++++++++++++++++++ source/lib/CMakeLists.txt | 13 +++-- source/lib/src/rocm/CMakeLists.txt | 8 +-- source/lib/tests/CMakeLists.txt | 16 ++---- 
source/op/CMakeLists.txt | 11 ++-- 8 files changed, 123 insertions(+), 91 deletions(-) create mode 100644 source/cmake/FindROCM.cmake diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 851873b063..dd4b0039e1 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -64,19 +64,11 @@ if (USE_CUDA_TOOLKIT) endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) + find_package(ROCM REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build AMD GPU support") + message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support") endif (USE_ROCM_TOOLKIT) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 9e77e9fa2e..ad07b9da53 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -16,6 +16,10 @@ if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() +if (USE_ROCM_TOOLKIT) + include_directories("${ROCM_INCLUDE_DIRS}") +endif() + file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) @@ -26,8 +30,7 @@ if (USE_CUDA_TOOLKIT) endif() if (USE_ROCM_TOOLKIT) - LINK_LIBRARIES(${HIP_ROOT_DIR}\lib) - target_link_libraries(${libname} hip_hcc) + target_link_libraries (${libname} ${ROCM_LIBRARIES}) endif() diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 59e6bbc1cc..66901011a5 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -52,68 +52,28 @@ if (OPENMP_FOUND) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
${OpenMP_CXX_FLAGS}") endif() -# define USE_CUDA_TOOLKIT -if (DEFINED USE_CUDA_TOOLKIT) - if (USE_CUDA_TOOLKIT) - find_package(CUDA REQUIRED) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(CUDA QUIET) - if (CUDA_FOUND) - set(USE_CUDA_TOOLKIT TRUE) - message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") - else() - set(USE_CUDA_TOOLKIT FALSE) - message(STATUS "No cuda support found, will not build nv GPU support") - endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") endif() +# define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) + find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") + message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) -endif() - -# define USE_ROCM_TOOLKIT -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) -if (DEFINED USE_ROCM_TOOLKIT) - if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) - else() - message(STATUS "Will not build AMD GPU support") - endif() -else() - find_package(HIP QUIET) - if (HIP_FOUND) - set(USE_ROCM_TOOLKIT TRUE) - add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") - else() - set(USE_ROCM_TOOLKIT FALSE) - message(STATUS "No rocm support found, will not build AMD GPU support") - endif() -endif() +endif(USE_CUDA_TOOLKIT) +#define USE_ROCM_TOOLKIT if (USE_ROCM_TOOLKIT) + find_package(ROCM 
REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") + add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) -endif() - -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() +endif (USE_ROCM_TOOLKIT) file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -134,7 +94,7 @@ if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) endif(USE_CUDA_TOOLKIT) if(USE_ROCM_TOOLKIT) - target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread deepmd_op_rocm ${TensorFlow_LIBRARY} coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_rocm coverage_config) endif(USE_ROCM_TOOLKIT) if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake new file mode 100644 index 0000000000..d012e7772c --- /dev/null +++ b/source/cmake/FindROCM.cmake @@ -0,0 +1,83 @@ +# Input: +# ROCM_ROOT +# +# Output: +# ROCM_FOUND +# ROCM_INCLUDE_DIRS +# ROCM_LIBRARIES + +# define the search path +list(APPEND ROCM_search_PATHS ${ROCM_ROOT}) +list(APPEND ROCM_search_PATHS "/opt/rocm/") +list(APPEND HIP_search_PATHS ${ROCM_ROOT}/hip) +list(APPEND HIP_search_PATHS "/opt/rocm/hip") + +# define the libs to find +if (NOT ROCM_FIND_COMPONENTS) + set(ROCM_FIND_COMPONENTS hip_hcc hiprtc) +endif () + +# includes +find_path (ROCM_INCLUDE_DIRS + NAMES + hip/hip_runtime.h + rocprim/rocprim.hpp + hipcub/hipcub.hpp + PATHS 
${ROCM_search_PATHS} + PATH_SUFFIXES "include" + NO_DEFAULT_PATH + ) +if (NOT ROCM_INCLUDE_DIRS AND ROCM_FIND_REQUIRED) + message(FATAL_ERROR + "Not found 'hip' or 'rocprim' or 'hipcub' directory in path '${ROCM_search_PATHS}' " + "You can manually set the ROCM install path by -DROCM_ROOT ") +endif () + +# libs +foreach (module ${ROCM_FIND_COMPONENTS}) + find_library(ROCM_LIBRARIES_${module} + NAMES ${module} + PATHS ${ROCM_search_PATHS} PATH_SUFFIXES "lib" NO_DEFAULT_PATH + ) + if (ROCM_LIBRARIES_${module}) + list(APPEND ROCM_LIBRARIES ${ROCM_LIBRARIES_${module}}) + elseif (ROCM_FIND_REQUIRED) + message(FATAL_ERROR + "Not found lib/'${module}' in '${ROCM_search_PATHS}' " + "You can manually set the ROCM install path by -DROCM_ROOT ") + endif () +endforeach () + +# FindHIP.cmake +find_path (HIP_CMAKE + NAMES + FindHIP.cmake + PATHS ${HIP_search_PATHS} + PATH_SUFFIXES "cmake" + NO_DEFAULT_PATH + ) + +if (NOT HIP_CMAKE AND ROCM_FIND_REQUIRED) + message(FATAL_ERROR + "Not found 'FindHIP.cmake' file in path '${ROCM_search_PATHS}' " + "You can manually set the ROCM install path by -DROCM_ROOT ") +endif () + +list (APPEND CMAKE_MODULE_PATH ${HIP_CMAKE}) +find_package(HIP) + +# define the output variable +if (ROCM_INCLUDE_DIRS AND ROCM_LIBRARIES AND HIP_CMAKE) + set(ROCM_FOUND TRUE) +else () + set(ROCM_FOUND FALSE) +endif () + +# print message +if (NOT ROCM_FIND_QUIETLY) + message(STATUS "Found ROCM: ${ROCM_INCLUDE_DIRS}, ${ROCM_LIBRARIES}, ${HIP_CMAKE}" + " in ${ROCM_search_PATHS}") +endif () + +unset(ROCM_search_PATHS) +unset(HIP_search_PATHS) \ No newline at end of file diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 8ffbe8c192..895471442d 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -1,19 +1,19 @@ # libmd set (libname ${LIB_DEEPMD}) -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not 
currently supported") -endif() - if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() +if (USE_ROCM_TOOLKIT) + include_directories("${ROCM_INCLUDE_DIRS}") +endif() + file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) add_library(${libname} SHARED ${LIB_SRC}) + if (USE_CUDA_TOOLKIT) add_definitions("-D GOOGLE_CUDA") add_subdirectory(src/cuda) @@ -25,8 +25,7 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") add_subdirectory(src/rocm) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - LINK_LIBRARIES(${HIP_ROOT_DIR}/lib) - target_link_libraries (${libname} ${EXTRA_LIBS} hip_hcc) + target_link_libraries (${libname} ${ROCM_LIBRARIES} ${EXTRA_LIBS}) endif() if(BUILD_PY_IF) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index bd46a9f432..839a51680d 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -3,9 +3,12 @@ cmake_minimum_required(VERSION 3.5) # project name project(deepmd_op_rocm) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -find_package(HIP REQUIRED) + +#find package ROCM +list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../../cmake/) +find_package(ROCM REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) -add_definitions("-DTENSORFLOW_USE_ROCM") # set c++ version c++11 SET(CMAKE_CXX_STANDARD 11) @@ -13,7 +16,6 @@ SET(CMAKE_HIP_STANDARD 11) message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) - set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 67ad0a7c93..3c738e2973 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -33,20 +33,12 @@ if (USE_CUDA_TOOLKIT) endif(USE_CUDA_TOOLKIT) #define 
USE_ROCM_TOOLKIT -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) - link_directories(${HIP_ROOT_DIR}/lib) + list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake/) + find_package(ROCM REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build AMD GPU support") + include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) endif (USE_ROCM_TOOLKIT) @@ -78,7 +70,7 @@ if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) endif(USE_CUDA_TOOLKIT) if (USE_ROCM_TOOLKIT) - target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc deepmd_op_rocm coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_rocm coverage_config ${ROCM_LIBRARIES}) endif(USE_ROCM_TOOLKIT) if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 78c0a12803..b03eb8a2b0 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -9,6 +9,8 @@ file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) +list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../cmake/) + # Devices that have both ROCM and CUDA are not currently supported if 
(USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") @@ -23,10 +25,9 @@ if (BUILD_CPP_IF) endif(USE_CUDA_TOOLKIT) if(USE_ROCM_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) - find_package(HIP REQUIRED) + find_package(ROCM REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) - link_directories(${HIP_ROOT_DIR}/lib) - target_link_libraries (${LIB_DEEPMD_OP} hpi_hcc) + target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) endif(USE_ROCM_TOOLKIT) if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) @@ -47,8 +48,8 @@ if (BUILD_PY_IF) if(USE_ROCM_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) - find_package(HIP REQUIRED) - include_directories(${HIP_INCLUDE_DIRS}) + find_package(ROCM REQUIRED) + include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) endif(USE_ROCM_TOOLKIT) From 98c149ab33dd3adbe2b3a0fda1b3c65a6cf4af58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Fri, 28 May 2021 19:58:14 +0800 Subject: [PATCH 05/11] modify FindROCM.cmake --- source/cmake/FindROCM.cmake | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake index d012e7772c..c9fa80ec5d 100644 --- a/source/cmake/FindROCM.cmake +++ b/source/cmake/FindROCM.cmake @@ -9,8 +9,6 @@ # define the search path list(APPEND ROCM_search_PATHS ${ROCM_ROOT}) list(APPEND ROCM_search_PATHS "/opt/rocm/") -list(APPEND HIP_search_PATHS ${ROCM_ROOT}/hip) -list(APPEND HIP_search_PATHS "/opt/rocm/hip") # define the libs to find if (NOT ROCM_FIND_COMPONENTS) @@ -52,8 +50,8 @@ endforeach () find_path (HIP_CMAKE NAMES FindHIP.cmake - PATHS ${HIP_search_PATHS} - PATH_SUFFIXES "cmake" + PATHS ${ROCM_search_PATHS} + PATH_SUFFIXES 
"hip/cmake" NO_DEFAULT_PATH ) @@ -79,5 +77,4 @@ if (NOT ROCM_FIND_QUIETLY) " in ${ROCM_search_PATHS}") endif () -unset(ROCM_search_PATHS) -unset(HIP_search_PATHS) \ No newline at end of file +unset(ROCM_search_PATHS) \ No newline at end of file From 5d40b4a6f2954348592b8020bcd7b031a70f2993 Mon Sep 17 00:00:00 2001 From: pkulzy <47965866+pkulzy@users.noreply.github.com> Date: Fri, 28 May 2021 20:06:57 +0800 Subject: [PATCH 06/11] modify FindROCM.cmake --- source/cmake/FindROCM.cmake | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake index d012e7772c..e70c560084 100644 --- a/source/cmake/FindROCM.cmake +++ b/source/cmake/FindROCM.cmake @@ -9,8 +9,6 @@ # define the search path list(APPEND ROCM_search_PATHS ${ROCM_ROOT}) list(APPEND ROCM_search_PATHS "/opt/rocm/") -list(APPEND HIP_search_PATHS ${ROCM_ROOT}/hip) -list(APPEND HIP_search_PATHS "/opt/rocm/hip") # define the libs to find if (NOT ROCM_FIND_COMPONENTS) @@ -52,8 +50,8 @@ endforeach () find_path (HIP_CMAKE NAMES FindHIP.cmake - PATHS ${HIP_search_PATHS} - PATH_SUFFIXES "cmake" + PATHS ${ROCM_search_PATHS} + PATH_SUFFIXES "hip/cmake" NO_DEFAULT_PATH ) @@ -80,4 +78,3 @@ if (NOT ROCM_FIND_QUIETLY) endif () unset(ROCM_search_PATHS) -unset(HIP_search_PATHS) \ No newline at end of file From cd17a217a5e7a91cacc75f13533ebf20a1d673d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 29 May 2021 22:24:08 +0800 Subject: [PATCH 07/11] Fixed issue --- source/CMakeLists.txt | 12 +++++++----- source/api_cc/CMakeLists.txt | 5 ----- source/api_cc/tests/CMakeLists.txt | 12 +++++++----- source/lib/src/rocm/CMakeLists.txt | 10 +--------- source/lib/tests/CMakeLists.txt | 12 +++++++----- source/op/CMakeLists.txt | 21 +++++---------------- 6 files changed, 27 insertions(+), 45 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index dd4b0039e1..5feb38f3ac 100644 --- a/source/CMakeLists.txt +++ 
b/source/CMakeLists.txt @@ -61,6 +61,8 @@ if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") +else() + message(STATUS "Will not build nv GPU support") endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT @@ -69,6 +71,8 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support") +else() + message(STATUS "Will not build AMD GPU support") endif (USE_ROCM_TOOLKIT) @@ -196,13 +200,11 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_CC "deepmd_cc") if (USE_CUDA_TOOLKIT) set (LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") - endif(USE_CUDA_TOOLKIT) - if (USE_ROCM_TOOLKIT) + elseif (USE_ROCM_TOOLKIT) set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") - endif(USE_ROCM_TOOLKIT) - if ((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + else() set (LIB_DEEPMD_OP_DEVICE "deepmd_op") - endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") set (LIB_DEEPMD_IPI "deepmd_ipi") diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index ad07b9da53..cfdfce9b0e 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -7,11 +7,6 @@ configure_file( @ONLY ) -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() - if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 66901011a5..67a2c1c467 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -64,6 +64,8 @@ if (USE_CUDA_TOOLKIT) message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build 
nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) +else() + message(STATUS "Will not build nv GPU support") endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT @@ -73,6 +75,8 @@ if (USE_ROCM_TOOLKIT) add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +else() + message(STATUS "Will not build AMD GPU support") endif (USE_ROCM_TOOLKIT) file(GLOB TEST_SRC test_*.cc) @@ -92,13 +96,11 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) -endif(USE_CUDA_TOOLKIT) -if(USE_ROCM_TOOLKIT) +elseif(USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_rocm coverage_config) -endif(USE_ROCM_TOOLKIT) -if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +else() target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) -endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +endif() add_test( runUnitTests runUnitTests ) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 839a51680d..63c04360d9 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -4,12 +4,6 @@ cmake_minimum_required(VERSION 3.5) project(deepmd_op_rocm) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -#find package ROCM -list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../../cmake/) -find_package(ROCM REQUIRED) - -add_compile_definitions(__HIP_PLATFORM_HCC__) - # set c++ version c++11 SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_HIP_STANDARD 11) @@ -19,9 +13,7 @@ message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; 
-D__HIP_PLATFORM_HCC__) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -set (SOURCE_FILES - prod_env_mat.hip.cu prod_force.hip.cu prod_virial.hip.cu gelu.hip.cu tabulate.hip.cu coord.hip.cu neighbor_list.hip.cu prod_force_grad.hip.cu prod_virial_grad.hip.cu region.hip.cu -) +set (SOURCE_FILES *.hip.cu ) hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 3c738e2973..4dc591b0eb 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -30,6 +30,8 @@ if (USE_CUDA_TOOLKIT) message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) +else() + message(STATUS "Will not build nv GPU support") endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT @@ -40,6 +42,8 @@ if (USE_ROCM_TOOLKIT) add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +else() + message(STATUS "Will not build AMD GPU support") endif (USE_ROCM_TOOLKIT) @@ -68,13 +72,11 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) -endif(USE_CUDA_TOOLKIT) -if (USE_ROCM_TOOLKIT) +elseif (USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_rocm coverage_config ${ROCM_LIBRARIES}) -endif(USE_ROCM_TOOLKIT) -if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +else() target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) -endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +endif() add_test( runUnitTests runUnitTests ) # include(GoogleTest) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index b03eb8a2b0..340c5601fb 100644 --- 
a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -9,27 +9,18 @@ file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) -list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../cmake/) - -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() - if (BUILD_CPP_IF) if (USE_CUDA_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) - endif(USE_CUDA_TOOLKIT) - if(USE_ROCM_TOOLKIT) + elseif (USE_ROCM_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) find_package(ROCM REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) - endif(USE_ROCM_TOOLKIT) - if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) + else () add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) endif() endif (BUILD_CPP_IF) @@ -44,19 +35,17 @@ if (BUILD_PY_IF) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - endif(USE_CUDA_TOOLKIT) - if(USE_ROCM_TOOLKIT) + elseif(USE_ROCM_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) find_package(ROCM REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - endif(USE_ROCM_TOOLKIT) - if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + else() add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED 
${OP_GRADS_SRC}) - endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + endif() message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries(op_abi ${LIB_DEEPMD}) From 4fb9b3be15d7451421f6ba4b95613191757cda2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 29 May 2021 22:32:59 +0800 Subject: [PATCH 08/11] Update FindROCM.cmake --- source/cmake/FindROCM.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake index e70c560084..cab69e4999 100644 --- a/source/cmake/FindROCM.cmake +++ b/source/cmake/FindROCM.cmake @@ -74,7 +74,7 @@ endif () # print message if (NOT ROCM_FIND_QUIETLY) message(STATUS "Found ROCM: ${ROCM_INCLUDE_DIRS}, ${ROCM_LIBRARIES}, ${HIP_CMAKE}" - " in ${ROCM_search_PATHS}") + " in ${ROCM_search_PATHS}, build AMD GPU support") endif () unset(ROCM_search_PATHS) From 7f6273e278f69445a1ca2afab3fc8b8ae2f90e60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 29 May 2021 23:41:15 +0800 Subject: [PATCH 09/11] fix issue about wildcard --- source/CMakeLists.txt | 4 ++-- source/api_cc/tests/CMakeLists.txt | 4 ++-- source/lib/CMakeLists.txt | 4 ++-- source/lib/src/cuda/CMakeLists.txt | 4 +--- source/lib/src/rocm/CMakeLists.txt | 2 +- source/lib/tests/CMakeLists.txt | 4 ++-- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 5feb38f3ac..1c89c3cf25 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -59,7 +59,7 @@ endif() # define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") else() message(STATUS "Will not build nv GPU support") @@ -68,7 +68,7 @@ endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if (USE_ROCM_TOOLKIT) find_package(ROCM REQUIRED) - add_definitions("-D 
TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support") else() diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 67a2c1c467..111c7646bb 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -60,7 +60,7 @@ endif() # define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) @@ -71,7 +71,7 @@ endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if (USE_ROCM_TOOLKIT) find_package(ROCM REQUIRED) - add_definitions("-D TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 895471442d..c1369c23ee 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -15,14 +15,14 @@ file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) add_library(${libname} SHARED ${LIB_SRC}) if (USE_CUDA_TOOLKIT) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") add_subdirectory(src/cuda) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) target_link_libraries (${libname} ${CUDA_LIBRARIES} ${EXTRA_LIBS}) endif() if (USE_ROCM_TOOLKIT) - add_definitions("-D TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_subdirectory(src/rocm) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) target_link_libraries (${libname} ${ROCM_LIBRARIES} ${EXTRA_LIBS}) diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt index 41a2ea091e..0c57e4e396 
100644 --- a/source/lib/src/cuda/CMakeLists.txt +++ b/source/lib/src/cuda/CMakeLists.txt @@ -105,9 +105,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -set (SOURCE_FILES - prod_env_mat.cu prod_force.cu prod_force_grad.cu prod_virial.cu prod_virial_grad.cu gelu.cu tabulate.cu coord.cu neighbor_list.cu region.cu -) +file (GLOB SOURCE_FILES "*.cu" ) cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 63c04360d9..6caf7cb658 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -13,7 +13,7 @@ message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -set (SOURCE_FILES *.hip.cu ) +file (GLOB SOURCE_FILES "*.hip.cu" ) hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 4dc591b0eb..b12734af9b 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -26,7 +26,7 @@ endif() # define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) @@ -38,7 +38,7 @@ endif(USE_CUDA_TOOLKIT) if (USE_ROCM_TOOLKIT) list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake/) find_package(ROCM REQUIRED) - add_definitions("-D TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) 
include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) From 18a79682216c34746297c3952e583a5307c379aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sun, 30 May 2021 22:18:14 +0800 Subject: [PATCH 10/11] Modified duplicate --- source/lib/src/rocm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 6caf7cb658..903d1efec5 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -10,7 +10,7 @@ SET(CMAKE_HIP_STANDARD 11) message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) -set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) +set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") file (GLOB SOURCE_FILES "*.hip.cu" ) From f1535192b1ac0d36631b0ca77fbc77e7c264d0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Tue, 1 Jun 2021 19:06:30 +0800 Subject: [PATCH 11/11] Use add_definitions to replace set(CMAKE_CXX_FLAGS) --- source/lib/src/rocm/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 903d1efec5..393844b8bb 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -7,11 +7,12 @@ set(CMAKE_LINK_WHAT_YOU_USE TRUE) # set c++ version c++11 SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_HIP_STANDARD 11) +add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") +add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11) 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") file (GLOB SOURCE_FILES "*.hip.cu" )