From 57cb80c7836eb08cb7700c76022ec3e3a5ed23ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Fri, 21 May 2021 17:17:42 +0800 Subject: [PATCH 01/11] PR --- source/CMakeLists.txt | 35 +- source/api_cc/CMakeLists.txt | 5 + source/api_cc/include/custom_op.h | 4 +- source/api_cc/src/DeepPot.cc | 41 ++ source/api_cc/tests/CMakeLists.txt | 41 +- source/lib/CMakeLists.txt | 7 + source/lib/include/coord.h | 44 ++ source/lib/include/device.h | 6 +- source/lib/include/fmt_nlist.h | 28 + source/lib/include/gelu.h | 23 + source/lib/include/gpu_rocm.h | 115 ++++ source/lib/include/neighbor_list.h | 42 ++ source/lib/include/prod_env_mat.h | 50 ++ source/lib/include/prod_force.h | 22 + source/lib/include/prod_force_grad.h | 19 + source/lib/include/prod_virial.h | 27 + source/lib/include/prod_virial_grad.h | 22 + source/lib/include/region.h | 22 + source/lib/include/tabulate.h | 25 + source/lib/src/neighbor_list.cc | 33 ++ source/lib/src/prod_env_mat.cc | 65 +++ source/lib/src/rocm/CMakeLists.txt | 45 ++ source/lib/src/rocm/coord.hip.cu | 414 ++++++++++++++ source/lib/src/rocm/gelu.hip.cu | 98 ++++ source/lib/src/rocm/neighbor_list.hip.cu | 184 +++++++ source/lib/src/rocm/prod_env_mat.hip.cu | 579 ++++++++++++++++++++ source/lib/src/rocm/prod_force.hip.cu | 161 ++++++ source/lib/src/rocm/prod_force_grad.hip.cu | 143 +++++ source/lib/src/rocm/prod_virial.hip.cu | 176 ++++++ source/lib/src/rocm/prod_virial_grad.hip.cu | 141 +++++ source/lib/src/rocm/region.hip.cu | 69 +++ source/lib/src/rocm/tabulate.hip.cu | 246 +++++++++ source/lib/tests/CMakeLists.txt | 46 ++ source/lib/tests/test_coord.cc | 396 ++++++++++++- source/lib/tests/test_env_mat_a.cc | 213 +++++++ source/lib/tests/test_env_mat_r.cc | 203 +++++++ source/lib/tests/test_fmt_nlist.cc | 155 +++++- source/lib/tests/test_gelu.cc | 70 +++ source/lib/tests/test_neighbor_list.cc | 113 +++- source/lib/tests/test_prod_force_a.cc | 30 + source/lib/tests/test_prod_force_grad_a.cc | 30 + 
source/lib/tests/test_prod_force_grad_r.cc | 30 + source/lib/tests/test_prod_force_r.cc | 30 + source/lib/tests/test_prod_virial_a.cc | 47 ++ source/lib/tests/test_prod_virial_grad_a.cc | 36 +- source/lib/tests/test_prod_virial_grad_r.cc | 33 ++ source/lib/tests/test_prod_virial_r.cc | 47 ++ source/lib/tests/test_simulation_region.cc | 74 ++- source/lib/tests/test_tabulate.cc | 60 ++ source/lmp/env.sh.in | 2 +- source/op/CMakeLists.txt | 37 +- source/op/custom_op.h | 4 +- source/op/gelu_multi_device.cc | 22 +- source/op/prod_env_mat_multi_device.cc | 371 ++++++++++++- source/op/prod_force_grad_multi_device.cc | 16 +- source/op/prod_force_multi_device.cc | 17 +- source/op/prod_virial_grad_multi_device.cc | 16 +- source/op/prod_virial_multi_device.cc | 16 +- source/op/tabulate_multi_device.cc | 16 +- source/op/unaggregated_grad.cc | 16 +- 60 files changed, 5029 insertions(+), 49 deletions(-) create mode 100644 source/lib/include/gpu_rocm.h create mode 100644 source/lib/src/rocm/CMakeLists.txt create mode 100644 source/lib/src/rocm/coord.hip.cu create mode 100644 source/lib/src/rocm/gelu.hip.cu create mode 100644 source/lib/src/rocm/neighbor_list.hip.cu create mode 100644 source/lib/src/rocm/prod_env_mat.hip.cu create mode 100644 source/lib/src/rocm/prod_force.hip.cu create mode 100644 source/lib/src/rocm/prod_force_grad.hip.cu create mode 100644 source/lib/src/rocm/prod_virial.hip.cu create mode 100644 source/lib/src/rocm/prod_virial_grad.hip.cu create mode 100644 source/lib/src/rocm/region.hip.cu create mode 100644 source/lib/src/rocm/tabulate.hip.cu diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 908edd2b0a..e0ab5ad564 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.7) project(DeePMD) set(CMAKE_LINK_WHAT_YOU_USE TRUE) +# note +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") # build cpp or python interfaces if (NOT DEFINED BUILD_CPP_IF) set(BUILD_CPP_IF TRUE) @@ -72,6 
+74,29 @@ if (USE_CUDA_TOOLKIT) add_definitions("-D GOOGLE_CUDA") endif() +#define USE_ROCM_TOOLKIT +if (DEFINED USE_ROCM_TOOLKIT) + if (USE_ROCM_TOOLKIT) + find_package(HIP REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) + else() + message(STATUS "Will not build nv GPU support") + endif() +else() + find_package(HIP QUIET) + if (HIP_FOUND) + set(USE_ROCM_TOOLKIT TRUE) + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") + else() + set(USE_ROCM_TOOLKIT FALSE) + message(STATUS "No rocm support found, will not build AMD GPU support") + endif() +endif() +if (USE_ROCM_TOOLKIT) + add_definitions("-D TENSORFLOW_USE_ROCM") +endif() + # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) @@ -195,9 +220,13 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_OP "deepmd_op") set (LIB_DEEPMD_CC "deepmd_cc") if (USE_CUDA_TOOLKIT) - set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda") - else () - set (LIB_DEEPMD_OP_CUDA "deepmd_op") + set (LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") + else() + if(USE_ROCM_TOOLKIT) + set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") + else () + set (LIB_DEEPMD_OP_DEVICE "deepmd_op") + endif() endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 855732b26b..f2019928c9 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -12,6 +12,7 @@ if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() + file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) @@ -21,6 +22,10 @@ if (USE_CUDA_TOOLKIT) target_link_libraries (${libname} ${CUDA_LIBRARIES}) endif() +if (USE_ROCM_TOOLKIT) + target_link_libraries(${libname} ${HIP_LIBRARIES}) +endif() + install(TARGETS ${libname} DESTINATION lib/) install( diff --git a/source/api_cc/include/custom_op.h 
b/source/api_cc/include/custom_op.h index 60e15234aa..bd27821fb3 100644 --- a/source/api_cc/include/custom_op.h +++ b/source/api_cc/include/custom_op.h @@ -23,12 +23,12 @@ struct DeviceFunctor { { device = "CPU"; } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()( std::string& device, const GPUDevice& d) { device = "GPU"; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; \ No newline at end of file diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index e88357e04a..9400b47691 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -19,6 +19,19 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } #endif +#if TENSORFLOW_USE_ROCM +#include + +#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } +inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) +{ + if (code != hipSuccess) + { + fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) exit(code); + } +} +#endif //TENSORFLOW_USE_ROCM static std::vector cum_sum (const std::vector & n_sel) { @@ -217,6 +230,20 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_ graph::SetDefaultDevice(str, &graph_def); } #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + hipGetDeviceCount(&gpu_num); // check current device environment + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); + std::string str = "/gpu:"; + str += std::to_string(gpu_rank % gpu_num); + graph::SetDefaultDevice(str, &graph_def); + } + #endif // TENSORFLOW_USE_ROCM + check_status (NewSession(options, &session)); check_status (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -524,6 +551,10 @@ 
init (const std::vector & models, const int & gpu_rank, const std:: cudaGetDeviceCount(&gpu_num); #endif // GOOGLE_CUDA + #if TENSORFLOW_USE_ROCM + hipGetDeviceCount(&gpu_num); + #endif //TENSORFLOW_USE_ROCM + SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); @@ -542,6 +573,16 @@ init (const std::vector & models, const int & gpu_rank, const std:: } #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); + } + #endif // TENSORFLOW_USE_ROCM + for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { std::string str = "/gpu:"; diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 8692f5427d..e0785298b8 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -48,8 +48,8 @@ find_package(Threads) # find openmp find_package(OpenMP) if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() # define USE_CUDA_TOOLKIT @@ -76,6 +76,41 @@ if (USE_CUDA_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) endif() +# define USE_ROCM_TOOLKIT +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +if (DEFINED USE_ROCM_TOOLKIT) + if (USE_ROCM_TOOLKIT) + 
find_package(HIP REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) + else() + message(STATUS "Will not build AMD GPU support") + endif() +else() + find_package(HIP QUIET) + if (HIP_FOUND) + set(USE_ROCM_TOOLKIT TRUE) + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") + else() + set(USE_ROCM_TOOLKIT FALSE) + message(STATUS "No rocm support found, will not build AMD GPU support") + endif() +endif() + +if (USE_ROCM_TOOLKIT) + add_definitions("-D TENSORFLOW_USE_ROCM") + include_directories(${ROCM_INCLUDE_DIRS}) + add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +endif() + + file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -93,6 +128,8 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) +elseif(USE_ROCM_TOOLKIT) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread deepmd_op_rocm ${TensorFlow_LIBRARY} coverage_config) else() target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) endif() diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 8d965edde9..8c15c1d64b 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -18,6 +18,13 @@ if (USE_CUDA_TOOLKIT) target_link_libraries (${libname} ${CUDA_LIBRARIES} ${EXTRA_LIBS}) endif() +if (USE_ROCM_TOOLKIT) + add_definitions("-D TENSORFLOW_USE_ROCM") + add_subdirectory(src/rocm) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) + target_link_libraries (${libname} ${HIP_LIBRARIES} ${EXTRA_LIBS}) +endif() + if(BUILD_PY_IF) install(TARGETS ${libname} DESTINATION deepmd/op/) endif(BUILD_PY_IF) diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h index d2beb1a6f2..a6beb6a013 100644 --- a/source/lib/include/coord.h +++ 
b/source/lib/include/coord.h @@ -91,4 +91,48 @@ copy_coord_gpu( const deepmd::Region & region); #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +// normalize coords +// output: +// coord +// input: +// natom, box_info: boxt, rec_boxt +template +void +normalize_coord_gpu_rocm( + FPTYPE * coord, + const int natom, + const deepmd::Region & region); + +// copy coordinates +// outputs: +// out_c, out_t, mapping, nall, +// int_data(temp cuda memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map, +// cell_map,cell_shift_map,sec_loc_cellnum_map,sec_total_cellnum_map,loc_clist +// inputs: +// in_c, in_t, nloc, mem_nall, loc_cellnum, total_cellnum, cell_info, box_info +// mem_nall is the size of allocated memory for out_c, out_t, mapping +// returns +// 0: succssful +// 1: the memory is not large enough to hold all copied coords and types. +// i.e. nall > mem_nall +template +int +copy_coord_gpu_rocm( + FPTYPE * out_c, + int * out_t, + int * mapping, + int * nall, + int * int_data, + const FPTYPE * in_c, + const int * in_t, + const int & nloc, + const int & mem_nall, + const int & loc_cellnum, + const int & total_cellnum, + const int * cell_info, + const deepmd::Region & region); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/device.h b/source/lib/include/device.h index f0e36ae73d..7fe6cc127c 100644 --- a/source/lib/include/device.h +++ b/source/lib/include/device.h @@ -10,4 +10,8 @@ typedef unsigned long long uint_64; #if GOOGLE_CUDA #include "gpu_cuda.h" -#endif \ No newline at end of file +#endif + +#if TENSORFLOW_USE_ROCM +#include "gpu_rocm.h" +#endif diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index 1c6f66059c..f893897f6b 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -42,9 +42,37 @@ void test_encoding_decoding_nbor_info_gpu_cuda( const int * in_index, const int size_of_array); #endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template 
+void format_nbor_list_gpu_rocm( + int * nlist, + const FPTYPE * coord, + const int * type, + const deepmd::InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); + +template +void test_encoding_decoding_nbor_info_gpu_rocm( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array); +#endif //TENSORFLOW_USE_ROCM + } + //////////////////////////////////////////////////////// // legacy code //////////////////////////////////////////////////////// diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h index 20f2d96de5..cf82720434 100644 --- a/source/lib/include/gelu.h +++ b/source/lib/include/gelu.h @@ -46,4 +46,27 @@ void gelu_grad_grad_gpu_cuda( const int size); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void gelu_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const int size); + +template +void gelu_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const int size); + +template +void gelu_grad_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const FPTYPE * dy_2, + const int size); + +#endif//TENSORFLOW_USE_ROCM } diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h new file mode 100644 index 0000000000..ee3e88ee9e --- /dev/null +++ b/source/lib/include/gpu_rocm.h @@ -0,0 +1,115 @@ +#pragma once +#include +#include +#include +#include +//#include +//#include + +#define GPU_MAX_NBOR_SIZE 4096 + +#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } +inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) { + if (code != hipSuccess) { + fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#define nborErrcheck(res) {nborAssert((res), __FILE__, 
__LINE__);} +inline void nborAssert(hipError_t code, const char *file, int line, bool abort=true) { + if (code != hipSuccess) { + fprintf(stderr,"hip assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); + if (abort) exit(code); + } +} + +namespace deepmd { +template +void memcpy_host_to_device( + FPTYPE * device, + std::vector &host) +{ + hipErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice)); +} + +template +void memcpy_host_to_device( + FPTYPE * device, + const FPTYPE * host, + const int size) +{ + hipErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); +} + +template +void memcpy_device_to_host( + FPTYPE * device, + std::vector &host) +{ + hipErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); +} +template +void memcpy_device_to_host( + const FPTYPE * device, + FPTYPE * host, + const int size) +{ + hipErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); +} + +template +void malloc_device_memory( + FPTYPE * &device, + std::vector &host) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); +} + +template +void malloc_device_memory( + FPTYPE * &device, + const int size) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); +} + +template +void malloc_device_memory_sync( + FPTYPE * &device, + std::vector &host) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + memcpy_host_to_device(device, host); +} +template +void malloc_device_memory_sync( + FPTYPE * &device, + const FPTYPE * host, + const int size) +{ + hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + memcpy_host_to_device(device, host, size); +} + +template +void delete_device_memory( + FPTYPE * &device) +{ + if (device != NULL) { + hipErrcheck(hipFree(device)); + } +} + +template +void memset_device_memory( + FPTYPE * device, + const FPTYPE var, + const int size) + { + 
hipErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size)); + } +} + + + diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index afd0a674a0..bc717255b9 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -104,6 +104,48 @@ void use_nlist_map( #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +void convert_nlist_gpu_rocm( + InputNlist & gpu_nlist, + InputNlist & cpu_nlist, + int* & gpu_memory, + const int & max_nbor_size); + +void free_nlist_gpu_rocm( + InputNlist & gpu_nlist); + +// build neighbor list. +// outputs +// nlist, max_list_size +// max_list_size is the maximal size of jlist. +// inputs +// c_cpy, nloc, nall, mem_size, rcut, region +// mem_size is the size of allocated memory for jlist. +// returns +// 0: succssful +// 1: the memory is not large enough to hold all neighbors. +// i.e. max_list_size > mem_nall +template +int +build_nlist_gpu_rocm( + InputNlist & nlist, + int * max_list_size, + int * nlist_data, + const FPTYPE * c_cpy, + const int & nloc, + const int & nall, + const int & mem_size, + const float & rcut); + +void use_nlist_map( + int * nlist, + const int * nlist_map, + const int nloc, + const int nnei); + +#endif // TENSORFLOW_USE_ROCM + } // namespace deepmd diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index 0789bf75a6..58f1ae8485 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -91,5 +91,55 @@ void env_mat_nbor_update( const int size); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_env_mat_a_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + unsigned long long * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); 
+ +template +void prod_env_mat_r_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + unsigned long long * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); + +void env_mat_nbor_update( + InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int* &nbor_list_dev, + const int * mesh, + const int size); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h index 1667edb61c..49530a1952 100644 --- a/source/lib/include/prod_force.h +++ b/source/lib/include/prod_force.h @@ -44,4 +44,26 @@ void prod_force_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_force_a_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei); + +template +void prod_force_r_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h index f6ac58269f..ff0bbe8015 100644 --- a/source/lib/include/prod_force_grad.h +++ b/source/lib/include/prod_force_grad.h @@ -40,4 +40,23 @@ void prod_force_grad_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_force_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei); + +template +void prod_force_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const 
int nnei); +#endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h index 6655059e12..229ba968da 100644 --- a/source/lib/include/prod_virial.h +++ b/source/lib/include/prod_virial.h @@ -52,4 +52,31 @@ void prod_virial_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template +void prod_virial_a_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei); + +template +void prod_virial_r_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei); +#endif // TENSORFLOW_USE_ROCM + } //namespace deepmd diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h index 7a8c87c0dd..8ab8171e14 100644 --- a/source/lib/include/prod_virial_grad.h +++ b/source/lib/include/prod_virial_grad.h @@ -44,4 +44,26 @@ void prod_virial_grad_r_gpu_cuda( const int nnei); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void prod_virial_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei); + +template +void prod_virial_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei); +#endif // TENSORFLOW_USE_ROCM + } diff --git a/source/lib/include/region.h b/source/lib/include/region.h index 901299a241..b6428a1bf4 100644 --- a/source/lib/include/region.h +++ b/source/lib/include/region.h @@ -59,6 +59,28 @@ volume_gpu( const Region & region); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +//only for unittest +template +void +convert_to_inter_gpu_rocm( + FPTYPE * 
ri, + const Region & region, + const FPTYPE * rp); + +template +void +convert_to_phys_gpu_rocm( + FPTYPE * rp, + const Region & region, + const FPTYPE * ri); + +template +void +volume_gpu_rocm( + FPTYPE * volume, + const Region & region); +#endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index b684be3c4c..bce0913ebd 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -52,5 +52,30 @@ void tabulate_fusion_grad_gpu_cuda( const int last_layer_size); #endif // GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +void tabulate_fusion_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei, + const int last_layer_size); + +template +void tabulate_fusion_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei, + const int last_layer_size); +#endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc index 89e8552524..a41784bc29 100644 --- a/source/lib/src/neighbor_list.cc +++ b/source/lib/src/neighbor_list.cc @@ -875,3 +875,36 @@ void deepmd::free_nlist_gpu_cuda( delete_device_memory(gpu_nlist.firstneigh); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +void deepmd::convert_nlist_gpu_rocm( + InputNlist & gpu_nlist, + InputNlist & cpu_nlist, + int* & gpu_memory, + const int & max_nbor_size) +{ + const int inum = cpu_nlist.inum; + gpu_nlist.inum = inum; + malloc_device_memory(gpu_nlist.ilist, inum); + malloc_device_memory(gpu_nlist.numneigh, inum); + malloc_device_memory(gpu_nlist.firstneigh, inum); + memcpy_host_to_device(gpu_nlist.ilist, cpu_nlist.ilist, inum); + memcpy_host_to_device(gpu_nlist.numneigh, cpu_nlist.numneigh, inum); + int ** _firstneigh = NULL; + _firstneigh = 
(int**)malloc(sizeof(int*) * inum); + for (int ii = 0; ii < inum; ii++) { + memcpy_host_to_device(gpu_memory + ii * max_nbor_size, cpu_nlist.firstneigh[ii], cpu_nlist.numneigh[ii]); + _firstneigh[ii] = gpu_memory + ii * max_nbor_size; + } + memcpy_host_to_device(gpu_nlist.firstneigh, _firstneigh, inum); + free(_firstneigh); +} + +void deepmd::free_nlist_gpu_rocm( + InputNlist & gpu_nlist) +{ + delete_device_memory(gpu_nlist.ilist); + delete_device_memory(gpu_nlist.numneigh); + delete_device_memory(gpu_nlist.firstneigh); +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index c5e3223e9e..44f6dc516c 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -320,3 +320,68 @@ void deepmd::env_mat_nbor_update( delete [] mesh_host; } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +void deepmd::env_mat_nbor_update( + InputNlist &inlist, + InputNlist &gpu_inlist, + int &max_nbor_size, + int* &nbor_list_dev, + const int * mesh, + const int size) +{ + int *mesh_host = new int[size]; + hipErrcheck(hipMemcpy(mesh_host, mesh, sizeof(int) * size, hipMemcpyDeviceToHost)); + memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); + memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); + memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); + const int ago = mesh_host[0]; + if (ago == 0) { + const int inum = inlist.inum; + if (gpu_inlist.inum < inum) { + delete_device_memory(gpu_inlist.ilist); + delete_device_memory(gpu_inlist.numneigh); + delete_device_memory(gpu_inlist.firstneigh); + malloc_device_memory(gpu_inlist.ilist, inum); + malloc_device_memory(gpu_inlist.numneigh, inum); + malloc_device_memory(gpu_inlist.firstneigh, inum); + } + memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); + memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); + int _max_nbor_size = max_numneigh(inlist); + if (_max_nbor_size <= 1024) { + _max_nbor_size = 1024; + } + else if 
(_max_nbor_size <= 2048) { + _max_nbor_size = 2048; + } + else { + _max_nbor_size = 4096; + } + if ( nbor_list_dev == NULL + || _max_nbor_size > max_nbor_size + || inum > gpu_inlist.inum) + { + delete_device_memory(nbor_list_dev); + malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); + } + // update info + gpu_inlist.inum = inum; + max_nbor_size = _max_nbor_size; + + // copy nbor list from host to the device + std::vector nbor_list_host(inum * max_nbor_size, 0); + int ** _firstneigh = (int**)malloc(sizeof(int*) * inum); + for (int ii = 0; ii < inum; ii++) { + _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; + for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { + nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; + } + } + memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], inum * max_nbor_size); + memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); + free(_firstneigh); + } + delete [] mesh_host; +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt new file mode 100644 index 0000000000..27c4d62f68 --- /dev/null +++ b/source/lib/src/rocm/CMakeLists.txt @@ -0,0 +1,45 @@ +# required cmake version +cmake_minimum_required(VERSION 3.15) +# project name +project(deepmd_op_rocm) +set(CMAKE_LINK_WHAT_YOU_USE TRUE) +#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +find_package(HIP REQUIRED) + +add_compile_definitions(__HIP_PLATFORM_HCC__) +link_directories(${HIP_ROOT_DIR}/lib) +add_definitions("-DTENSORFLOW_USE_ROCM") + +# set c++ version c++11 +#SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_HIP_STANDARD 11) + 
+message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) +message(STATUS "HIP major version is " ${HIP_TOOLKIT_ROOT_DIR}) + + +set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") + +set (SOURCE_FILES + prod_env_mat.hip.cu prod_force.hip.cu prod_virial.hip.cu gelu.hip.cu tabulate.hip.cu coord.hip.cu neighbor_list.hip.cu prod_force_grad.hip.cu prod_virial_grad.hip.cu region.hip.cu +) + +hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) +target_link_libraries(deepmd_op_rocm ${HIP_LIBRARIES}) + +#install(TARGETS deepmd_op_rocm DESTINATION lib/) +if (BUILD_CPP_IF) + install(TARGETS deepmd_op_rocm DESTINATION lib/) +endif (BUILD_CPP_IF) +if (BUILD_PY_IF) + install(TARGETS deepmd_op_rocm DESTINATION deepmd/op/) +endif (BUILD_PY_IF) diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu new file mode 100644 index 0000000000..b5516d3603 --- /dev/null +++ b/source/lib/src/rocm/coord.hip.cu @@ -0,0 +1,414 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "gpu_rocm.h" +#include "coord.h" +#include "region.cuh" + +__device__ inline int collapse_index( + const int * idx, + const int * size) +{ + return (idx[0] * size[1] + idx[1]) * size[2] + idx[2]; +} +__device__ inline void index_recover( + const int in_idx, + const int * size, + int * idx) +{ + idx[2]=in_idx%size[2]; + idx[1]=int(in_idx/size[2])%size[1]; + idx[0]=int(int(in_idx/size[2])/size[1]); +} +__device__ inline void idx_addshift( + int * idx, + const int * shift) +{ + for(int dd=0;dd<3;dd++) + { + idx[dd]+=shift[dd]; + } +} +__device__ inline void idx_unshift( + int * idx, + const int * shift) +{ + for(int dd=0;dd<3;dd++) + { + idx[dd]-=shift[dd]; + } +} +__device__ inline int compute_pbc_shift( + int idx, + int ncell) +{ + int shift = 0; + if (idx < 0) { + shift = 1; + while (idx + 
shift * ncell < 0) shift ++; + } + else if (idx >= ncell) { + shift = -1; + while (idx + shift * ncell >= ncell) shift --; + } + return shift; +} + +template +__global__ void normalize_one( + FPTYPE *out_c, + const FPTYPE *boxt, + const FPTYPE *rec_boxt, + const int nall) +{ + // <<>> + int idy=blockIdx.x*blockDim.x+threadIdx.x; + if (idy>=nall){return;} + FPTYPE inter[3]; + phys2Inter(inter,out_c+idy*3,rec_boxt); + for (int dd = 0; dd < 3; ++dd) { + while(inter[dd] >= 1.) inter[dd] -= 1.; + while(inter[dd] < 0.) inter[dd] += 1.; + } + inter2Phys(out_c+idy*3,inter,boxt); +} + +template +__global__ void _fill_idx_cellmap( + int * idx_cellmap, + int * idx_cellmap_noshift, + const FPTYPE *in_c, + const FPTYPE *rec_boxt, + const int *nat_stt, + const int *nat_end, + const int *ext_stt, + const int *ext_end, + const int nloc) +{ + int idy = blockIdx.x*blockDim.x+threadIdx.x; + int ext_ncell[3]; + int global_grid[3]; + int idx_orig_shift[3]; + FPTYPE cell_size[3]; + FPTYPE nat_orig[3]; + for (int dd = 0; dd < 3; ++dd) + { + ext_ncell[dd] = ext_end[dd] - ext_stt[dd]; + global_grid[dd] = nat_end[dd] - nat_stt[dd]; + idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; + cell_size[dd] = 1./global_grid[dd]; + nat_orig[dd] = nat_stt[dd] * cell_size[dd]; + } + if (idy= nat_end[dd]) + { + idx_noshift[dd] = nat_end[dd] - 1; + } + idx[dd] = idx_noshift[dd]+idx_orig_shift[dd]; + } + idx_cellmap_noshift[idy]=collapse_index(idx_noshift, global_grid); + idx_cellmap[idy]=collapse_index(idx, ext_ncell); + } +} + +__global__ void _fill_loc_cellnum_map( + int * temp_idx_order, + int * loc_cellnum_map, + const int * idx_cellmap_noshift, + const int nloc, + const int loc_cellnum) +{ + int idy = blockIdx.x*blockDim.x+threadIdx.x; + if (idy=nloc){return;} + int cell_idx=idx_cellmap[idy]; + int * clist_row = clist+sec_num_map[cell_idx]; + clist_row[idx_order[idy]]=idy; +} + +template +__global__ void _copy_coord( + FPTYPE * out_c, + int * out_t, + int * mapping, + const FPTYPE * in_c, + const int 
* in_t, + const int * cell_map, + const int * cell_shift_map, + const int * sec_loc_cellnum_map, + const int * sec_total_cellnum_map, + const int * loc_clist, + const int nloc, + const int nall, + const int total_cellnum, + const FPTYPE * boxt, + const FPTYPE * rec_boxt) +{ + int idy = blockIdx.x*blockDim.x+threadIdx.x; + if(idy>=nall){return;} + if(idy=sec_total_cellnum_map[ii+1])cell_idx++; + else break; + } + for(int dd=0;dd<3;dd++) + { + shift[dd]=cell_shift_map[cell_idx*3+dd]; + d_shift[dd]=shift[dd]; + } + atom_idx=idy-sec_total_cellnum_map[cell_idx]; + orig_cell_idx=cell_map[cell_idx]; + orig_idy=loc_clist[sec_loc_cellnum_map[orig_cell_idx]+atom_idx]; + mapping[idy]=orig_idy; + out_t[idy]=in_t[orig_idy]; + FPTYPE shift_v[3]; + inter2Phys(shift_v,d_shift,boxt); + for(int dd=0;dd<3;dd++) + { + out_c[idy*3+dd]=in_c[orig_idy*3+dd]-shift_v[dd]; + } + } +} + +template +void compute_int_data( + int * int_data, + const FPTYPE * in_c, + const int * cell_info, + const deepmd::Region & region, + const int nloc, + const int loc_cellnum, + const int total_cellnum) +{ + int * idx_cellmap=int_data; + int * idx_cellmap_noshift=idx_cellmap+nloc; + int * temp_idx_order=idx_cellmap_noshift+nloc; + int * loc_cellnum_map=temp_idx_order+nloc; + int * total_cellnum_map=loc_cellnum_map+loc_cellnum; + int * mask_cellnum_map=total_cellnum_map+total_cellnum; + int * cell_map=mask_cellnum_map+total_cellnum; + int * cell_shift_map=cell_map+total_cellnum; + const int * nat_stt=cell_info; + const int * nat_end=cell_info+3; + const int * ext_stt=cell_info+6; + const int * ext_end=cell_info+9; + const FPTYPE * rec_boxt = region.rec_boxt; + + const int nblock_loc=(nloc+TPB-1)/TPB; + hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, + nat_stt, nat_end, ext_stt, ext_end, nloc); + + const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB; + hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0, temp_idx_order, 
loc_cellnum_map, + idx_cellmap_noshift, nloc, loc_cellnum); + + const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB; + hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0, total_cellnum_map, mask_cellnum_map, cell_map, + cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); +} + +void build_loc_clist( + int * int_data, + const int nloc, + const int loc_cellnum, + const int total_cellnum) +{ + const int nblock=(nloc+TPB-1)/TPB; + const int * idx_cellmap_noshift=int_data+nloc; + const int * temp_idx_order=idx_cellmap_noshift+nloc; + const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum; + int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1; + hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); +} + +template +void copy_coord( + FPTYPE * out_c, + int * out_t, + int * mapping, + const int * int_data, + const FPTYPE * in_c, + const int * in_t, + const int nloc, + const int nall, + const int loc_cellnum, + const int total_cellnum, + const deepmd::Region & region) +{ + const int nblock=(nall+TPB-1)/TPB; + const int * cell_map=int_data+3*nloc+loc_cellnum+2*total_cellnum; + const int * cell_shift_map=cell_map+total_cellnum; + const int * sec_loc_cellnum_map=cell_shift_map+3*total_cellnum; + const int * sec_total_cellnum_map=sec_loc_cellnum_map+loc_cellnum+1; + const int * loc_clist=sec_total_cellnum_map+total_cellnum+1; + + const FPTYPE *boxt = region.boxt; + const FPTYPE *rec_boxt = region.rec_boxt; + hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, + sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt); +} + +namespace deepmd { +template +void +normalize_coord_gpu_rocm( + FPTYPE * coord, + const int natom, + const Region 
& region) +{ + const FPTYPE * boxt=region.boxt; + const FPTYPE * rec_boxt=region.rec_boxt; + const int nblock=(natom+TPB-1)/TPB; + hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt, natom); +} + +template +int +copy_coord_gpu_rocm( + FPTYPE * out_c, + int * out_t, + int * mapping, + int * nall, + int * int_data, + const FPTYPE * in_c, + const int * in_t, + const int & nloc, + const int & mem_nall, + const int & loc_cellnum, + const int & total_cellnum, + const int * cell_info, + const Region & region) +{ + compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); + int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map + hipErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost)); + int * loc_cellnum_map=int_data_cpu; + int * total_cellnum_map=loc_cellnum_map+loc_cellnum; + int * mask_cellnum_map=total_cellnum_map+total_cellnum; + int * sec_loc_cellnum_map=mask_cellnum_map+total_cellnum; + int * sec_total_cellnum_map=sec_loc_cellnum_map+loc_cellnum+1; + sec_loc_cellnum_map[0]=0; + sec_total_cellnum_map[0]=nloc; + int max_cell=0; + for(int iii=0;iii mem_nall){ + delete[] int_data_cpu; + // size of the output arrays is not large enough + return 1; + } + else{ + hipErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, + sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), hipMemcpyHostToDevice)); + delete[] int_data_cpu; + build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); + copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall, loc_cellnum, total_cellnum, region); + } + return 0; +} + +template void normalize_coord_gpu_rocm(float * coord, const int natom, const Region & region); +template void normalize_coord_gpu_rocm(double * coord, const int natom, const 
Region & region); +template int copy_coord_gpu_rocm(float * out_c, int * out_t, int * mapping, int * nall, int * int_data, const float * in_c, const int * in_t, const int & nloc, const int & mem_nall, const int & loc_cellnum, const int & total_cellnum, const int * cell_info, const Region & region); +template int copy_coord_gpu_rocm(double * out_c, int * out_t, int * mapping, int * nall, int * int_data, const double * in_c, const int * in_t, const int & nloc, const int & mem_nall, const int & loc_cellnum, const int & total_cellnum, const int * cell_info, const Region & region); +} \ No newline at end of file diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu new file mode 100644 index 0000000000..d8f02ae9fc --- /dev/null +++ b/source/lib/src/rocm/gelu.hip.cu @@ -0,0 +1,98 @@ +#include "hip/hip_runtime.h" +#include "gelu.h" +#include "device.h" + +template +__global__ void gelu( + FPTYPE * out, + const FPTYPE * xx, + int const size) +{ + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) { + return; + } + out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); +} + +template +__global__ void gelu_grad( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + int const size) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) { + return; + } + // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); + const FPTYPE var = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); + out[idx] = dy[idx] * (0.5 * SQRT_2_PI * xx[idx] * (1 - var * var) * (0.134145 * xx[idx] * xx[idx] + 1) + 0.5 * var + 0.5); +} + +template +__global__ void gelu_grad_grad( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const FPTYPE * dy_2, + int const size) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) { + return; + } + // out[idx] = xx[idx] * 0.5 * (1.0 + 
tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx]))); + const FPTYPE var1 = tanh(SQRT_2_PI * (xx[idx] + 0.044715 * xx[idx] * xx[idx] *xx[idx])); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * xx[idx] * xx[idx] + 1); + out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2); +} + +namespace deepmd { + template + void gelu_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const int size) + { + const int THREAD_ITEMS = 1024; + const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size); + } + + template + void gelu_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const int size) + { + const int THREAD_ITEMS = 1024; + const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, size); + } + + template + void gelu_grad_grad_gpu_rocm( + FPTYPE * out, + const FPTYPE * xx, + const FPTYPE * dy, + const FPTYPE * dy_2, + const int size) + { + const int THREAD_ITEMS = 1024; + const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, dy_2, size); + } + + template void gelu_gpu_rocm(float * out, const float * x, const int size); + template void gelu_gpu_rocm(double * out, const double * x, const int size); + template void gelu_grad_gpu_rocm(float * out, const float * x, const float * dy, const int size); + template void gelu_grad_gpu_rocm(double * out, const double * x, const double * dy, const int size); + template void gelu_grad_grad_gpu_rocm(float * out, const float * x, const float * dy, const float * dy_2, const int size); + template void gelu_grad_grad_gpu_rocm(double * out, const double * x, const double * dy, const double * dy_2, const int 
size); +} \ No newline at end of file diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu new file mode 100644 index 0000000000..a0da866d12 --- /dev/null +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -0,0 +1,184 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "neighbor_list.h" + +template +__device__ inline FPTYPE dev_dot( + FPTYPE * arr1, + FPTYPE * arr2) +{ + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__global__ void build_nlist( + int * ilist, + int * temp_nlist, + const FPTYPE * c_cpy, + const FPTYPE rcut2, + const int nloc, + const int nall, + const int mem_size) +{ + const unsigned int atom_idx = blockIdx.x; + const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; + if(neighbor_idx=nnei){return;} + int nlist_idx=atom_idx*nnei+nei_idx; + int nlist_item=nlist[nlist_idx]; + if(nlist_item!=-1){ + nlist[nlist_idx]=nlist_map[nlist_item]; + } +} + +namespace deepmd { +template +int build_nlist_gpu_rocm( + InputNlist & nlist, + int * max_list_size, + int * nlist_data, + const FPTYPE * c_cpy, + const int & nloc, + const int & nall, + const int & mem_size, + const float & rcut) +{ + if(mem_size < nall){ + return 1; + } + const int nblock = (nall+TPB-1)/TPB; + int * ilist = nlist.ilist; + int * numneigh = nlist.numneigh; + int ** firstneigh = nlist.firstneigh; + hipErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + int * temp_nlist = nlist_data; //nloc*mem_size + int * nei_order = temp_nlist + nloc * mem_size; + nlist.inum = nloc; + FPTYPE rcut2 = rcut * rcut; + + + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, TPB); + hipLaunchKernelGGL(build_nlist, block_grid, thread_grid, 0, 0, + ilist, + temp_nlist, + c_cpy, + rcut2, + nloc, + nall, + mem_size); + const int nblock_ = (nloc+TPB-1)/TPB; + hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0, + numneigh, + nei_order, + temp_nlist, + mem_size, + nloc, + nall); + 
hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, + firstneigh, + temp_nlist, + nei_order, + mem_size, + nall + ); + int * numneigh_host = new int[nloc]; + hipErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost)); + int max_nei = 0; + for(int ii=0;iimax_nei)max_nei=numneigh_host[ii]; + } + *max_list_size = max_nei; + delete [] numneigh_host; + return 0; +} + +void use_nlist_map( + int * nlist, + const int * nlist_map, + const int nloc, + const int nnei) +{ + int nblock=(nnei+TPB-1)/TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, TPB); + hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map, nloc, nnei); +} + +template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); +template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const double * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); +} \ No newline at end of file diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu new file mode 100644 index 0000000000..61635aaf83 --- /dev/null +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -0,0 +1,579 @@ +#include "hip/hip_runtime.h" +#include "gpu_rocm.h" +#include "fmt_nlist.h" +#include "prod_env_mat.h" +#include "device.h" +#include "prod_env_mat.h" +#include "rocprim/rocprim.hpp" +#include "hipcub/hipcub.hpp" + +// common part of prod_env_mat +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__ (BLOCK_THREADS) +__global__ void BlockSortKernel( + Key * d_in, + Key * d_out) // Tile of output +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) + typedef 
hipcub::BlockLoad BlockLoadT; + // Specialize BlockRadixSort type for our thread block + typedef hipcub::BlockRadixSort BlockRadixSortT; + // Shared memory + __shared__ union TempStorage + { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + // Barrier for smem reuse + __syncthreads(); + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + // Store output in striped fashion + hipcub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); +} + +template +__device__ inline FPTYPE dev_dot( + FPTYPE * arr1, + FPTYPE * arr2) +{ + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__device__ inline void spline5_switch( + FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +template +__device__ inline uint_64 encoding_nbor_info( + const int type, + const FPTYPE dist, + const int index) +{ + // nbor info checking: + // the type of nbor atom must be smaller than 128 + // the distance of center atom between nbor atom must be smaller than 128 + // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24) + if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) { + __builtin_trap(); + } + return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index; +} + +__device__ inline void decoding_nbor_info( + int &type, + int &index, + const uint_64 key) +{ + type = key >> 57; + index = key & 0xFFFFFF; +} + +template +__global__ void get_i_idx( + FPTYPE * i_idx, + const int nloc, + const FPTYPE * ilist) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= nloc) { + return; + } + i_idx[ilist[idx]] = idx; +} + +template +__global__ void format_nlist_fill_a( + uint_64 * key, + const FPTYPE * coord, + const int * type, + const int * numneigh, + int ** firstneigh, + const float rcut, + int * i_idx, + const int MAX_NBOR_SIZE) +{ + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + + const int nsize = numneigh[i_idx[idx]]; + if (idy >= nsize) { + return; + } + + const int * nei_idx = firstneigh[i_idx[idx]]; + // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); + uint_64 * key_in = key + idx * MAX_NBOR_SIZE; + FPTYPE diff[3]; + const int & j_idx = nei_idx[idy]; + for (int dd = 0; dd < 3; dd++) { + diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; + } + FPTYPE rr = sqrt(dev_dot(diff, diff)); + if (rr <= rcut) { + key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); + } +} + +template 
+__global__ void format_nlist_fill_b( + int * nlist, + const int nlist_size, + const int nloc, + FPTYPE * key, + const int * sec, + const int sec_size, + int * nei_iter_dev, + const int max_nbor_size) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= nloc) { + return; + } + + int * row_nlist = nlist + idx * nlist_size; + int * nei_iter = nei_iter_dev + idx * sec_size; + FPTYPE * key_out = key + nloc * max_nbor_size + idx * max_nbor_size; + for (int ii = 0; ii < sec_size; ii++) { + nei_iter[ii] = sec[ii]; + } + + int nei_type = 0, nbor_idx = 0; + for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) { + decoding_nbor_info(nei_type, nbor_idx, key_out[kk]); + if (nei_iter[nei_type] < sec[nei_type + 1]) { + row_nlist[nei_iter[nei_type]++] = nbor_idx; + } + } +} + +template +__global__ void encoding_decoding_nbor_info( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size_of_array) { + return; + } + + key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]); + decoding_nbor_info(out_type[idx], out_index[idx], key[idx]); +} + +template +void format_nbor_list_1024 ( + uint_64 * key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist & gpu_inlist, + const int& nloc, + const float& rcut, + int * i_idx) +{ + const int LEN = 256; + const int MAX_NBOR_SIZE = 1024; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, + key, + coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, 
BLOCK_THREADS, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, + key, + key + nloc * MAX_NBOR_SIZE); +} + +template +void format_nbor_list_2048 ( + uint_64 * key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist & gpu_inlist, + const int& nloc, + const float& rcut, + int * i_idx) +{ + const int LEN = 256; + const int MAX_NBOR_SIZE = 2048; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, + key, + coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, + key, + key + nloc * MAX_NBOR_SIZE); +} + +template +void format_nbor_list_4096 ( + uint_64 * key, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist & gpu_inlist, + const int& nloc, + const float& rcut, + int * i_idx) +{ + const int LEN = 256; + const int MAX_NBOR_SIZE = 4096; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, + key, + coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, + key, + key + nloc * MAX_NBOR_SIZE); +} + +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_env_mat_a( + FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + const 
FPTYPE* coord, + const FPTYPE* avg, + const FPTYPE* std, + const int* type, + const int* nlist, + const int nnei, + const float rmin, + const float rmax) +{ + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + if (tid >= nnei) { + return; + } + const int ndescrpt = nnei * 4; + const int * row_nlist = nlist + bid * nnei; + FPTYPE * row_rij = rij + bid * nnei * 3; + FPTYPE * row_descript = em + bid * nnei * 4; + FPTYPE * row_descript_deriv = em_deriv + bid * nnei * 12; + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + const int idx_value = ii * 4; // 4 components + const int idx_deriv = ii * 12; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE dd[4] = {0}; + FPTYPE vv[12] = {0}; + const int j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd[0] = (1./nr) ;//* sw; + dd[1] = (rr[0] / nr2) ;//* sw; + dd[2] = (rr[1] / nr2) ;//* sw; + dd[3] = (rr[2] / nr2) ;//* sw; + vv[0] = (rr[0] * inr3 * sw - dd[0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + vv[3] = ((2. 
* rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + vv[6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = ((2. * rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + vv[9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10]= ((2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11]= ((2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; + dd[2] *= sw; // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; + dd[3] *= sw; // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / std[type[bid] * ndescrpt + idx_value + ii]; + } + } + else { + // TODO: move it to the memset. 
+ row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; + } + } +} + +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_env_mat_r( + FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + const FPTYPE* coord, + const FPTYPE* avg, + const FPTYPE* std, + const int* type, + const int* nlist, + const int nnei, + const float rmin, + const float rmax) +{ + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + if (tid >= nnei) { + return; + } + const int ndescrpt = nnei; + const int * row_nlist = nlist + bid * nnei; + FPTYPE * row_rij = rij + bid * nnei * 3; + FPTYPE * row_em = em + bid * nnei; + FPTYPE * row_em_deriv = em_deriv + bid * nnei * 3; + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + const int idx_value = ii; // 4 components + const int idx_deriv = ii * 3; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE vv[3] = {0}; + FPTYPE dd = 0; + const int & j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd = (1./nr) ;//* sw; + vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + + // 4 value components + dd *= sw; // * em[idx * 
ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + for (int ii = 0; ii < 3; ii++) { + row_em_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / std[type[bid] * ndescrpt + idx_value]; + } + else { + // TODO: move it to the memset. + row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; + } + } +} + +namespace deepmd { +template +void format_nbor_list_gpu_rocm( + int * nlist, + const FPTYPE * coord, + const int * type, + const deepmd::InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) +{ + const int LEN = 256; + const int nnei = sec.back(); + const int nblock = (nloc + LEN -1) / LEN; + int * sec_dev = array_int; + int * nei_iter = array_int + sec.size(); // = new int[sec_size]; + int * i_idx = array_int + sec.size() + nloc * sec.size(); + uint_64 * key = array_longlong; + assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); + hipErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei)); + hipErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + hipErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, + i_idx, + nloc, gpu_inlist.ilist); + + if (max_nbor_size == 1024) { + format_nbor_list_1024 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + else if (max_nbor_size == 2048) { + format_nbor_list_2048 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + else if (max_nbor_size == 4096) { + format_nbor_list_4096 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + + hipLaunchKernelGGL(format_nlist_fill_b, nblock, LEN, 0, 0, + nlist, + nnei, nloc, key, sec_dev, 
sec.size(), nei_iter, max_nbor_size); +} + +template +void prod_env_mat_a_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) +{ + const int nnei = sec.back(); + const int ndescrpt = nnei * 4; + hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + + format_nbor_list_gpu_rocm( + nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + nborErrcheck(hipGetLastError()); + nborErrcheck(hipDeviceSynchronize()); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a), nloc, TPB, 0, 0, + em, em_deriv, rij, + coord, avg, std, type, nlist, nnei, rcut_smth, rcut); +} + +template +void prod_env_mat_r_gpu_rocm( + FPTYPE * em, + FPTYPE * em_deriv, + FPTYPE * rij, + int * nlist, + const FPTYPE * coord, + const int * type, + const InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const FPTYPE * avg, + const FPTYPE * std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) +{ + const int nnei = sec.back(); + const int ndescrpt = nnei * 1; + hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + + format_nbor_list_gpu_rocm( + nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + nborErrcheck(hipGetLastError()); + nborErrcheck(hipDeviceSynchronize()); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r), nloc, TPB, 0, 0, + em, em_deriv, rij, + coord, avg, std, type, nlist, nnei, 
rcut_smth, rcut); +} + +template +void test_encoding_decoding_nbor_info_gpu_rocm( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array) +{ + const int nblock = (size_of_array + TPB - 1) / TPB; + hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, + key, out_type, out_index, + in_type, in_dist, in_index, size_of_array); +} + +template void prod_env_mat_a_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_a_gpu_rocm(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_r_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_r_gpu_rocm(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void 
format_nbor_list_gpu_rocm(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); +template void format_nbor_list_gpu_rocm(int * nlist, const double * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); +template void test_encoding_decoding_nbor_info_gpu_rocm(uint_64 * key, int * out_type, int * out_index, const int * in_type, const float * in_dist, const int * in_index, const int size_of_array); +template void test_encoding_decoding_nbor_info_gpu_rocm(uint_64 * key, int * out_type, int * out_index, const int * in_type, const double * in_dist, const int * in_index, const int size_of_array); +} diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu new file mode 100644 index 0000000000..3c56b8155a --- /dev/null +++ b/source/lib/src/rocm/prod_force.hip.cu @@ -0,0 +1,161 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_force.h" +#include "rocprim/rocprim.hpp" + +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void force_deriv_wrt_center_atom( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int ndescrpt) +{ + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += net_deriv[bid * ndescrpt + ii] * in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) 
{ + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } +} + +template +__global__ void force_deriv_wrt_neighbors_a( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + const int ndescrpt = nnei * 4; + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd( + force + j_idx * 3 + idz, + net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); +} + +template +__global__ void force_deriv_wrt_neighbors_r( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + if (idx >= nloc) { + return; + } + // deriv wrt neighbors + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + atomicAdd( + force + j_idx * 3 + idz, + net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); +} + +namespace deepmd { + template + void prod_force_a_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei) + { + const int ndescrpt = nnei * 4; 
+ hipErrcheck(hipMemset( + force, + 0.0, sizeof(FPTYPE) * nall * 3)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, + force, + net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3, 4); + hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, + force, + net_deriv, in_deriv, nlist, nloc, nnei); + } + + template + void prod_force_r_gpu_rocm( + FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const int * nlist, + const int nloc, + const int nall, + const int nnei) + { + const int ndescrpt = nnei * 1; + hipErrcheck(hipMemset( + force, + 0.0, sizeof(FPTYPE) * nall * 3)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, + force, + net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3); + hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, + force, + net_deriv, in_deriv, nlist, nloc, nnei); + } + + template void prod_force_a_gpu_rocm(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + template void prod_force_a_gpu_rocm(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + template void prod_force_r_gpu_rocm(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + template void prod_force_r_gpu_rocm(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); + +} diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu new file mode 100644 index 
0000000000..3a3fd8ea58 --- /dev/null +++ b/source/lib/src/rocm/prod_force_grad.hip.cu @@ -0,0 +1,143 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_force_grad.h" + +template +__device__ inline FPTYPE dev_dot( + const FPTYPE * arr1, + const FPTYPE * arr2) +{ + return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; +} + +template +__global__ void force_grad_wrt_center_atom( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int ndescrpt) +{ + __shared__ FPTYPE grad_one[3]; + unsigned int center_idx = blockIdx.x; + unsigned int tid = threadIdx.x; + if(tid < 3){ + grad_one[tid] = grad[center_idx * 3 + tid]; + } + __syncthreads(); + unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid; + if(descrpt_idx < ndescrpt){ + grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot(grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3); + } +} + +template +__global__ void force_grad_wrt_neighbors_a( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + if (j_idx >= nloc) j_idx = j_idx % nloc; + grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); +} + +template +__global__ void force_grad_wrt_neighbors_r( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + if (j_idx >= nloc) j_idx 
= j_idx % nloc; + grad_net[idx * nnei + idy] += dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3); +} + +namespace deepmd { +template +void prod_force_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei * 4; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int nblock = (ndescrpt + TPB - 1) / TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(TPB, 1); + hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, ndescrpt); + + const int LEN = 128; + const int nblock_ = (nloc + LEN -1) / LEN; + dim3 block_grid_(nblock_, nnei); + dim3 thread_grid_(LEN, 4); + hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, 0, + grad_net, + grad, env_deriv, nlist, nloc, nnei); +} + +template +void prod_force_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei * 1; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int nblock = (ndescrpt + TPB - 1) / TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(TPB, 1); + hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, ndescrpt); + + const int LEN = 128; + const int nblock_ = (nloc + LEN -1) / LEN; + dim3 block_grid_(nblock_, nnei); + dim3 thread_grid_(LEN, 1); + hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, 0, + grad_net, + grad, env_deriv, nlist, nloc, nnei); +} + +template void prod_force_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); +template void prod_force_grad_a_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const int 
* nlist, const int nloc, const int nnei); +template void prod_force_grad_r_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); +template void prod_force_grad_r_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const int * nlist, const int nloc, const int nnei); +} \ No newline at end of file diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu new file mode 100644 index 0000000000..a285a1789b --- /dev/null +++ b/source/lib/src/rocm/prod_virial.hip.cu @@ -0,0 +1,176 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_virial.h" +#include "rocprim/rocprim.hpp" + +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void atom_virial_reduction( + FPTYPE * virial, + const FPTYPE * atom_virial, + const int nall) +{ + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK]; + data[tid] = 0.f; + for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { + data[tid] += atom_virial[ii * 9 + bid]; + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) virial[bid] = data[0]; +} + +template +__global__ void virial_deriv_wrt_neighbors_a( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const unsigned int idw = threadIdx.z; + const int ndescrpt = nnei * 4; + if (idx >= nloc) { + return; + } + int 
j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd( + atom_virial + j_idx * 9 + idz, + net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); +} + +template +__global__ void virial_deriv_wrt_neighbors_r( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idy = blockIdx.y; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd( + atom_virial + j_idx * 9 + idz, + net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); +} + +namespace deepmd { +template +void prod_virial_a_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei) +{ + hipErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + hipErrcheck(hipMemset( + atom_virial, + 0.0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9, 4); + // compute virial of a 
frame + hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nnei); + // reduction atom_virial to virial + hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, + virial, + atom_virial, nall); +} + +template +void prod_virial_r_gpu_rocm( + FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei) +{ + hipErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + hipErrcheck(hipMemset( + atom_virial, + 0.0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nnei); + // reduction atom_virial to virial + hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, + virial, + atom_virial, nall); +} + +template void prod_virial_a_gpu_rocm(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); +template void prod_virial_a_gpu_rocm(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei); +template void prod_virial_r_gpu_rocm(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); +template void prod_virial_r_gpu_rocm(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei); +} 
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu new file mode 100644 index 0000000000..c4d8a5c19a --- /dev/null +++ b/source/lib/src/rocm/prod_virial_grad.hip.cu @@ -0,0 +1,141 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "prod_virial_grad.h" + +template +__device__ inline FPTYPE dev_dot9( + const FPTYPE * arr1, + const FPTYPE * arr2) +{ + FPTYPE result = 0.0; + for(int ii=0; ii<9; ii++){ + result += arr1[ii] * arr2[ii]; + } + return result; +} + +template +__global__ void virial_grad_wrt_neighbors_a( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int tid = threadIdx.x; + const unsigned int idx = blockIdx.x * blockDim.x + tid; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + const int ndescrpt = nnei * 4; + __shared__ FPTYPE grad_one[9]; + if(tid < 9){ + grad_one[tid] = grad[tid]; + } + __syncthreads(); + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE tmp[9]; + for (int dd0 = 0; dd0 < 3; ++dd0){ + for (int dd1 = 0; dd1 < 3; ++dd1){ + tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; + } + } + grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp); +} + +template +__global__ void virial_grad_wrt_neighbors_r( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + // idy -> nnei + const unsigned int tid = threadIdx.x; + const unsigned int idx = blockIdx.x * blockDim.x + tid; + const unsigned int idy = blockIdx.y; + const int ndescrpt = nnei; + __shared__ FPTYPE grad_one[9]; + if(tid < 9){ + grad_one[tid] = grad[tid]; + } + __syncthreads(); + if (idx >= nloc) { + return; + 
} + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE tmp[9]; + for (int dd0 = 0; dd0 < 3; ++dd0){ + for (int dd1 = 0; dd1 < 3; ++dd1){ + tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0]; + } + } + grad_net[idx * ndescrpt + idy] -= -1.0 * dev_dot9(grad_one, tmp); +} + +namespace deepmd { +template +void prod_virial_grad_a_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei * 4; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int LEN = 128; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 4); + hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, rij, nlist, nloc, nnei); +} + +template +void prod_virial_grad_r_gpu_rocm( + FPTYPE * grad_net, + const FPTYPE * grad, + const FPTYPE * env_deriv, + const FPTYPE * rij, + const int * nlist, + const int nloc, + const int nnei) +{ + const int ndescrpt = nnei; + hipErrcheck(hipMemset( + grad_net, + 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int LEN = 128; + const int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 1); + hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0, + grad_net, + grad, env_deriv, rij, nlist, nloc, nnei); +} + +template void prod_virial_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei); +template void prod_virial_grad_a_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const double * rij, const int * nlist, const int nloc, const int nnei); +template void prod_virial_grad_r_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, 
const float * rij, const int * nlist, const int nloc, const int nnei); +template void prod_virial_grad_r_gpu_rocm(double * grad_net, const double * grad, const double * env_deriv, const double * rij, const int * nlist, const int nloc, const int nnei); +} \ No newline at end of file diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu new file mode 100644 index 0000000000..7f883b14c3 --- /dev/null +++ b/source/lib/src/rocm/region.hip.cu @@ -0,0 +1,69 @@ +#include "hip/hip_runtime.h" +#include "device.h" +#include "region.h" +#include "region.cuh" + +template +__global__ void _phys2Inter( + FPTYPE *inter, + const FPTYPE *phys, + const FPTYPE *rec_boxt) +{ + phys2Inter(inter, phys, rec_boxt); +} + +template +__global__ void _inter2Phys( + FPTYPE *phys, + const FPTYPE *inter, + const FPTYPE *boxt) +{ + inter2Phys(phys, inter, boxt); +} + +template +__global__ void _compute_volume( + FPTYPE * volume, + const FPTYPE * boxt) +{ + volume[0] = compute_volume(boxt); +} + +namespace deepmd { +//only for unittest +template +void +convert_to_inter_gpu_rocm( + FPTYPE * ri, + const Region & region, + const FPTYPE * rp) +{ + hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt); +} + +template +void +convert_to_phys_gpu_rocm( + FPTYPE * rp, + const Region & region, + const FPTYPE * ri) +{ + hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt); +} + +template +void +volume_gpu_rocm( + FPTYPE * volume, + const Region & region) +{ + hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt); +} + +template void convert_to_inter_gpu_rocm(float * ri, const Region & region, const float * rp); +template void convert_to_inter_gpu_rocm(double * ri, const Region & region, const double * rp); +template void convert_to_phys_gpu_rocm(float * rp, const Region & region, const float * ri); +template void convert_to_phys_gpu_rocm(double * rp, const Region & region, const double * ri); +template void volume_gpu_rocm(float * volume, 
const Region & region); +template void volume_gpu_rocm(double * volume, const Region & region); +} \ No newline at end of file diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu new file mode 100644 index 0000000000..80aec8af8f --- /dev/null +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -0,0 +1,246 @@ +#include "hip/hip_runtime.h" +#include "tabulate.h" +#include +#include +#include "device.h" + +#define MM 4 +#define KK 4 +#define TPB 256 +#define WARP_SIZE 64 +#define FULL_MASK 0xffffffff +#include "gpu_rocm.h" + +template +__forceinline__ __device__ +void locate_xx( + FPTYPE& xx, + int& table_idx, + const FPTYPE& lower, + const FPTYPE& upper, + const FPTYPE& max, + const FPTYPE& stride0, + const FPTYPE& stride1) +{ + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +__forceinline__ __device__ +FPTYPE dot( + FPTYPE ll[4], + FPTYPE rr[4]) +{ + return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3]; +} + +template +__forceinline__ +__device__ +void warp_reduce( + FPTYPE & val) +{ + for (int offset = 32; offset > 0; offset >>= 1) + val += __shfl_down( val, offset);//########???? 
+} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_fifth_order_polynomial( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int last_layer_size) +{ + HIP_DYNAMIC_SHARED( int, _data) + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = em_x[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * em[block_idx * nnei * MTILE + ii * MTILE + kk] * res; + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + out[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void 
tabulate_fusion_grad_fifth_order_polynomial( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int last_layer_size) +{ + HIP_DYNAMIC_SHARED( int, _data) + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl(threadIdx.x / 64, 0); + int lane_idx = threadIdx.x % 64; + int breakpoint = nnei - 1; + bool unloop = false; + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = 0; ii < MTILE; ii++) { + if (thread_idx < last_layer_size) { + iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + } + } + __syncthreads(); + FPTYPE ago = __shfl( em_x[block_idx * nnei + nnei - 1], 0); + for (int ii = 0; ii < nnei; ii += KTILE) { + FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx]; + if (ago == xx) { + unloop = true; + breakpoint = ii + warp_idx; + } + + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + FPTYPE sum[KTILE] = {0.f}; + FPTYPE Csub = 0.f; + for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + FPTYPE var[6]; + // load iteratorB through table + var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; + var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; + var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; + var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; + var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; + var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < KTILE; kk++) { + sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res; + } 
+ res = em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] * iteratorA[0 * last_layer_size + jj]; + res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; + res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; + res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; + Csub += (nnei - breakpoint) * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * res; + } + //__syncwarp();->syncwrap + __syncthreads(); + for (int kk = 0; kk < KTILE; kk++) { + warp_reduce(sum[kk]); + } + warp_reduce(Csub); + if (lane_idx == 0) { + for (int kk = 0; kk < KTILE; kk++) { + dy_dem[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; + } + dy_dem_x[block_idx * nnei + ii + warp_idx] = Csub; + } + if (unloop) break; + } +} + +namespace deepmd { + +template + void tabulate_fusion_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei, + const int last_layer_size) + { + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, + out, + table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + } + + template + void tabulate_fusion_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei, + const int last_layer_size) + { + hipErrcheck(hipMemset( + dy_dem_x, + 0.0, sizeof(FPTYPE) * nloc * nnei)); + hipErrcheck(hipMemset( + dy_dem, + 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 
0, + dy_dem_x, dy_dem, + table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + } + + template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); + template void tabulate_fusion_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); + template void tabulate_fusion_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); + template void tabulate_fusion_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); + } diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 03242ba8dd..d6908a18ce 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -3,6 +3,8 @@ project(libdeepmd_test) enable_testing() +set(CMAKE_LINK_WHAT_YOU_USE TRUE) + set(libname "deepmd") set(LIB_BASE_DIR ${CMAKE_SOURCE_DIR}/../) @@ -40,6 +42,45 @@ if (USE_CUDA_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) endif() +#define USE_ROCM_TOOLKIT +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +if (DEFINED USE_ROCM_TOOLKIT) + if(USE_ROCM_TOOLKIT) + link_directories(${HIP_ROOT_DIR}/lib) + add_definitions("-DTENSORFLOW_USE_ROCM") + 
find_package(HIP REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) + else() + message(STATUS "Will not build rocm GPU support") + endif() +else() + find_package(HIP REQUIRED) + if (HIP_FOUND) + link_directories(${HIP_ROOT_DIR}/lib) + add_definitions("-DTENSORFLOW_USE_ROCM") + set(USE_ROCM_TOOLKIT TRUE) + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build ROCM GPU support") + else() + set(USE_ROCM_TOOLKIT FALSE) + message(STATUS "No ROCM support found, will not build ROCM GPU support") + endif() +endif() +if (USE_ROCM_TOOLKIT) + add_definitions("-DUSE_ROCM_TOOLKIT") + include_directories(${ROCM_INCLUDE_DIRS}) + add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +endif() + + file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -65,6 +106,11 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) +elseif (USE_ROCM_TOOLKIT) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) + message(STATUS "EXTRA_LIBS: ${EXTRA_LIBS}") + message(STATUS "HIP: ${HIP}") + target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc ${EXTRA_LIBS} coverage_config) +else() target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) endif() diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc index 8dee80745e..54c58c8a11 100644 --- a/source/lib/tests/test_coord.cc +++ b/source/lib/tests/test_coord.cc @@ -159,6 +159,96 @@ TEST_F(TestNormCoord, gpu_case2) #endif //GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +TEST_F(TestNormCoord, gpu_case0) +{ + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + double * 
box_info_dev=NULL; + double * out_c_dev=NULL; + std::vector out_c(r0); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(out_c_dev, out_c); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(out_c_dev); + for(int ii = 0; ii < posi.size(); ++ii){ + EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12); + } +} + +TEST_F(TestNormCoord, gpu_case1) +{ + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + double * box_info_dev=NULL; + double * out_c_dev=NULL; + std::vector out_c(r1); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(out_c_dev, out_c); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(out_c_dev); + for(int ii = 0; ii < posi.size(); ++ii){ + EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12); + } +} + +TEST_F(TestNormCoord, gpu_case2) +{ + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], 
region.rec_boxt, sizeof(double)*9); + double * box_info_dev=NULL; + double * out_c_dev=NULL; + std::vector out_c(r2); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(out_c_dev, out_c); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(out_c_dev); + for(int ii = 0; ii < posi.size(); ++ii){ + EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12); + } +} + +#endif //TENSORFLOW_USE_ROCM + typedef std::pair,std::vector> atom; static void @@ -474,6 +564,158 @@ TEST_F(TestCopyCoord, gpu_lessmem) } #endif //GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +TEST_F(TestCopyCoord, gpu) +{ + int mem_size = 1000; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + 
deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 0); + EXPECT_EQ(nall, expected_nall); + out_c.resize(nall*3); + out_t.resize(nall); + mapping.resize(nall); + + std::vector out_c_1(mem_size * 3); + std::vector out_t_1(mem_size); + std::vector mapping_1(mem_size); + sort_atoms(out_c_1, out_t_1, mapping_1, out_c, out_t, mapping, nloc, nall); + for(int ii = 0; ii < expected_nall; ++ii){ + for(int dd = 0; dd < 3; ++dd){ + EXPECT_LT(fabs(out_c_1[ii*3+dd] - expected_posi_cpy[ii*3+dd]), 1e-12); + } + EXPECT_EQ(out_t_1[ii], expected_atype_cpy[ii]); + EXPECT_EQ(mapping_1[ii], expected_mapping[ii]); + } +} + +TEST_F(TestCopyCoord, gpu_lessmem) +{ + int mem_size = 40; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + 
deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + 
deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 1); + // EXPECT_EQ(nall, expected_nall); + // std::cout << "---------------------" + // << nloc << " " + // << nall << std::endl; +} +#endif //TENSORFLOW_USE_ROCM + class TestCopyCoordMoreCell : public ::testing::Test { protected: @@ -745,4 +987,156 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) // << nloc << " " // << nall << std::endl; } -#endif //GOOGLE_CUDA \ No newline at end of file +#endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestCopyCoordMoreCell, gpu) +{ + int mem_size = 1000; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = 
deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 0); + EXPECT_EQ(nall, expected_nall); + out_c.resize(nall*3); + out_t.resize(nall); + mapping.resize(nall); + + std::vector out_c_1(mem_size * 3); + std::vector out_t_1(mem_size); + std::vector mapping_1(mem_size); + sort_atoms(out_c_1, out_t_1, mapping_1, out_c, out_t, mapping, nloc, nall); + for(int ii = 0; ii < expected_nall; ++ii){ + for(int dd = 0; dd < 3; ++dd){ + EXPECT_LT(fabs(out_c_1[ii*3+dd] - expected_posi_cpy[ii*3+dd]), 1e-12); + } + EXPECT_EQ(out_t_1[ii], expected_atype_cpy[ii]); + EXPECT_EQ(mapping_1[ii], expected_mapping[ii]); + } +} + +TEST_F(TestCopyCoordMoreCell, gpu_lessmem) +{ + int mem_size = 40; + std::vector out_c(mem_size * 3); + std::vector out_t(mem_size); + std::vector mapping(mem_size); + int nall; + std::vector cell_info; + cell_info.resize(23); + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + init_region_cpu(region, &boxt[0]); + deepmd::compute_cell_info(&cell_info[0], rc, region); + std::vector box_info; + box_info.resize(18); + memcpy(&box_info[0], &boxt[0], sizeof(double)*9); + memcpy(&box_info[9], region.rec_boxt, sizeof(double)*9); + const int 
loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + int * cell_info_dev=NULL; + double * box_info_dev=NULL; + double * out_c_dev=NULL, * in_c_dev=NULL; + int * out_t_dev=NULL, * in_t_dev=NULL, * mapping_dev=NULL, * int_data_dev=NULL; + deepmd::malloc_device_memory_sync(cell_info_dev, cell_info); + deepmd::malloc_device_memory_sync(box_info_dev, box_info); + deepmd::malloc_device_memory_sync(in_c_dev, posi); + deepmd::malloc_device_memory_sync(in_t_dev, atype); + deepmd::malloc_device_memory(out_c_dev, mem_size * 3); + deepmd::malloc_device_memory(out_t_dev, mem_size); + deepmd::malloc_device_memory(mapping_dev, mem_size); + deepmd::malloc_device_memory(int_data_dev, nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + int ret = deepmd::copy_coord_gpu_rocm( + out_c_dev, + out_t_dev, + mapping_dev, + &nall, + int_data_dev, + in_c_dev, + in_t_dev, + nloc, + mem_size, + loc_cellnum, + total_cellnum, + cell_info_dev, + region_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + deepmd::memcpy_device_to_host(out_c_dev, out_c); + deepmd::memcpy_device_to_host(out_t_dev, out_t); + deepmd::memcpy_device_to_host(mapping_dev, mapping); + deepmd::delete_device_memory(cell_info_dev); + deepmd::delete_device_memory(box_info_dev); + deepmd::delete_device_memory(in_c_dev); + deepmd::delete_device_memory(in_t_dev); + deepmd::delete_device_memory(out_c_dev); + deepmd::delete_device_memory(out_t_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(int_data_dev); + EXPECT_EQ(ret, 1); + // EXPECT_EQ(nall, expected_nall); + // std::cout << "---------------------" + // << nloc << " " + // << nall << std::endl; +} +#endif //TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index f9e262cb2c..cfb70acfe9 100644 --- 
a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -723,3 +723,216 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) } } #endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestEnvMatA, prod_gpu_rocm) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * 
GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + } + } +} + + +TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * 
nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + 
deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_a_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 4); + EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*4+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*4*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + } + } +} +#endif //TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index c7b0db5e1d..1b232e28b5 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -555,3 +555,206 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) } } #endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestEnvMatR, prod_gpu_rocm) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 
0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_r_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + 
gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 1; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*1 + jj*1 + dd] - + expected_env[ii*nnei*1 + jj*1 + dd]) , + 1e-5); + } + } + } +} + +TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * atype_cpy_dev = 
NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_r_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_cpy_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, 
posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_r_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 1); + EXPECT_EQ(env_deriv_1.size(), nnei * 1 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*1+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*1*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } +} +#endif //TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index df2a68534f..d2de6e8855 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -525,4 +525,157 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) // deepmd::delete_device_memory(out_index_dev); // deepmd::delete_device_memory(key_dev); // } -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestFormatNlist, gpu_rocm) +{ + std::vector> nlist_a_0, nlist_r_0; + build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + // make a input nlist + int inum = nlist_a_0.size(); + std::vector ilist(inum); + std::vector numneigh(inum); + std::vector firstneigh(inum); + deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(in_nlist, 
nlist_a_0); + // allocate the mem for the result + std::vector nlist(inum * sec_a.back()); + EXPECT_EQ(nlist.size(), expect_nlist_cpy.size()); + + double * posi_cpy_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + for (int ii = 0; ii < inum; ii++) { + max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii]; + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + // format nlist + format_nbor_list_gpu_rocm( + nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + // validate + for(int ii = 0; ii < nlist.size(); ++ii){ + EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); + } +} + +TEST_F(TestFormatNlistShortSel, gpu_rocm) +{ + std::vector> nlist_a_0, nlist_r_0; + build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + // 
make a input nlist + int inum = nlist_a_0.size(); + std::vector ilist(inum); + std::vector numneigh(inum); + std::vector firstneigh(inum); + deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(in_nlist, nlist_a_0); + // mem + std::vector nlist(inum * sec_a.back()); + EXPECT_EQ(nlist.size(), expect_nlist_cpy.size()); + // format nlist + double * posi_cpy_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + for (int ii = 0; ii < inum; ii++) { + max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii]; + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + // format nlist + format_nbor_list_gpu_rocm( + nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_rocm(gpu_inlist); + // validate + for(int ii = 0; ii < nlist.size(); ++ii){ + 
EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); + } +} + +TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_rocm) +{ + int * valid_type_dev = NULL, * valid_index_dev = NULL, * out_type_dev = NULL, * out_index_dev = NULL; + double * valid_dist_dev = NULL; + uint_64 * key_dev = NULL; + std::vector out_type(size_of_array, 0); + std::vector out_index(size_of_array, 0); + std::vector key(size_of_array, 0); + deepmd::malloc_device_memory_sync(valid_type_dev, valid_type); + deepmd::malloc_device_memory_sync(valid_dist_dev, valid_dist); + deepmd::malloc_device_memory_sync(valid_index_dev, valid_index); + deepmd::malloc_device_memory_sync(out_type_dev, out_type); + deepmd::malloc_device_memory_sync(out_index_dev, out_index); + deepmd::malloc_device_memory_sync(key_dev, key); + + deepmd::test_encoding_decoding_nbor_info_gpu_rocm( + key_dev, out_type_dev, out_index_dev, + valid_type_dev, valid_dist_dev, valid_index_dev, size_of_array + ); + + deepmd::memcpy_device_to_host(key_dev, key); + deepmd::memcpy_device_to_host(out_type_dev, out_type); + deepmd::memcpy_device_to_host(out_index_dev, out_index); + deepmd::delete_device_memory(valid_type_dev); + deepmd::delete_device_memory(valid_dist_dev); + deepmd::delete_device_memory(valid_index_dev); + deepmd::delete_device_memory(out_type_dev); + deepmd::delete_device_memory(out_index_dev); + deepmd::delete_device_memory(key_dev); + // validate + for(int ii = 0; ii < size_of_array; ii++) { + EXPECT_EQ(key[ii], expect_key[ii]); + EXPECT_EQ(out_type[ii], expect_type[ii]); + EXPECT_EQ(out_index[ii], expect_index[ii]); + } +} + + +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc index 4d85b2dd27..46539f7334 100644 --- a/source/lib/tests/test_gelu.cc +++ b/source/lib/tests/test_gelu.cc @@ -212,3 +212,73 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) } } #endif // GOOGLE_CUDA + + + +#if TENSORFLOW_USE_ROCM +TEST_F(TestGelu, gelu_gpu_rocm) +{ + std::vector 
gelu(nloc, 0.0); + + double * gelu_dev = NULL, * xx_dev = NULL; + deepmd::malloc_device_memory_sync(gelu_dev, gelu); + deepmd::malloc_device_memory_sync(xx_dev, xx); + deepmd::gelu_gpu_rocm (gelu_dev, xx_dev, nloc); + deepmd::memcpy_device_to_host(gelu_dev, gelu); + deepmd::delete_device_memory(gelu_dev); + deepmd::delete_device_memory(xx_dev); + + EXPECT_EQ(gelu.size(), nloc); + EXPECT_EQ(gelu.size(), expected_gelu.size()); + for (int jj = 0; jj < gelu.size(); ++jj){ + EXPECT_LT(fabs(gelu[jj] - expected_gelu[jj]) , 1e-5); + } +} + +TEST_F(TestGelu, gelu_grad_gpu_rocm) +{ + std::vector dy(100, 1.0); + std::vector gelu_grad(nloc, 0.0); + + double * gelu_grad_dev = NULL, * xx_dev = NULL, * dy_dev = NULL; + deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad); + deepmd::malloc_device_memory_sync(xx_dev, xx); + deepmd::malloc_device_memory_sync(dy_dev, dy); + deepmd::gelu_grad_gpu_rocm (gelu_grad_dev, xx_dev, dy_dev, nloc); + deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad); + deepmd::delete_device_memory(gelu_grad_dev); + deepmd::delete_device_memory(xx_dev); + deepmd::delete_device_memory(dy_dev); + + EXPECT_EQ(gelu_grad.size(), nloc); + EXPECT_EQ(gelu_grad.size(), expected_gelu_grad.size()); + for (int jj = 0; jj < gelu_grad.size(); ++jj){ + EXPECT_LT(fabs(gelu_grad[jj] - expected_gelu_grad[jj]) , 1e-5); + } +} + +TEST_F(TestGelu, gelu_grad_grad_gpu_rocm) +{ + std::vector dy(100, 1.0); + std::vector dy_2(100, 1.0); + std::vector gelu_grad_grad(nloc, 0.0); + + double * gelu_grad_grad_dev = NULL, * xx_dev = NULL, * dy_dev = NULL, * dy_2_dev = NULL; + deepmd::malloc_device_memory_sync(gelu_grad_grad_dev, gelu_grad_grad); + deepmd::malloc_device_memory_sync(xx_dev, xx); + deepmd::malloc_device_memory_sync(dy_dev, dy); + deepmd::malloc_device_memory_sync(dy_2_dev, dy_2); + deepmd::gelu_grad_grad_gpu_rocm (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc); + deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad); + 
deepmd::delete_device_memory(gelu_grad_grad_dev); + deepmd::delete_device_memory(xx_dev); + deepmd::delete_device_memory(dy_dev); + deepmd::delete_device_memory(dy_2_dev); + + EXPECT_EQ(gelu_grad_grad.size(), nloc); + EXPECT_EQ(gelu_grad_grad.size(), expected_gelu_grad_grad.size()); + for (int jj = 0; jj < gelu_grad_grad.size(); ++jj){ + EXPECT_LT(fabs(gelu_grad_grad[jj] - expected_gelu_grad_grad[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc index 5cf74f7382..193279070f 100644 --- a/source/lib/tests/test_neighbor_list.cc +++ b/source/lib/tests/test_neighbor_list.cc @@ -225,4 +225,115 @@ TEST_F(TestNeighborList, gpu_lessmem) deepmd::delete_device_memory(c_cpy_dev); } -#endif //GOOGLE_CUDA \ No newline at end of file +#endif //GOOGLE_CUDA + + +#if TENSORFLOW_USE_ROCM +TEST_F(TestNeighborList, gpu) +{ + int mem_size = 48; + + int * nlist_data_dev=NULL, * jlist_dev=NULL, * ilist_dev=NULL, * numneigh_dev=NULL; + int ** firstneigh_dev=NULL; + std::vector temp_firstneigh(nloc); + double * c_cpy_dev=NULL; + + deepmd::malloc_device_memory(nlist_data_dev, 2 * nloc * mem_size); + deepmd::malloc_device_memory(jlist_dev, nloc * mem_size); + deepmd::malloc_device_memory(ilist_dev, nloc); + deepmd::malloc_device_memory(numneigh_dev, nloc); + for(int ii = 0; ii < nloc; ++ii){ + temp_firstneigh[ii] = jlist_dev + ii * mem_size; + } + deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh); + deepmd::malloc_device_memory_sync(c_cpy_dev, posi_cpy); + deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev); + + int max_list_size; + int ret = deepmd::build_nlist_gpu_rocm( + nlist_dev, + &max_list_size, + nlist_data_dev, + c_cpy_dev, + nloc, + nall, + mem_size, + rc); + + EXPECT_EQ(ret, 0); + int * ilist = new int[nloc]; + int * numneigh = new int[nloc]; + int ** firstneigh = new int*[nloc]; + int * jlist = new int[nloc * mem_size]; 
+ deepmd::memcpy_device_to_host(jlist_dev, jlist, nloc * mem_size); + deepmd::memcpy_device_to_host(ilist_dev, ilist, nloc); + deepmd::memcpy_device_to_host(numneigh_dev, numneigh, nloc); + for(int ii = 0; ii < nloc; ++ii){ + firstneigh[ii] = jlist + ii * mem_size; + } + + deepmd::InputNlist nlist(nlist_dev.inum, ilist, numneigh, firstneigh); + EXPECT_EQ(nlist.inum, nloc); + EXPECT_EQ(max_list_size, 5); + for(int ii = 0; ii < nloc; ++ii){ + EXPECT_EQ(nlist.ilist[ii], ii); + EXPECT_EQ(nlist.numneigh[ii], expect_nlist_cpy[ii].size()); + std::sort(nlist.firstneigh[ii], nlist.firstneigh[ii] + nlist.numneigh[ii]); + for(int jj = 0; jj < nlist.numneigh[ii]; ++jj){ + EXPECT_EQ(nlist.firstneigh[ii][jj], expect_nlist_cpy[ii][jj]); + } + } + + delete[] ilist; + delete[] numneigh; + delete[] jlist; + delete[] firstneigh; + deepmd::delete_device_memory(nlist_data_dev); + deepmd::delete_device_memory(jlist_dev); + deepmd::delete_device_memory(ilist_dev); + deepmd::delete_device_memory(numneigh_dev); + deepmd::delete_device_memory(firstneigh_dev); + deepmd::delete_device_memory(c_cpy_dev); +} + +TEST_F(TestNeighborList, gpu_lessmem) +{ + int mem_size = 47; + + int * nlist_data_dev=NULL, * jlist_dev=NULL, * ilist_dev=NULL, * numneigh_dev=NULL; + int ** firstneigh_dev=NULL; + std::vector temp_firstneigh(nloc); + double * c_cpy_dev=NULL; + + deepmd::malloc_device_memory(nlist_data_dev, 2 * nloc * mem_size); + deepmd::malloc_device_memory(jlist_dev, nloc * mem_size); + deepmd::malloc_device_memory(ilist_dev, nloc); + deepmd::malloc_device_memory(numneigh_dev, nloc); + for(int ii = 0; ii < nloc; ++ii){ + temp_firstneigh[ii] = jlist_dev + ii * mem_size; + } + deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh); + deepmd::malloc_device_memory_sync(c_cpy_dev, posi_cpy); + deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev); + + int max_list_size; + int ret = deepmd::build_nlist_gpu_rocm( + nlist_dev, + &max_list_size, + nlist_data_dev, + c_cpy_dev, 
+ nloc, + nall, + mem_size, + rc); + + EXPECT_EQ(ret, 1); + deepmd::delete_device_memory(nlist_data_dev); + deepmd::delete_device_memory(jlist_dev); + deepmd::delete_device_memory(ilist_dev); + deepmd::delete_device_memory(numneigh_dev); + deepmd::delete_device_memory(firstneigh_dev); + deepmd::delete_device_memory(c_cpy_dev); +} + +#endif //TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc index d9c7c1319d..71685de2aa 100644 --- a/source/lib/tests/test_prod_force_a.cc +++ b/source/lib/tests/test_prod_force_a.cc @@ -127,3 +127,33 @@ TEST_F(TestProdForceA, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceA, gpu_rocm) +{ + std::vector force(nall * 3, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * force_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(force_dev, force); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + + deepmd::prod_force_a_gpu_rocm (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(force_dev, force); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(force_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + + EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), expected_force.size()); + for (int jj = 0; jj < force.size(); ++jj){ + EXPECT_LT(fabs(force[jj] - expected_force[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc index e456e2a8dd..45f576a667 100644 --- a/source/lib/tests/test_prod_force_grad_a.cc +++ b/source/lib/tests/test_prod_force_grad_a.cc @@ -126,3 +126,33 @@ 
TEST_F(TestProdForceGradA, gpu) // printf("\n"); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceGradA, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::prod_force_grad_a_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc index da4ac96d3b..5db405931d 100644 --- a/source/lib/tests/test_prod_force_grad_r.cc +++ b/source/lib/tests/test_prod_force_grad_r.cc @@ -126,3 +126,33 @@ TEST_F(TestProdForceGradR, gpu) // printf("\n"); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceGradR, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory(grad_net_dev, 
nloc * ndescrpt); + deepmd::prod_force_grad_r_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc index e77cafdace..6577d184a0 100644 --- a/source/lib/tests/test_prod_force_r.cc +++ b/source/lib/tests/test_prod_force_r.cc @@ -127,3 +127,33 @@ TEST_F(TestProdForceR, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdForceR, gpu_rocm) +{ + std::vector force(nall * 3, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * force_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(force_dev, force); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + + deepmd::prod_force_r_gpu_rocm (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(force_dev, force); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(force_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + + EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), expected_force.size()); + for (int jj = 0; jj < force.size(); ++jj){ + EXPECT_LT(fabs(force[jj] - 
expected_force[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc index f63fc00fb5..fcea02891d 100644 --- a/source/lib/tests/test_prod_virial_a.cc +++ b/source/lib/tests/test_prod_virial_a.cc @@ -159,3 +159,50 @@ TEST_F(TestProdVirialA, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialA, gpu_rocm) +{ + std::vector virial(9, 0.0); + std::vector atom_virial(nall * 9, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * virial_dev = NULL, *atom_virial_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(virial_dev, virial); + deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + + deepmd::prod_virial_a_gpu_rocm (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(virial_dev, virial); + deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(virial_dev); + deepmd::delete_device_memory(atom_virial_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + // virial are not calculated in gpu currently; + // for (int ii = 0; ii < 9; ii++) { + // virial[ii] = 0; + // } + // for (int ii = 0; ii < nall * 9; ii++) { + // virial[ii % 9] += atom_virial[ii]; + // } + EXPECT_EQ(virial.size(), 9); + EXPECT_EQ(virial.size(), expected_virial.size()); + EXPECT_EQ(atom_virial.size(), nall * 9); + EXPECT_EQ(atom_virial.size(), expected_atom_virial.size()); + for (int jj = 0; jj < virial.size(); ++jj){ + 
EXPECT_LT(fabs(virial[jj] - expected_virial[jj]) , 1e-5); + } + for (int jj = 0; jj < atom_virial.size(); ++jj){ + EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]) , 1e-5); + } +} +#endif //TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/tests/test_prod_virial_grad_a.cc b/source/lib/tests/test_prod_virial_grad_a.cc index 461552c5a3..469c7d4027 100644 --- a/source/lib/tests/test_prod_virial_grad_a.cc +++ b/source/lib/tests/test_prod_virial_grad_a.cc @@ -132,4 +132,38 @@ TEST_F(TestProdVirialGradA, gpu) // } // printf("\n"); } -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA + + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialGradA, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int n_a_sel = nnei; + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::prod_virial_grad_a_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM \ No newline at end of file diff --git 
a/source/lib/tests/test_prod_virial_grad_r.cc b/source/lib/tests/test_prod_virial_grad_r.cc index 3f12599232..c517cd60ef 100644 --- a/source/lib/tests/test_prod_virial_grad_r.cc +++ b/source/lib/tests/test_prod_virial_grad_r.cc @@ -133,3 +133,36 @@ TEST_F(TestProdVirialGradR, gpu) // printf("\n"); } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialGradR, gpu) +{ + std::vector grad_net(nloc * ndescrpt); + int n_a_sel = nnei; + int * nlist_dev = NULL; + double * grad_net_dev = NULL, * grad_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(grad_dev, grad); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); + deepmd::prod_virial_grad_r_gpu_rocm(grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::memcpy_device_to_host(grad_net_dev, grad_net); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(grad_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(grad_net_dev); + + EXPECT_EQ(grad_net.size(), nloc * ndescrpt); + EXPECT_EQ(grad_net.size(), expected_grad_net.size()); + for (int jj = 0; jj < grad_net.size(); ++jj){ + EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]) , 1e-5); + } + // for (int jj = 0; jj < nloc * ndescrpt; ++jj){ + // printf("%8.5f, ", grad_net[jj]); + // } + // printf("\n"); +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc index be7e865962..33cf9a9ed8 100644 --- a/source/lib/tests/test_prod_virial_r.cc +++ b/source/lib/tests/test_prod_virial_r.cc @@ -159,3 +159,50 @@ TEST_F(TestProdVirialR, gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestProdVirialR, gpu_rocm) +{ + std::vector 
virial(9, 0.0); + std::vector atom_virial(nall * 9, 0.0); + int n_a_sel = nnei; + + int * nlist_dev = NULL; + double * virial_dev = NULL, *atom_virial_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL; + + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(virial_dev, virial); + deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial); + deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); + deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + + deepmd::prod_virial_r_gpu_rocm (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei); + + deepmd::memcpy_device_to_host(virial_dev, virial); + deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(virial_dev); + deepmd::delete_device_memory(atom_virial_dev); + deepmd::delete_device_memory(net_deriv_dev); + deepmd::delete_device_memory(env_deriv_dev); + deepmd::delete_device_memory(rij_dev); + // virial are not calculated in gpu currently; + // for (int ii = 0; ii < 9; ii++) { + // virial[ii] = 0; + // } + // for (int ii = 0; ii < nall * 9; ii++) { + // virial[ii % 9] += atom_virial[ii]; + // } + EXPECT_EQ(virial.size(), 9); + EXPECT_EQ(virial.size(), expected_virial.size()); + EXPECT_EQ(atom_virial.size(), nall * 9); + EXPECT_EQ(atom_virial.size(), expected_atom_virial.size()); + for (int jj = 0; jj < virial.size(); ++jj){ + EXPECT_LT(fabs(virial[jj] - expected_virial[jj]) , 1e-5); + } + for (int jj = 0; jj < atom_virial.size(); ++jj){ + EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc index 5c8951de0e..e90ec55783 100644 --- a/source/lib/tests/test_simulation_region.cc +++ 
b/source/lib/tests/test_simulation_region.cc @@ -158,4 +158,76 @@ TEST_F(TestRegion, gpu) // EXPECT_EQ (50.332, square_root (2533.310224)); // } - +#if TENSORFLOW_USE_ROCM +TEST_F(TestRegion, gpu) +{ + // check rec_box + deepmd::Region region; + deepmd::Region region_dev; + double * new_boxt = region_dev.boxt; + double * new_rec_boxt = region_dev.rec_boxt; + double * boxt_dev = NULL, * rec_boxt_dev = NULL; + double * ref_rp_dev = NULL, * ref_ri_dev = NULL; + init_region_cpu(region, &ref_boxt[0]); + for(int ii = 0; ii < 9; ++ii){ + EXPECT_LT(fabs(region.rec_boxt[ii] - ref_rec_boxt[ii]), 1e-10); + } + deepmd::malloc_device_memory_sync(boxt_dev, region.boxt, 9); + deepmd::malloc_device_memory_sync(rec_boxt_dev, region.rec_boxt, 9); + deepmd::malloc_device_memory_sync(ref_rp_dev, ref_rp); + deepmd::malloc_device_memory_sync(ref_ri_dev, ref_ri); + region_dev.boxt = boxt_dev; + region_dev.rec_boxt = rec_boxt_dev; + // check volume + double vol[1]; + double * vol_dev = NULL; + deepmd::malloc_device_memory(vol_dev, 1); + deepmd::volume_gpu_rocm(vol_dev, region_dev); + deepmd::memcpy_device_to_host(vol_dev, vol, 1); + EXPECT_LT(fabs(vol[0] - expected_vol), 1e-10); + // check conversion between phys and inter coords. 
+ double ri[3]; + double * ri_dev = NULL; + deepmd::malloc_device_memory(ri_dev, 3); + deepmd::convert_to_inter_gpu_rocm(ri_dev, region_dev, ref_rp_dev); + deepmd::memcpy_device_to_host(ri_dev, ri, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(ri[ii] - ref_ri[ii]), 1e-10); + } + double rp2[3]; + double * rp2_dev = NULL; + deepmd::malloc_device_memory(rp2_dev, 3); + deepmd::convert_to_phys_gpu_rocm(rp2_dev, region_dev, ri_dev); + deepmd::memcpy_device_to_host(rp2_dev, rp2, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(rp2[ii] - ref_rp[ii]), 1e-10); + } + double rp[3]; + double * rp_dev = NULL; + deepmd::malloc_device_memory(rp_dev, 3); + deepmd::convert_to_phys_gpu_rocm(rp_dev, region_dev, ref_ri_dev); + deepmd::memcpy_device_to_host(rp_dev, rp, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(rp[ii] - ref_rp[ii]), 1e-10); + } + double ri2[3]; + double * ri2_dev = NULL; + deepmd::malloc_device_memory(ri2_dev, 3); + deepmd::convert_to_inter_gpu_rocm(ri2_dev, region_dev, rp_dev); + deepmd::memcpy_device_to_host(ri2_dev, ri2, 3); + for(int ii = 0; ii < 3; ++ii){ + EXPECT_LT(fabs(ri2[ii] - ref_ri[ii]), 1e-10); + } + deepmd::delete_device_memory(boxt_dev); + deepmd::delete_device_memory(rec_boxt_dev); + deepmd::delete_device_memory(vol_dev); + deepmd::delete_device_memory(ref_rp_dev); + deepmd::delete_device_memory(ref_ri_dev); + deepmd::delete_device_memory(ri_dev); + deepmd::delete_device_memory(rp2_dev); + deepmd::delete_device_memory(rp_dev); + deepmd::delete_device_memory(ri2_dev); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/tests/test_tabulate.cc b/source/lib/tests/test_tabulate.cc index b22cca03d8..43c0ef798e 100644 --- a/source/lib/tests/test_tabulate.cc +++ b/source/lib/tests/test_tabulate.cc @@ -232,3 +232,63 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda) } } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestTabulate, 
tabulate_fusion_gpu_rocm) +{ + std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); + + double * xyz_scatter_dev = NULL, * table_dev = NULL, * em_x_dev = NULL, * em_dev = NULL; + deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); + deepmd::malloc_device_memory_sync(table_dev, table); + deepmd::malloc_device_memory_sync(em_x_dev, em_x); + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::tabulate_fusion_gpu_rocm(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size); + deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); + deepmd::delete_device_memory(xyz_scatter_dev); + deepmd::delete_device_memory(table_dev); + deepmd::delete_device_memory(em_x_dev); + deepmd::delete_device_memory(em_dev); + + EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); + EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size()); + for (int jj = 0; jj < xyz_scatter.size(); ++jj){ + EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter[jj]) , 1e-5); + } +} + +TEST_F(TestTabulate, tabulate_fusion_grad_gpu_rocm) +{ + std::vector dy_dem_x(em_x.size(), 0.0); + std::vector dy_dem(em.size(), 0.0); + std::vector dy(nloc * nnei * last_layer_size, 1.0); + + double * dy_dem_x_dev = NULL, * dy_dem_dev = NULL, * table_dev = NULL, * em_x_dev = NULL, * em_dev = NULL, * dy_dev = NULL; + deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); + deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); + deepmd::malloc_device_memory_sync(table_dev, table); + deepmd::malloc_device_memory_sync(em_x_dev, em_x); + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(dy_dev, dy); + deepmd::tabulate_fusion_grad_gpu_rocm(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size); + deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); + deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); + deepmd::delete_device_memory(dy_dem_x_dev); + 
deepmd::delete_device_memory(dy_dem_dev); + deepmd::delete_device_memory(table_dev); + deepmd::delete_device_memory(em_x_dev); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(dy_dev); + + EXPECT_EQ(dy_dem_x.size(), nloc * nnei); + EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); + EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size()); + EXPECT_EQ(dy_dem.size(), expected_dy_dem.size()); + for (int jj = 0; jj < dy_dem_x.size(); ++jj){ + EXPECT_LT(fabs(dy_dem_x[jj] - expected_dy_dem_x[jj]) , 1e-5); + } + for (int jj = 0; jj < dy_dem.size(); ++jj){ + EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]) , 1e-5); + } +} +#endif // TENSORFLOW_USE_ROCM diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index a59ad84960..8157ddb8dd 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/ " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_CC@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_DEVICE@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_CC@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index c3dc7b6815..dbb469f6ef 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -5,18 +5,24 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc 
soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc) file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) +file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) if (BUILD_CPP_IF) - if (USE_CUDA_TOOLKIT) - add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) - find_package(CUDA REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) - target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) - else (USE_CUDA_TOOLKIT) - add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) - endif (USE_CUDA_TOOLKIT) + if (USE_CUDA_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) + elseif (USE_ROCM_TOOLKIT) + add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) + find_package(HIP REQUIRED) + include_directories(${ROCM_INCLUDE_DIRS}) + target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) + else() + add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) + endif() endif (BUILD_CPP_IF) if (BUILD_PY_IF) @@ -27,12 +33,20 @@ if (BUILD_PY_IF) add_library(op_grads SHARED ${OP_GRADS_SRC}) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) - target_link_libraries (op_abi ${LIB_DEEPMD_OP_CUDA}) - target_link_libraries (op_grads ${LIB_DEEPMD_OP_CUDA}) - else 
(USE_CUDA_TOOLKIT) + target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) + target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) + elseif (USE_ROCM_TOOLKIT) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) + add_library(op_grads SHARED ${OP_GRADS_SRC}) + find_package(HIP REQUIRED) + include_directories(${HIP_INCLUDE_DIRS}) + target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) + target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) + else() add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) endif(USE_CUDA_TOOLKIT) + message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries(op_abi ${LIB_DEEPMD}) target_link_libraries(op_grads ${LIB_DEEPMD}) @@ -62,3 +76,4 @@ if (BUILD_PY_IF) install(TARGETS op_grads DESTINATION deepmd/op/) install(FILES ${OP_PY} DESTINATION deepmd/op/) endif (BUILD_PY_IF) + diff --git a/source/op/custom_op.h b/source/op/custom_op.h index 6a8533f66f..e4f9211e61 100644 --- a/source/op/custom_op.h +++ b/source/op/custom_op.h @@ -18,12 +18,12 @@ struct DeviceFunctor { { device = "CPU"; } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()( std::string& device, const GPUDevice& d) { device = "GPU"; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; \ No newline at end of file diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index ece0d07ab3..953d89f55a 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -49,6 +49,12 @@ class GeluOp : public OpKernel { out, x, size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::gelu_gpu_rocm( + out, + x,size); + #endif//TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_cpu( @@ -92,6 +98,12 @@ class GeluGradOp : public OpKernel { out, x, dy, size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::gelu_grad_gpu_rocm( + out, + x, dy, size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_cpu( @@ -133,6
+145,12 @@ class GeluGradGradOp : public OpKernel { out, x, dy, dy_2, size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::gelu_grad_grad_gpu_rocm( + out, + x, dy, dy_2, size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_grad_cpu( @@ -157,7 +175,7 @@ REGISTER_KERNEL_BUILDER( \ REGISTER_CPU(float); REGISTER_CPU(double); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -170,4 +188,4 @@ REGISTER_KERNEL_BUILDER( \ GeluGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 1bdd5553f4..64d40acff4 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -180,6 +180,80 @@ _prepare_coord_nlist_gpu( #endif //GOOGLE_CUDA +#if TENSORFLOW_USE_ROCM +template +static int +_norm_copy_coord_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE * & coord_cpy, + int * & type_cpy, + int * & idx_mapping, + int & nall, + int & mem_cpy, + const FPTYPE * coord, + const FPTYPE * box, + const int * type, + const int &nloc, + const int &max_cpy_trial, + const float & rcut_r); + +template +static int +_build_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + int * &ilist, + int * &numneigh, + int ** &firstneigh, + int * &jlist, + int & max_nnei, + int & mem_nnei, + const FPTYPE *coord, + const int & nloc, + const int & new_nall, + const int & max_nnei_trial, + const float & rcut_r); + +static void +_map_nlist_gpu_rocm( + int * nlist, + const int * idx_mapping, + const int & nloc, + const int & nnei); + +template +static void +_prepare_coord_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE const ** coord, + FPTYPE * & coord_cpy, + int const** type, + int * 
& type_cpy, + int * & idx_mapping, + deepmd::InputNlist & inlist, + int * & ilist, + int * & numneigh, + int ** & firstneigh, + int * & jlist, + int * & nbor_list_dev, + int & new_nall, + int & mem_cpy, + int & mem_nnei, + int & max_nbor_size, + const FPTYPE * box, + const int * mesh_tensor_data, + const int mesh_tensor_size, + const int & nloc, + const int & nei_mode, + const float & rcut_r, + const int & max_cpy_trial, + const int & max_nnei_trial); + +#endif //TENSORFLOW_USE_ROCM + + template class ProdEnvMatAOp : public OpKernel { public: @@ -366,6 +440,44 @@ class ProdEnvMatAOp : public OpKernel { if(b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); deepmd::delete_device_memory(firstneigh); #endif //GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + int * idx_mapping = NULL; + int * ilist = NULL, * numneigh = NULL; + int ** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int * jlist = NULL; + FPTYPE * coord_cpy; + int * type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + std::vector tensor_list(7); + // prepare coord and nlist + _prepare_coord_nlist_gpu_rocm( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + // launch the gpu(nv) compute function + deepmd::prod_env_mat_a_gpu_rocm( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a); + if(b_nlist_map) _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + #endif //TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -588,6 +700,45 @@ class ProdEnvMatROp : public OpKernel { if(b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); deepmd::delete_device_memory(firstneigh); #endif //GOOGLE_CUDA + + + #if TENSORFLOW_USE_ROCM + int * idx_mapping = NULL; + int * ilist = NULL, * numneigh = NULL; + int ** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int * jlist = NULL; + FPTYPE * coord_cpy; + int * type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + std::vector tensor_list(7); + // prepare coord and nlist + _prepare_coord_nlist_gpu_rocm( + context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut, max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + nloc * sec.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + // launch the gpu(nv) compute function + deepmd::prod_env_mat_r_gpu_rocm( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut, rcut_smth, sec); + if(b_nlist_map) _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + #endif //TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; @@ -996,6 +1147,222 @@ _prepare_coord_nlist_gpu( } #endif // GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +template +static int +_norm_copy_coord_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE * & coord_cpy, + int * & type_cpy, + int * & idx_mapping, + int & nall, + int & mem_cpy, + const FPTYPE * coord, + const FPTYPE * box, + const int * type, + const int &nloc, + const int &max_cpy_trial, + const float & rcut_r) +{ + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall*3); + context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, tensor_list); + FPTYPE * tmp_coord = (*tensor_list).flat().data(); + hipErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); + + deepmd::Region region; + init_region_cpu(region, box); + FPTYPE box_info[18]; + std::copy(region.boxt, region.boxt+9, box_info); + std::copy(region.rec_boxt, region.rec_boxt+9, box_info+9); + int cell_info[23]; + deepmd::compute_cell_info(cell_info, rcut_r, region); + const int loc_cellnum=cell_info[21]; + const int total_cellnum=cell_info[22]; + 
//Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + context->allocate_temp(DataTypeToEnum::value, double_shape, tensor_list+1); + //Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(23+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc); + context, context->allocate_temp(DT_INT32, int_shape, tensor_list+2); + FPTYPE * box_info_dev = (*(tensor_list+1)).flat().data(); + int * cell_info_dev = (*(tensor_list+2)).flat().data(); + int * int_data_dev = cell_info_dev + 23; + deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); + deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); + deepmd::Region region_dev; + FPTYPE * new_boxt = region_dev.boxt; + FPTYPE * new_rec_boxt = region_dev.rec_boxt; + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + deepmd::normalize_coord_gpu_rocm(tmp_coord, nall, region_dev); + int tt; + for(tt = 0; tt < max_cpy_trial; ++tt){ + //Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy*3); + context->allocate_temp(DataTypeToEnum::value, cpy_shape, tensor_list+3); + //Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy*2); + context, context->allocate_temp(DT_INT32, t_shape, tensor_list+4); + coord_cpy = (*(tensor_list+3)).flat().data(); + type_cpy = (*(tensor_list+4)).flat().data(); + idx_mapping = type_cpy + mem_cpy; + int ret = deepmd::copy_coord_gpu_rocm( + coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, + tmp_coord, type, nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); + if(ret == 0){ + break; + } + else{ + mem_cpy *= 2; + } + } + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; + return (tt != max_cpy_trial); +} + +template +static int +_build_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + int * &ilist, + int * &numneigh, + int ** &firstneigh, + int * &jlist, + int & max_nnei, + int & mem_nnei, + const FPTYPE *coord, + const int & 
nloc, + const int & new_nall, + const int & max_nnei_trial, + const float & rcut_r) +{ + //Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc*2); + context->allocate_temp(DT_INT32, nlist_shape, tensor_list); + ilist = (*tensor_list).flat().data(); + numneigh = ilist + nloc; + //Tensor jlist_temp; + int * ind_data = NULL; + + std::vector firstneigh_host(nloc); + int tt; + for(tt = 0; tt < max_nnei_trial; ++tt){ + TensorShape jlist_shape; + jlist_shape.AddDim(3*nloc*mem_nnei); + context->allocate_temp(DT_INT32, jlist_shape, tensor_list+1); + jlist = (*(tensor_list+1)).flat().data(); + ind_data = jlist + nloc * mem_nnei; + for(int ii = 0; ii < nloc; ++ii){ + firstneigh_host[ii] = jlist + ii * mem_nnei; + } + deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); + deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); + int ret = deepmd::build_nlist_gpu_rocm( + inlist, &max_nnei, ind_data, + coord, nloc, new_nall, mem_nnei, rcut_r); + if(ret == 0){ + break; + } + else{ + mem_nnei *= 2; + } + } + return (tt != max_nnei_trial); +} + +static void +_map_nlist_gpu_rocm( + int * nlist, + const int * idx_mapping, + const int & nloc, + const int & nnei) +{ + deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); +} + +template +static void +_prepare_coord_nlist_gpu_rocm( + OpKernelContext* context, + Tensor * tensor_list, + FPTYPE const ** coord, + FPTYPE * & coord_cpy, + int const** type, + int * & type_cpy, + int * & idx_mapping, + deepmd::InputNlist & inlist, + int * & ilist, + int * & numneigh, + int ** & firstneigh, + int * & jlist, + int * & nbor_list_dev, + int & new_nall, + int & mem_cpy, + int & mem_nnei, + int & max_nbor_size, + const FPTYPE * box, + const int * mesh_tensor_data, + const int mesh_tensor_size, + const int & nloc, + const int & nei_mode, + const float & rcut_r, + const int & max_cpy_trial, + const int & max_nnei_trial) +{ + if(nei_mode != 3){ + inlist.inum = nloc; + // build nlist by myself + // normalize and copy 
coord + if(nei_mode == 1){ + int copy_ok = _norm_copy_coord_gpu_rocm( + context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, + *coord, box, *type, nloc, max_cpy_trial, rcut_r); + OP_REQUIRES (context, copy_ok, errors::Aborted("cannot allocate mem for copied coords")); + *coord = coord_cpy; + *type = type_cpy; + } + //build nlist + int build_ok = _build_nlist_gpu_rocm( + context, tensor_list + 5, ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei, + *coord, nloc, new_nall, max_nnei_trial, rcut_r); + OP_REQUIRES (context, build_ok, errors::Aborted("cannot allocate mem for nlist")); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + inlist.ilist = ilist; + inlist.numneigh = numneigh; + inlist.firstneigh = firstneigh; + } + else{ + // update nbor list + deepmd::InputNlist inlist_temp; + inlist_temp.inum = nloc; + deepmd::env_mat_nbor_update( + inlist_temp, inlist, max_nbor_size, nbor_list_dev, + mesh_tensor_data, mesh_tensor_size); + OP_REQUIRES (context, (max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit.")); + } +} +#endif // TENSORFLOW_USE_ROCM + + // Register the CPU kernels. #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ @@ -1008,7 +1375,7 @@ REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ @@ -1018,4 +1385,4 @@ REGISTER_KERNEL_BUILDER( ProdEnvMatROp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 1bda63903b..2dae7c1a0b 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -113,6 +113,12 @@ class ProdForceSeAGradOp : public OpKernel { grad_net, grad, in_deriv, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_grad_a_gpu_rocm( + grad_net, + grad, in_deriv, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_a_cpu( @@ -215,6 +221,12 @@ class ProdForceSeRGradOp : public OpKernel grad_net, grad, in_deriv, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_grad_r_gpu_rocm( + grad_net, + grad, in_deriv, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_r_cpu( @@ -238,7 +250,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeAGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -248,4 +260,4 @@ REGISTER_KERNEL_BUILDER( ProdForceSeRGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index a49b9c1913..748971751e 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -88,6 +88,12 @@ class ProdForceSeAOp : public OpKernel { force, net_deriv, in_deriv, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_a_gpu_rocm( + force, + net_deriv, in_deriv, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_a_cpu( @@ -167,6 +173,12 @@ class ProdForceSeROp : public OpKernel { force, net_deriv, in_deriv, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_force_r_gpu_rocm( + force, + net_deriv, in_deriv, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_r_cpu( @@ -190,7 +202,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -200,4 +212,5 @@ REGISTER_KERNEL_BUILDER( ProdForceSeROp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc index ac74d1d141..996c8331b8 100644 --- a/source/op/prod_virial_grad_multi_device.cc +++ b/source/op/prod_virial_grad_multi_device.cc @@ -127,6 +127,12 @@ class ProdVirialSeAGradOp : public OpKernel grad_net, grad, in_deriv, rij, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_grad_a_gpu_rocm( + grad_net, + grad, in_deriv, rij, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_a_cpu( @@ -239,6 +245,12 @@ class ProdVirialSeRGradOp : public OpKernel grad_net, grad, in_deriv, rij, nlist, nloc, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_grad_r_gpu_rocm( + grad_net, + grad, in_deriv, rij, nlist, nloc, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_r_cpu( @@ -262,7 +274,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeAGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -272,4 +284,4 @@ REGISTER_KERNEL_BUILDER( ProdVirialSeRGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA \ No newline at end of file +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc index 14189082a4..00537179c9 100644 --- a/source/op/prod_virial_multi_device.cc +++ b/source/op/prod_virial_multi_device.cc @@ -97,6 +97,12 @@ class ProdVirialSeAOp : public OpKernel { virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_a_gpu_rocm( + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_a_cpu( @@ -183,6 +189,12 @@ class ProdVirialSeROp : public OpKernel { virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::prod_virial_r_gpu_rocm( + virial, atom_virial, + net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_r_cpu( @@ -206,7 +218,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. 
-#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ @@ -216,4 +228,4 @@ REGISTER_KERNEL_BUILDER( ProdVirialSeROp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 8d8f9e82d6..9d54bd18a8 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -67,6 +67,12 @@ class TabulateFusionOp : public OpKernel { descriptor, table, table_info, em_x, em, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_gpu_rocm( + descriptor, + table, table_info, em_x, em, nloc, nnei, last_layer_size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_cpu( @@ -129,6 +135,12 @@ class TabulateFusionGradOp : public OpKernel { dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_grad_gpu_rocm( + dy_dem_x, dy_dem, + table, table_info, em_x, em, dy, nloc, nnei, last_layer_size); + #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_grad_cpu( @@ -150,7 +162,7 @@ REGISTER_KERNEL_BUILDER( REGISTER_CPU(float); REGISTER_CPU(double); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ @@ -160,4 +172,4 @@ REGISTER_KERNEL_BUILDER( TabulateFusionGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 9cc2b5ef2a..5f23d639a5 100644 --- a/source/op/unaggregated_grad.cc 
+++ b/source/op/unaggregated_grad.cc @@ -42,12 +42,12 @@ struct UnaggregatedDyDxSFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! @@ -70,12 +70,12 @@ struct UnaggregatedDyDxFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template @@ -89,12 +89,12 @@ struct UnaggregatedDy2DxSFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! @@ -122,12 +122,12 @@ struct UnaggregatedDy2DxFunctor { } } - #if GOOGLE_CUDA + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { //Currently, Do nothing at all! 
return; } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template From 271f23e622435c6a8511deeeb5b3b0414bc1d494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 22 May 2021 16:38:29 +0800 Subject: [PATCH 02/11] modify CMakeLists.txt --- source/CMakeLists.txt | 29 ++++++++++++++++++++--------- source/api_cc/CMakeLists.txt | 7 ++++++- source/api_cc/tests/CMakeLists.txt | 13 ++++++++++--- source/lib/CMakeLists.txt | 5 +++++ source/lib/tests/CMakeLists.txt | 19 +++++++++++-------- source/op/CMakeLists.txt | 19 ++++++++++++++----- 6 files changed, 66 insertions(+), 26 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index e0ab5ad564..092f902beb 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,8 +2,6 @@ cmake_minimum_required(VERSION 3.7) project(DeePMD) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -# note -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") # build cpp or python interfaces if (NOT DEFINED BUILD_CPP_IF) set(BUILD_CPP_IF TRUE) @@ -75,6 +73,14 @@ if (USE_CUDA_TOOLKIT) endif() #define USE_ROCM_TOOLKIT +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if (DEFINED USE_ROCM_TOOLKIT) if (USE_ROCM_TOOLKIT) find_package(HIP REQUIRED) @@ -97,6 +103,11 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) @@ -221,13 +232,13 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_CC "deepmd_cc") if (USE_CUDA_TOOLKIT) set 
(LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") - else() - if(USE_ROCM_TOOLKIT) - set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") - else () - set (LIB_DEEPMD_OP_DEVICE "deepmd_op") - endif() - endif() + endif(USE_CUDA_TOOLKIT) + if (USE_ROCM_TOOLKIT) + set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") + endif(USE_ROCM_TOOLKIT) + if ((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + set (LIB_DEEPMD_OP_DEVICE "deepmd_op") + endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") set (LIB_DEEPMD_IPI "deepmd_ipi") diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index f2019928c9..cbb62df1b3 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -8,11 +8,15 @@ configure_file( @ONLY ) +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() - file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) @@ -26,6 +30,7 @@ if (USE_ROCM_TOOLKIT) target_link_libraries(${libname} ${HIP_LIBRARIES}) endif() + install(TARGETS ${libname} DESTINATION lib/) install( diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index e0785298b8..59e6bbc1cc 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -110,6 +110,10 @@ if (USE_ROCM_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -128,11 +132,14 @@ 
endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) -elseif(USE_ROCM_TOOLKIT) +endif(USE_CUDA_TOOLKIT) +if(USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread deepmd_op_rocm ${TensorFlow_LIBRARY} coverage_config) -else() +endif(USE_ROCM_TOOLKIT) +if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) -endif() +endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + add_test( runUnitTests runUnitTests ) find_package(GTest) diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 8c15c1d64b..c34543ab40 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -2,6 +2,11 @@ set (libname ${LIB_DEEPMD}) +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index d6908a18ce..36367797b4 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -62,7 +62,7 @@ if (DEFINED USE_ROCM_TOOLKIT) message(STATUS "Will not build rocm GPU support") endif() else() - find_package(HIP REQUIRED) + find_package(HIP QUIET) if (HIP_FOUND) link_directories(${HIP_ROOT_DIR}/lib) add_definitions("-DTENSORFLOW_USE_ROCM") @@ -80,6 +80,10 @@ if (USE_ROCM_TOOLKIT) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently 
supported") +endif() file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -106,14 +110,13 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) -elseif() - set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - message(status"EXTRA_LIBS:${EXTRA_LIBS}") - message(status"EXTRA_LIBS:${HIP}") - target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc ${EXTRA_LIBS} coverage_config) -else() +endif(USE_CUDA_TOOLKIT) +if (USE_ROCM_TOOLKIT) + target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc deepmd_op_rocm coverage_config) +endif(USE_ROCM_TOOLKIT) +if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) -endif() +endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) add_test( runUnitTests runUnitTests ) # include(GoogleTest) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index dbb469f6ef..eb456f9a97 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -9,18 +9,25 @@ file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") +endif() + if (BUILD_CPP_IF) if (USE_CUDA_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) - elseif() + endif(USE_CUDA_TOOLKIT) + if(USE_ROCM_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) find_package(HIP REQUIRED) 
include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) - else() + endif(USE_ROCM_TOOLKIT) + if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) endif() endif (BUILD_CPP_IF) @@ -35,17 +42,19 @@ if (BUILD_PY_IF) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - elseif() + endif(USE_CUDA_TOOLKIT) + if(USE_ROCM_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) find_package(HIP REQUIRED) include_directories(${HIP_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - else() + endif(USE_ROCM_TOOLKIT) + if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) - endif(USE_CUDA_TOOLKIT) + endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries(op_abi ${LIB_DEEPMD}) From 5e36affdb0a83ed7af9bf29ce3d5ea4ca68cdc33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Thu, 27 May 2021 10:27:02 +0800 Subject: [PATCH 03/11] modify CMakeLists.txt Fixed logic issues, but the absolute path has not yet been modified, leaving multiple errors thrown in the CMakeList --- source/CMakeLists.txt | 52 ++++++---------------- source/api_cc/CMakeLists.txt | 4 +- source/lib/CMakeLists.txt | 5 +-- source/lib/src/rocm/CMakeLists.txt | 19 ++------ source/lib/tests/CMakeLists.txt | 60 ++++++-------------------- source/op/CMakeLists.txt | 3 +- source/op/prod_env_mat_multi_device.cc | 2 +- 7 files changed, 35 insertions(+), 110 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 092f902beb..851873b063 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -51,26 
+51,17 @@ file(READ ${PROJECT_SOURCE_DIR}/config/MODEL_VER MODEL_VERSION) string(REPLACE "\n" " " MODEL_VERSION ${MODEL_VERSION}) message(STATUS "Supported model version: ${MODEL_VERSION}") -# define USE_CUDA_TOOLKIT -if (DEFINED USE_CUDA_TOOLKIT) - if (USE_CUDA_TOOLKIT) - find_package(CUDA REQUIRED) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(CUDA QUIET) - if (CUDA_FOUND) - set(USE_CUDA_TOOLKIT TRUE) - message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") - else() - set(USE_CUDA_TOOLKIT FALSE) - message(STATUS "No cuda support found, will not build nv GPU support") - endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") endif() + +# define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) + find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") -endif() + message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") +endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if(NOT DEFINED HIP_PATH) @@ -81,32 +72,13 @@ if(NOT DEFINED HIP_PATH) endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) -if (DEFINED USE_ROCM_TOOLKIT) - if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(HIP QUIET) - if (HIP_FOUND) - set(USE_ROCM_TOOLKIT TRUE) - add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") - else() - set(USE_ROCM_TOOLKIT FALSE) - message(STATUS "No rocm support found, will not build AMD GPU support") - endif() -endif() if (USE_ROCM_TOOLKIT) + find_package(HIP REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") -endif() + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in 
${HIP_ROOT_DIR}, build AMD GPU support") +endif (USE_ROCM_TOOLKIT) -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index cbb62df1b3..9e77e9fa2e 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -1,5 +1,4 @@ # libmd - set (libname ${LIB_DEEPMD_CC}) configure_file( @@ -27,7 +26,8 @@ if (USE_CUDA_TOOLKIT) endif() if (USE_ROCM_TOOLKIT) - target_link_libraries(${libname} ${HIP_LIBRARIES}) + LINK_LIBRARIES(${HIP_ROOT_DIR}\lib) + target_link_libraries(${libname} hip_hcc) endif() diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index c34543ab40..8ffbe8c192 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -1,5 +1,4 @@ # libmd - set (libname ${LIB_DEEPMD}) # Devices that have both ROCM and CUDA are not currently supported @@ -15,7 +14,6 @@ file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) add_library(${libname} SHARED ${LIB_SRC}) - if (USE_CUDA_TOOLKIT) add_definitions("-D GOOGLE_CUDA") add_subdirectory(src/cuda) @@ -27,7 +25,8 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") add_subdirectory(src/rocm) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - target_link_libraries (${libname} ${HIP_LIBRARIES} ${EXTRA_LIBS}) + LINK_LIBRARIES(${HIP_ROOT_DIR}/lib) + target_link_libraries (${libname} ${EXTRA_LIBS} hip_hcc) endif() if(BUILD_PY_IF) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 27c4d62f68..bd46a9f432 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -1,29 +1,17 @@ # required cmake version -cmake_minimum_required(VERSION 3.15) 
+cmake_minimum_required(VERSION 3.5) # project name project(deepmd_op_rocm) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "/opt/rocm/hip/cmake") -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) -link_directories(${HIP_ROOT_DIR}/lib) add_definitions("-DTENSORFLOW_USE_ROCM") # set c++ version c++11 -#SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_HIP_STANDARD 11) message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) -message(STATUS "HIP major version is " ${HIP_TOOLKIT_ROOT_DIR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) @@ -34,9 +22,8 @@ set (SOURCE_FILES ) hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) -target_link_libraries(deepmd_op_rocm ${HIP_LIBRARIES}) -#install(TARGETS deepmd_op_rocm DESTINATION lib/) +install(TARGETS deepmd_op_rocm DESTINATION lib/) if (BUILD_CPP_IF) install(TARGETS deepmd_op_rocm DESTINATION lib/) endif (BUILD_CPP_IF) diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 36367797b4..67ad0a7c93 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -18,29 +18,19 @@ message(status "${CMAKE_SOURCE_DIR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -# define USE_CUDA_TOOLKIT -if (DEFINED USE_CUDA_TOOLKIT) - if (USE_CUDA_TOOLKIT) - find_package(CUDA REQUIRED) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(CUDA QUIET) - if (CUDA_FOUND) - set(USE_CUDA_TOOLKIT TRUE) - message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") - else() - 
set(USE_CUDA_TOOLKIT FALSE) - message(STATUS "No cuda support found, will not build nv GPU support") - endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") endif() +# define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) + find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") + message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) -endif() +endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if(NOT DEFINED HIP_PATH) @@ -51,39 +41,15 @@ if(NOT DEFINED HIP_PATH) endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - -if (DEFINED USE_ROCM_TOOLKIT) - if(USE_ROCM_TOOLKIT) - link_directories(${HIP_ROOT_DIR}/lib) - add_definitions("-DTENSORFLOW_USE_ROCM") - find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) - else() - message(STATUS "Will not build rocm GPU support") - endif() -else() - find_package(HIP QUIET) - if (HIP_FOUND) - link_directories(${HIP_ROOT_DIR}/lib) - add_definitions("-DTENSORFLOW_USE_ROCM") - set(USE_ROCM_TOOLKIT TRUE) - add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build ROCM GPU support") - else() - set(USE_ROCM_TOOLKIT FALSE) - message(STATUS "No ROCM support found, will not build ROCM GPU support") - endif() -endif() if (USE_ROCM_TOOLKIT) - add_definitions("-DUSE_ROCM_TOOLKIT") - include_directories(${ROCM_INCLUDE_DIRS}) + find_package(HIP REQUIRED) + link_directories(${HIP_ROOT_DIR}/lib) + add_definitions("-D TENSORFLOW_USE_ROCM") + add_compile_definitions(__HIP_PLATFORM_HCC__) + message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build AMD GPU support") add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) -endif() +endif (USE_ROCM_TOOLKIT) -# Devices that have both ROCM and 
CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index eb456f9a97..78c0a12803 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -25,7 +25,8 @@ if (BUILD_CPP_IF) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) find_package(HIP REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) - target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) + link_directories(${HIP_ROOT_DIR}/lib) + target_link_libraries (${LIB_DEEPMD_OP} hpi_hcc) endif(USE_ROCM_TOOLKIT) if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 64d40acff4..e4e12cac2b 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -441,7 +441,7 @@ class ProdEnvMatAOp : public OpKernel { deepmd::delete_device_memory(firstneigh); #endif //GOOGLE_CUDA - #if TENSORFLOW_USE_ROCM + #if TENSORFLOW_USE_ROCM int * idx_mapping = NULL; int * ilist = NULL, * numneigh = NULL; int ** firstneigh = NULL; From 29ee994ac3ac5d233e9bed892714aa3b92a00105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Thu, 27 May 2021 21:32:36 +0800 Subject: [PATCH 04/11] ADD FindROCM.cmake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1、ADD FindROCM.cmake 2、modify CMAKELISTS.TXT --- source/CMakeLists.txt | 12 +---- source/api_cc/CMakeLists.txt | 7 ++- source/api_cc/tests/CMakeLists.txt | 64 +++++------------------ source/cmake/FindROCM.cmake | 83 ++++++++++++++++++++++++++++++ source/lib/CMakeLists.txt | 13 +++-- source/lib/src/rocm/CMakeLists.txt | 8 +-- source/lib/tests/CMakeLists.txt | 16 ++---- 
source/op/CMakeLists.txt | 11 ++-- 8 files changed, 123 insertions(+), 91 deletions(-) create mode 100644 source/cmake/FindROCM.cmake diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 851873b063..dd4b0039e1 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -64,19 +64,11 @@ if (USE_CUDA_TOOLKIT) endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) + find_package(ROCM REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build AMD GPU support") + message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support") endif (USE_ROCM_TOOLKIT) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 9e77e9fa2e..ad07b9da53 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -16,6 +16,10 @@ if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() +if (USE_ROCM_TOOLKIT) + include_directories("${ROCM_INCLUDE_DIRS}") +endif() + file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) @@ -26,8 +30,7 @@ if (USE_CUDA_TOOLKIT) endif() if (USE_ROCM_TOOLKIT) - LINK_LIBRARIES(${HIP_ROOT_DIR}\lib) - target_link_libraries(${libname} hip_hcc) + target_link_libraries (${libname} ${ROCM_LIBRARIES}) endif() diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 59e6bbc1cc..66901011a5 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -52,68 +52,28 @@ if (OPENMP_FOUND) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
${OpenMP_CXX_FLAGS}") endif() -# define USE_CUDA_TOOLKIT -if (DEFINED USE_CUDA_TOOLKIT) - if (USE_CUDA_TOOLKIT) - find_package(CUDA REQUIRED) - else() - message(STATUS "Will not build nv GPU support") - endif() -else() - find_package(CUDA QUIET) - if (CUDA_FOUND) - set(USE_CUDA_TOOLKIT TRUE) - message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") - else() - set(USE_CUDA_TOOLKIT FALSE) - message(STATUS "No cuda support found, will not build nv GPU support") - endif() +# Devices that have both ROCM and CUDA are not currently supported +if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) + message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") endif() +# define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) + find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") + message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) -endif() - -# define USE_ROCM_TOOLKIT -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) -if (DEFINED USE_ROCM_TOOLKIT) - if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) - add_compile_definitions(__HIP_PLATFORM_HCC__) - else() - message(STATUS "Will not build AMD GPU support") - endif() -else() - find_package(HIP QUIET) - if (HIP_FOUND) - set(USE_ROCM_TOOLKIT TRUE) - add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${ROCM_TOOLKIT_ROOT_DIR}, build AMD GPU support") - else() - set(USE_ROCM_TOOLKIT FALSE) - message(STATUS "No rocm support found, will not build AMD GPU support") - endif() -endif() +endif(USE_CUDA_TOOLKIT) +#define USE_ROCM_TOOLKIT if (USE_ROCM_TOOLKIT) + find_package(ROCM 
REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") + add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) -endif() - -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() +endif (USE_ROCM_TOOLKIT) file(GLOB TEST_SRC test_*.cc) add_executable( runUnitTests ${TEST_SRC} ) @@ -134,7 +94,7 @@ if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) endif(USE_CUDA_TOOLKIT) if(USE_ROCM_TOOLKIT) - target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread deepmd_op_rocm ${TensorFlow_LIBRARY} coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_rocm coverage_config) endif(USE_ROCM_TOOLKIT) if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake new file mode 100644 index 0000000000..d012e7772c --- /dev/null +++ b/source/cmake/FindROCM.cmake @@ -0,0 +1,83 @@ +# Input: +# ROCM_ROOT +# +# Output: +# ROCM_FOUND +# ROCM_INCLUDE_DIRS +# ROCM_LIBRARIES + +# define the search path +list(APPEND ROCM_search_PATHS ${ROCM_ROOT}) +list(APPEND ROCM_search_PATHS "/opt/rocm/") +list(APPEND HIP_search_PATHS ${ROCM_ROOT}/hip) +list(APPEND HIP_search_PATHS "/opt/rocm/hip") + +# define the libs to find +if (NOT ROCM_FIND_COMPONENTS) + set(ROCM_FIND_COMPONENTS hip_hcc hiprtc) +endif () + +# includes +find_path (ROCM_INCLUDE_DIRS + NAMES + hip/hip_runtime.h + rocprim/rocprim.hpp + hipcub/hipcub.hpp + PATHS 
${ROCM_search_PATHS} + PATH_SUFFIXES "include" + NO_DEFAULT_PATH + ) +if (NOT ROCM_INCLUDE_DIRS AND ROCM_FIND_REQUIRED) + message(FATAL_ERROR + "Not found 'hip' or 'rocprim' or 'hipcub' directory in path '${ROCM_search_PATHS}' " + "You can manually set the ROCM install path by -DROCM_ROOT ") +endif () + +# libs +foreach (module ${ROCM_FIND_COMPONENTS}) + find_library(ROCM_LIBRARIES_${module} + NAMES ${module} + PATHS ${ROCM_search_PATHS} PATH_SUFFIXES "lib" NO_DEFAULT_PATH + ) + if (ROCM_LIBRARIES_${module}) + list(APPEND ROCM_LIBRARIES ${ROCM_LIBRARIES_${module}}) + elseif (ROCM_FIND_REQUIRED) + message(FATAL_ERROR + "Not found lib/'${module}' in '${ROCM_search_PATHS}' " + "You can manually set the ROCM install path by -DROCM_ROOT ") + endif () +endforeach () + +# FindHIP.cmake +find_path (HIP_CMAKE + NAMES + FindHIP.cmake + PATHS ${HIP_search_PATHS} + PATH_SUFFIXES "cmake" + NO_DEFAULT_PATH + ) + +if (NOT HIP_CMAKE AND ROCM_FIND_REQUIRED) + message(FATAL_ERROR + "Not found 'FindHIP.cmake' file in path '${ROCM_search_PATHS}' " + "You can manually set the ROCM install path by -DROCM_ROOT ") +endif () + +list (APPEND CMAKE_MODULE_PATH ${HIP_CMAKE}) +find_package(HIP) + +# define the output variable +if (ROCM_INCLUDE_DIRS AND ROCM_LIBRARIES AND HIP_CMAKE) + set(ROCM_FOUND TRUE) +else () + set(ROCM_FOUND FALSE) +endif () + +# print message +if (NOT ROCM_FIND_QUIETLY) + message(STATUS "Found ROCM: ${ROCM_INCLUDE_DIRS}, ${ROCM_LIBRARIES}, ${HIP_CMAKE}" + " in ${ROCM_search_PATHS}") +endif () + +unset(ROCM_search_PATHS) +unset(HIP_search_PATHS) \ No newline at end of file diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 8ffbe8c192..895471442d 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -1,19 +1,19 @@ # libmd set (libname ${LIB_DEEPMD}) -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not 
currently supported") -endif() - if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() +if (USE_ROCM_TOOLKIT) + include_directories("${ROCM_INCLUDE_DIRS}") +endif() + file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) add_library(${libname} SHARED ${LIB_SRC}) + if (USE_CUDA_TOOLKIT) add_definitions("-D GOOGLE_CUDA") add_subdirectory(src/cuda) @@ -25,8 +25,7 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") add_subdirectory(src/rocm) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) - LINK_LIBRARIES(${HIP_ROOT_DIR}/lib) - target_link_libraries (${libname} ${EXTRA_LIBS} hip_hcc) + target_link_libraries (${libname} ${ROCM_LIBRARIES} ${EXTRA_LIBS}) endif() if(BUILD_PY_IF) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index bd46a9f432..839a51680d 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -3,9 +3,12 @@ cmake_minimum_required(VERSION 3.5) # project name project(deepmd_op_rocm) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -find_package(HIP REQUIRED) + +#find package ROCM +list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../../cmake/) +find_package(ROCM REQUIRED) + add_compile_definitions(__HIP_PLATFORM_HCC__) -add_definitions("-DTENSORFLOW_USE_ROCM") # set c++ version c++11 SET(CMAKE_CXX_STANDARD 11) @@ -13,7 +16,6 @@ SET(CMAKE_HIP_STANDARD 11) message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) - set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 67ad0a7c93..3c738e2973 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -33,20 +33,12 @@ if (USE_CUDA_TOOLKIT) endif(USE_CUDA_TOOLKIT) #define 
USE_ROCM_TOOLKIT -if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() - set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") - endif() -endif() -set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) if (USE_ROCM_TOOLKIT) - find_package(HIP REQUIRED) - link_directories(${HIP_ROOT_DIR}/lib) + list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake/) + find_package(ROCM REQUIRED) add_definitions("-D TENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) - message(STATUS "Found ROCM in ${HIP_ROOT_DIR}, build AMD GPU support") + include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) endif (USE_ROCM_TOOLKIT) @@ -78,7 +70,7 @@ if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) endif(USE_CUDA_TOOLKIT) if (USE_ROCM_TOOLKIT) - target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread hip_hcc deepmd_op_rocm coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_rocm coverage_config ${ROCM_LIBRARIES}) endif(USE_ROCM_TOOLKIT) if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 78c0a12803..b03eb8a2b0 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -9,6 +9,8 @@ file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) +list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../cmake/) + # Devices that have both ROCM and CUDA are not currently supported if 
(USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") @@ -23,10 +25,9 @@ if (BUILD_CPP_IF) endif(USE_CUDA_TOOLKIT) if(USE_ROCM_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) - find_package(HIP REQUIRED) + find_package(ROCM REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) - link_directories(${HIP_ROOT_DIR}/lib) - target_link_libraries (${LIB_DEEPMD_OP} hpi_hcc) + target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) endif(USE_ROCM_TOOLKIT) if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) @@ -47,8 +48,8 @@ if (BUILD_PY_IF) if(USE_ROCM_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) - find_package(HIP REQUIRED) - include_directories(${HIP_INCLUDE_DIRS}) + find_package(ROCM REQUIRED) + include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) endif(USE_ROCM_TOOLKIT) From 98c149ab33dd3adbe2b3a0fda1b3c65a6cf4af58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Fri, 28 May 2021 19:58:14 +0800 Subject: [PATCH 05/11] modify FindROCM.cmake --- source/cmake/FindROCM.cmake | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake index d012e7772c..c9fa80ec5d 100644 --- a/source/cmake/FindROCM.cmake +++ b/source/cmake/FindROCM.cmake @@ -9,8 +9,6 @@ # define the search path list(APPEND ROCM_search_PATHS ${ROCM_ROOT}) list(APPEND ROCM_search_PATHS "/opt/rocm/") -list(APPEND HIP_search_PATHS ${ROCM_ROOT}/hip) -list(APPEND HIP_search_PATHS "/opt/rocm/hip") # define the libs to find if (NOT ROCM_FIND_COMPONENTS) @@ -52,8 +50,8 @@ endforeach () find_path (HIP_CMAKE NAMES FindHIP.cmake - PATHS ${HIP_search_PATHS} - PATH_SUFFIXES "cmake" + PATHS ${ROCM_search_PATHS} + PATH_SUFFIXES 
"hip/cmake" NO_DEFAULT_PATH ) @@ -79,5 +77,4 @@ if (NOT ROCM_FIND_QUIETLY) " in ${ROCM_search_PATHS}") endif () -unset(ROCM_search_PATHS) -unset(HIP_search_PATHS) \ No newline at end of file +unset(ROCM_search_PATHS) \ No newline at end of file From 5d40b4a6f2954348592b8020bcd7b031a70f2993 Mon Sep 17 00:00:00 2001 From: pkulzy <47965866+pkulzy@users.noreply.github.com> Date: Fri, 28 May 2021 20:06:57 +0800 Subject: [PATCH 06/11] modify FindROCM.cmake --- source/cmake/FindROCM.cmake | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake index d012e7772c..e70c560084 100644 --- a/source/cmake/FindROCM.cmake +++ b/source/cmake/FindROCM.cmake @@ -9,8 +9,6 @@ # define the search path list(APPEND ROCM_search_PATHS ${ROCM_ROOT}) list(APPEND ROCM_search_PATHS "/opt/rocm/") -list(APPEND HIP_search_PATHS ${ROCM_ROOT}/hip) -list(APPEND HIP_search_PATHS "/opt/rocm/hip") # define the libs to find if (NOT ROCM_FIND_COMPONENTS) @@ -52,8 +50,8 @@ endforeach () find_path (HIP_CMAKE NAMES FindHIP.cmake - PATHS ${HIP_search_PATHS} - PATH_SUFFIXES "cmake" + PATHS ${ROCM_search_PATHS} + PATH_SUFFIXES "hip/cmake" NO_DEFAULT_PATH ) @@ -80,4 +78,3 @@ if (NOT ROCM_FIND_QUIETLY) endif () unset(ROCM_search_PATHS) -unset(HIP_search_PATHS) \ No newline at end of file From cd17a217a5e7a91cacc75f13533ebf20a1d673d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 29 May 2021 22:24:08 +0800 Subject: [PATCH 07/11] Fixed issue --- source/CMakeLists.txt | 12 +++++++----- source/api_cc/CMakeLists.txt | 5 ----- source/api_cc/tests/CMakeLists.txt | 12 +++++++----- source/lib/src/rocm/CMakeLists.txt | 10 +--------- source/lib/tests/CMakeLists.txt | 12 +++++++----- source/op/CMakeLists.txt | 21 +++++---------------- 6 files changed, 27 insertions(+), 45 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index dd4b0039e1..5feb38f3ac 100644 --- a/source/CMakeLists.txt +++ 
b/source/CMakeLists.txt @@ -61,6 +61,8 @@ if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) add_definitions("-D GOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") +else() + message(STATUS "Will not build nv GPU support") endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT @@ -69,6 +71,8 @@ if (USE_ROCM_TOOLKIT) add_definitions("-D TENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support") +else() + message(STATUS "Will not build AMD GPU support") endif (USE_ROCM_TOOLKIT) @@ -196,13 +200,11 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_CC "deepmd_cc") if (USE_CUDA_TOOLKIT) set (LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") - endif(USE_CUDA_TOOLKIT) - if (USE_ROCM_TOOLKIT) + elseif (USE_ROCM_TOOLKIT) set (LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") - endif(USE_ROCM_TOOLKIT) - if ((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + else() set (LIB_DEEPMD_OP_DEVICE "deepmd_op") - endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) set (LIB_DEEPMD_NATIVE "deepmd_native_md") set (LIB_DEEPMD_IPI "deepmd_ipi") diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index ad07b9da53..cfdfce9b0e 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -7,11 +7,6 @@ configure_file( @ONLY ) -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() - if (USE_CUDA_TOOLKIT) include_directories("${CUDA_INCLUDE_DIRS}") endif() diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 66901011a5..67a2c1c467 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -64,6 +64,8 @@ if (USE_CUDA_TOOLKIT) message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build 
nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) +else() + message(STATUS "Will not build nv GPU support") endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT @@ -73,6 +75,8 @@ if (USE_ROCM_TOOLKIT) add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +else() + message(STATUS "Will not build AMD GPU support") endif (USE_ROCM_TOOLKIT) file(GLOB TEST_SRC test_*.cc) @@ -92,13 +96,11 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) -endif(USE_CUDA_TOOLKIT) -if(USE_ROCM_TOOLKIT) +elseif(USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_rocm coverage_config) -endif(USE_ROCM_TOOLKIT) -if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +else() target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) -endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +endif() add_test( runUnitTests runUnitTests ) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 839a51680d..63c04360d9 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -4,12 +4,6 @@ cmake_minimum_required(VERSION 3.5) project(deepmd_op_rocm) set(CMAKE_LINK_WHAT_YOU_USE TRUE) -#find package ROCM -list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../../cmake/) -find_package(ROCM REQUIRED) - -add_compile_definitions(__HIP_PLATFORM_HCC__) - # set c++ version c++11 SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_HIP_STANDARD 11) @@ -19,9 +13,7 @@ message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; 
-D__HIP_PLATFORM_HCC__) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -set (SOURCE_FILES - prod_env_mat.hip.cu prod_force.hip.cu prod_virial.hip.cu gelu.hip.cu tabulate.hip.cu coord.hip.cu neighbor_list.hip.cu prod_force_grad.hip.cu prod_virial_grad.hip.cu region.hip.cu -) +set (SOURCE_FILES *.hip.cu ) hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 3c738e2973..4dc591b0eb 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -30,6 +30,8 @@ if (USE_CUDA_TOOLKIT) message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) +else() + message(STATUS "Will not build nv GPU support") endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT @@ -40,6 +42,8 @@ if (USE_ROCM_TOOLKIT) add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) +else() + message(STATUS "Will not build AMD GPU support") endif (USE_ROCM_TOOLKIT) @@ -68,13 +72,11 @@ endif() if (USE_CUDA_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_cuda coverage_config) -endif(USE_CUDA_TOOLKIT) -if (USE_ROCM_TOOLKIT) +elseif (USE_ROCM_TOOLKIT) target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread deepmd_op_rocm coverage_config ${ROCM_LIBRARIES}) -endif(USE_ROCM_TOOLKIT) -if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +else() target_link_libraries(runUnitTests gtest gtest_main ${libname} pthread coverage_config) -endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) +endif() add_test( runUnitTests runUnitTests ) # include(GoogleTest) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index b03eb8a2b0..340c5601fb 100644 --- 
a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -9,27 +9,18 @@ file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) -list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../cmake/) - -# Devices that have both ROCM and CUDA are not currently supported -if (USE_ROCM_TOOLKIT AND USE_CUDA_TOOLKIT) - message (FATAL_ERROR "Devices that have both ROCM and CUDA are not currently supported") -endif() - if (BUILD_CPP_IF) if (USE_CUDA_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_CUDA_SRC}) find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${CUDA_LIBRARIES}) - endif(USE_CUDA_TOOLKIT) - if(USE_ROCM_TOOLKIT) + elseif (USE_ROCM_TOOLKIT) add_library(${LIB_DEEPMD_OP} SHARED ${OP_ROCM_SRC}) find_package(ROCM REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (${LIB_DEEPMD_OP} ${ROCM_LIBRARIES}) - endif(USE_ROCM_TOOLKIT) - if((NOT USE_CUDA_TOOLKIT) AND (NOT USE_ROCM_TOOLKIT)) + else () add_library(${LIB_DEEPMD_OP} SHARED ${OP_SRC}) endif() endif (BUILD_CPP_IF) @@ -44,19 +35,17 @@ if (BUILD_PY_IF) include_directories(${CUDA_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - endif(USE_CUDA_TOOLKIT) - if(USE_ROCM_TOOLKIT) + elseif(USE_ROCM_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) find_package(ROCM REQUIRED) include_directories(${ROCM_INCLUDE_DIRS}) target_link_libraries (op_abi ${LIB_DEEPMD_OP_DEVICE}) target_link_libraries (op_grads ${LIB_DEEPMD_OP_DEVICE}) - endif(USE_ROCM_TOOLKIT) - if((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + else() add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED 
${OP_GRADS_SRC}) - endif((NOT USE_ROCM_TOOLKIT) AND (NOT USE_CUDA_TOOLKIT)) + endif() message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries(op_abi ${LIB_DEEPMD}) From 4fb9b3be15d7451421f6ba4b95613191757cda2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 29 May 2021 22:32:59 +0800 Subject: [PATCH 08/11] Update FindROCM.cmake --- source/cmake/FindROCM.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake index e70c560084..cab69e4999 100644 --- a/source/cmake/FindROCM.cmake +++ b/source/cmake/FindROCM.cmake @@ -74,7 +74,7 @@ endif () # print message if (NOT ROCM_FIND_QUIETLY) message(STATUS "Found ROCM: ${ROCM_INCLUDE_DIRS}, ${ROCM_LIBRARIES}, ${HIP_CMAKE}" - " in ${ROCM_search_PATHS}") + " in ${ROCM_search_PATHS}, build AMD GPU support") endif () unset(ROCM_search_PATHS) From 7f6273e278f69445a1ca2afab3fc8b8ae2f90e60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sat, 29 May 2021 23:41:15 +0800 Subject: [PATCH 09/11] fix issue about wildcard --- source/CMakeLists.txt | 4 ++-- source/api_cc/tests/CMakeLists.txt | 4 ++-- source/lib/CMakeLists.txt | 4 ++-- source/lib/src/cuda/CMakeLists.txt | 4 +--- source/lib/src/rocm/CMakeLists.txt | 2 +- source/lib/tests/CMakeLists.txt | 4 ++-- 6 files changed, 10 insertions(+), 12 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 5feb38f3ac..1c89c3cf25 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -59,7 +59,7 @@ endif() # define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") else() message(STATUS "Will not build nv GPU support") @@ -68,7 +68,7 @@ endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if (USE_ROCM_TOOLKIT) find_package(ROCM REQUIRED) - add_definitions("-D 
TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support") else() diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 67a2c1c467..111c7646bb 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -60,7 +60,7 @@ endif() # define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) @@ -71,7 +71,7 @@ endif(USE_CUDA_TOOLKIT) #define USE_ROCM_TOOLKIT if (USE_ROCM_TOOLKIT) find_package(ROCM REQUIRED) - add_definitions("-D TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 895471442d..c1369c23ee 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -15,14 +15,14 @@ file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) add_library(${libname} SHARED ${LIB_SRC}) if (USE_CUDA_TOOLKIT) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") add_subdirectory(src/cuda) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) target_link_libraries (${libname} ${CUDA_LIBRARIES} ${EXTRA_LIBS}) endif() if (USE_ROCM_TOOLKIT) - add_definitions("-D TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_subdirectory(src/rocm) set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) target_link_libraries (${libname} ${ROCM_LIBRARIES} ${EXTRA_LIBS}) diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt index 41a2ea091e..0c57e4e396 
100644 --- a/source/lib/src/cuda/CMakeLists.txt +++ b/source/lib/src/cuda/CMakeLists.txt @@ -105,9 +105,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -set (SOURCE_FILES - prod_env_mat.cu prod_force.cu prod_force_grad.cu prod_virial.cu prod_virial_grad.cu gelu.cu tabulate.cu coord.cu neighbor_list.cu region.cu -) +file (GLOB SOURCE_FILES "*.cu" ) cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 63c04360d9..6caf7cb658 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -13,7 +13,7 @@ message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -set (SOURCE_FILES *.hip.cu ) +file (GLOB SOURCE_FILES "*.hip.cu" ) hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index 4dc591b0eb..b12734af9b 100644 --- a/source/lib/tests/CMakeLists.txt +++ b/source/lib/tests/CMakeLists.txt @@ -26,7 +26,7 @@ endif() # define USE_CUDA_TOOLKIT if (USE_CUDA_TOOLKIT) find_package(CUDA REQUIRED) - add_definitions("-D GOOGLE_CUDA") + add_definitions("-DGOOGLE_CUDA") message(STATUS "Found CUDA in ${CUDA_TOOLKIT_ROOT_DIR}, build nv GPU support") include_directories(${CUDA_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/cuda cuda_binary_dir) @@ -38,7 +38,7 @@ endif(USE_CUDA_TOOLKIT) if (USE_ROCM_TOOLKIT) list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../../cmake/) find_package(ROCM REQUIRED) - add_definitions("-D TENSORFLOW_USE_ROCM") + add_definitions("-DTENSORFLOW_USE_ROCM") add_compile_definitions(__HIP_PLATFORM_HCC__) 
include_directories(${ROCM_INCLUDE_DIRS}) add_subdirectory(${LIB_BASE_DIR}/src/rocm rocm_binary_dir) From 18a79682216c34746297c3952e583a5307c379aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Sun, 30 May 2021 22:18:14 +0800 Subject: [PATCH 10/11] Modified duplicate --- source/lib/src/rocm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 6caf7cb658..903d1efec5 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -10,7 +10,7 @@ SET(CMAKE_HIP_STANDARD 11) message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) -set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11; -D__HIP_PLATFORM_HCC__) +set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") file (GLOB SOURCE_FILES "*.hip.cu" ) From f1535192b1ac0d36631b0ca77fbc77e7c264d0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=BD=E5=AE=87?= Date: Tue, 1 Jun 2021 19:06:30 +0800 Subject: [PATCH 11/11] Use add_definitions to replace set(CMAKE_CXX_FLAGS) --- source/lib/src/rocm/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt index 903d1efec5..393844b8bb 100644 --- a/source/lib/src/rocm/CMakeLists.txt +++ b/source/lib/src/rocm/CMakeLists.txt @@ -7,11 +7,12 @@ set(CMAKE_LINK_WHAT_YOU_USE TRUE) # set c++ version c++11 SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_HIP_STANDARD 11) +add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") +add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) set (HIP_HIPCC_FLAGS -hc; -fno-gpu-rdc; --amdgpu-target=gfx906; -fPIC; -O3; --std=c++11) 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT") file (GLOB SOURCE_FILES "*.hip.cu" )