diff --git a/.github/labeler.yml b/.github/labeler.yml index ad10d7cb7d..195d2cd217 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -5,8 +5,8 @@ Python: Docs: doc/**/* Examples: examples/**/* Core: source/lib/**/* -CUDA: source/lib/src/cuda/**/* -ROCM: source/lib/src/rocm/**/* +CUDA: source/lib/src/gpu/**/* +ROCM: source/lib/src/gpu/**/* OP: source/op/**/* C++: source/api_cc/**/* C: source/api_c/**/* diff --git a/.gitmodules b/.gitmodules index 7f3510b9d6..849b21ced5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "source/lib/src/cuda/cub"] - path = source/lib/src/cuda/cub +[submodule "source/lib/src/gpu/cub"] + path = source/lib/src/gpu/cub url = https://github.com/NVIDIA/cub.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8125324ea1..7ea4915f6e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,7 @@ repos: rev: v16.0.6 hooks: - id: clang-format - exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc + exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc # CSS - repo: https://github.com/pre-commit/mirrors-csslint rev: v1.0.5 @@ -83,7 +83,7 @@ repos: - --comment-style - // - --no-extra-eol - exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc + exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc # CSS - id: insert-license files: \.(css|scss)$ diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index e6a4b1a7cb..1447823c08 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -74,7 +74,7 @@ One may set the following environment variables before executing `pip`: | Environment variables | Allowed value | Default value | Usage | | --------------------- | ---------------------- | ------------- | -------------------------- | | DP_VARIANT | `cpu`, `cuda`, `rocm` | `cpu` | Build CPU variant or GPU variant with CUDA or ROCM support. | -| CUDAToolkit_ROOT | Path | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. | +| CUDAToolkit_ROOT | Path | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. | | ROCM_ROOT | Path | Detected automatically | The path to the ROCM toolkit directory. | | TENSORFLOW_ROOT | Path | Detected automatically | The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against.| | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1 | 0 | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. | @@ -188,7 +188,7 @@ One may add the following arguments to `cmake`: | -DTENSORFLOW_ROOT=<value> | Path | - | The Path to TensorFlow's C++ interface. | | -DCMAKE_INSTALL_PREFIX=<value> | Path | - | The Path where DeePMD-kit will be installed. | | -DUSE_CUDA_TOOLKIT=<value> | `TRUE` or `FALSE` | `FALSE` | If `TRUE`, Build GPU support with CUDA toolkit. | -| -DCUDAToolkit_ROOT=<value> | Path | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. | +| -DCUDAToolkit_ROOT=<value> | Path | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. | | -DUSE_ROCM_TOOLKIT=<value> | `TRUE` or `FALSE` | `FALSE` | If `TRUE`, Build GPU support with ROCM toolkit. | | -DCMAKE_HIP_COMPILER_ROCM_ROOT=<value> | Path | Detected automatically | The path to the ROCM toolkit directory. | | -DLAMMPS_SOURCE_ROOT=<value> | Path | - | Only neccessary for LAMMPS plugin mode. The path to the [LAMMPS source code](install-lammps.md). LAMMPS 8Apr2021 or later is supported. If not assigned, the plugin mode will not be enabled. | diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 5f5528de3e..323bf2d7c0 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -11,7 +11,7 @@ target_include_directories( if(USE_CUDA_TOOLKIT) add_definitions("-DGOOGLE_CUDA") - add_subdirectory(src/cuda) + add_subdirectory(src/gpu) set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) target_link_libraries(${libname} INTERFACE deepmd_dyn_cudart ${EXTRA_LIBS}) # gpu_cuda.h @@ -22,7 +22,7 @@ endif() if(USE_ROCM_TOOLKIT) add_definitions("-DTENSORFLOW_USE_ROCM") - add_subdirectory(src/rocm) + add_subdirectory(src/gpu) set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm) target_link_libraries(${libname} INTERFACE ${ROCM_LIBRARIES} ${EXTRA_LIBS}) # gpu_rocm.h diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index bf8c325b14..73dfed1404 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -8,6 +8,13 @@ #include "errors.h" +#define gpuGetLastError cudaGetLastError +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemset cudaMemset + #define GPU_MAX_NBOR_SIZE 4096 #define DPErrcheck(res) \ { DPAssert((res), __FILE__, __LINE__); } diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index 4c3c1b41a9..3a65a57b01 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -11,6 +11,13 @@ #define GPU_MAX_NBOR_SIZE 4096 +#define gpuGetLastError hipGetLastError +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemset hipMemset + #define DPErrcheck(res) \ { DPAssert((res), __FILE__, __LINE__); } inline void DPAssert(hipError_t code, diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt deleted file mode 100644 index 1d5ae690e1..0000000000 --- a/source/lib/src/cuda/CMakeLists.txt +++ /dev/null @@ -1,60 +0,0 @@ -# required cmake version 3.23: CMAKE_CUDA_ARCHITECTURES all -cmake_minimum_required(VERSION 3.23) -# project name -project(deepmd_op_cuda) - -set(CMAKE_CUDA_ARCHITECTURES all) -enable_language(CUDA) -set(CMAKE_CUDA_STANDARD 11) -add_compile_definitions( - "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>") - -find_package(CUDAToolkit REQUIRED) - -# take dynamic open cudart library replace of static one so it's not required -# when using CPUs -add_subdirectory(cudart) - -# nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true -# -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu -# -L/usr/local/cuda/lib64 -lcudadevrt very important here! Include path to cub. -# for searching device compute capability, -# https://developer.nvidia.com/cuda-gpus - -# cub has been included in CUDA Toolkit 11, we do not need to include it any -# more see https://github.com/NVIDIA/cub -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11") - include_directories(cub) -endif() - -message(STATUS "NVCC version is " ${CMAKE_CUDA_COMPILER_VERSION}) - -# arch will be configured by CMAKE_CUDA_ARCHITECTURES -set(CMAKE_CUDA_FLAGS - "${CMAKE_CUDA_FLAGS} -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT" -) - -file(GLOB SOURCE_FILES "*.cu") - -add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) -target_link_libraries(deepmd_op_cuda PRIVATE deepmd_dyn_cudart) -target_include_directories( - deepmd_op_cuda - PUBLIC $ - $) -target_precompile_headers(deepmd_op_cuda PUBLIC [["device.h"]]) -if(APPLE) - set_target_properties(deepmd_op_cuda PROPERTIES INSTALL_RPATH @loader_path) -else() - set_target_properties(deepmd_op_cuda PROPERTIES INSTALL_RPATH "$ORIGIN") -endif() - -if(BUILD_CPP_IF AND NOT BUILD_PY_IF) - install( - TARGETS deepmd_op_cuda - EXPORT ${CMAKE_PROJECT_NAME}Targets - DESTINATION lib/) -endif(BUILD_CPP_IF AND NOT BUILD_PY_IF) -if(BUILD_PY_IF) - install(TARGETS deepmd_op_cuda DESTINATION deepmd/lib/) -endif(BUILD_PY_IF) diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt new file mode 100644 index 0000000000..25223c82bf --- /dev/null +++ b/source/lib/src/gpu/CMakeLists.txt @@ -0,0 +1,95 @@ +if(USE_CUDA_TOOLKIT) + # required cmake version 3.23: CMAKE_CUDA_ARCHITECTURES all + cmake_minimum_required(VERSION 3.23) + # project name + project(deepmd_op_cuda) + set(GPU_LIB_NAME deepmd_op_cuda) + + set(CMAKE_CUDA_ARCHITECTURES all) + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 11) + add_compile_definitions( + "$<$:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>") + + find_package(CUDAToolkit REQUIRED) + + # take dynamic open cudart library replace of static one so it's not required + # when using CPUs + add_subdirectory(cudart) + + # nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true + # -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler + # -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt very important here! + # Include path to cub. for searching device compute capability, + # https://developer.nvidia.com/cuda-gpus + + # cub has been included in CUDA Toolkit 11, we do not need to include it any + # more see https://github.com/NVIDIA/cub + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11") + include_directories(cub) + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "9") + message(FATAL_ERROR "CUDA version must be >= 9.0") + endif() + + message(STATUS "NVCC version is " ${CMAKE_CUDA_COMPILER_VERSION}) + + # arch will be configured by CMAKE_CUDA_ARCHITECTURES + set(CMAKE_CUDA_FLAGS + "${CMAKE_CUDA_FLAGS} -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT" + ) + + file(GLOB SOURCE_FILES "*.cu") + + add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES}) + target_link_libraries(${GPU_LIB_NAME} PRIVATE deepmd_dyn_cudart) + +elseif(USE_ROCM_TOOLKIT) + + # required cmake version + cmake_minimum_required(VERSION 3.21) + # project name + project(deepmd_op_rocm) + set(GPU_LIB_NAME deepmd_op_rocm) + set(CMAKE_LINK_WHAT_YOU_USE TRUE) + + # set c++ version c++11 + set(CMAKE_CXX_STANDARD 14) + set(CMAKE_HIP_STANDARD 14) + add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") + add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") + + message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) + + set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS} + )# --amdgpu-target=gfx906 + if(HIP_VERSION VERSION_LESS 3.5.1) + set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS}) + endif() + + file(GLOB SOURCE_FILES "*.cu") + + hip_add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES}) + +endif() + +target_include_directories( + ${GPU_LIB_NAME} + PUBLIC $ + $) +target_precompile_headers(${GPU_LIB_NAME} PUBLIC [["device.h"]]) +if(APPLE) + set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH @loader_path) +else() + set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN") +endif() + +if(BUILD_CPP_IF AND NOT BUILD_PY_IF) + install( + TARGETS ${GPU_LIB_NAME} + EXPORT ${CMAKE_PROJECT_NAME}Targets + DESTINATION lib/) +endif(BUILD_CPP_IF AND NOT BUILD_PY_IF) +if(BUILD_PY_IF) + install(TARGETS ${GPU_LIB_NAME} DESTINATION deepmd/lib/) +endif(BUILD_PY_IF) diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/gpu/coord.cu similarity index 93% rename from source/lib/src/cuda/coord.cu rename to source/lib/src/gpu/coord.cu index d37e5de9cf..52ec9ff09d 100644 --- a/source/lib/src/cuda/coord.cu +++ b/source/lib/src/gpu/coord.cu @@ -266,21 +266,21 @@ void compute_int_data(int *int_data, _fill_idx_cellmap<<>>(idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end, ext_stt, ext_end, nloc); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int nblock_loc_cellnum = (loc_cellnum + TPB - 1) / TPB; _fill_loc_cellnum_map<<>>( temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int nblock_total_cellnum = (total_cellnum + TPB - 1) / TPB; _fill_total_cellnum_map<<>>( total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } void build_loc_clist(int *int_data, @@ -297,8 +297,8 @@ void build_loc_clist(int *int_data, total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1; _build_loc_clist<<>>(loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -326,8 +326,8 @@ void copy_coord(FPTYPE *out_c, cell_shift_map, sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } namespace deepmd { @@ -335,14 +335,14 @@ template void normalize_coord_gpu(FPTYPE *coord, const int natom, const Region ®ion) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const FPTYPE *boxt = region.boxt; const FPTYPE *rec_boxt = region.rec_boxt; const int nblock = (natom + TPB - 1) / TPB; normalize_one<<>>(coord, boxt, rec_boxt, natom); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } // int_data(temp cuda @@ -362,16 +362,17 @@ int copy_coord_gpu(FPTYPE *out_c, const int &total_cellnum, const int *cell_info, const Region ®ion) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int *int_data_cpu = new int [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum + 1]; // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map - DPErrcheck(cudaMemcpy(int_data_cpu, int_data + 3 * nloc, - sizeof(int) * (loc_cellnum + 2 * total_cellnum), - cudaMemcpyDeviceToHost)); + DPErrcheck(gpuMemcpy(int_data_cpu, int_data + 3 * nloc, + sizeof(int) * (loc_cellnum + 2 * total_cellnum), + gpuMemcpyDeviceToHost)); + DPErrcheck(gpuGetLastError()); int *loc_cellnum_map = int_data_cpu; int *total_cellnum_map = loc_cellnum_map + loc_cellnum; int *mask_cellnum_map = total_cellnum_map + total_cellnum; @@ -397,11 +398,12 @@ int copy_coord_gpu(FPTYPE *out_c, // size of the output arrays is not large enough return 1; } else { - DPErrcheck(cudaMemcpy(int_data + nloc * 3 + loc_cellnum + - total_cellnum * 3 + total_cellnum * 3, - sec_loc_cellnum_map, - sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1), - cudaMemcpyHostToDevice)); + DPErrcheck(gpuMemcpy(int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 + + total_cellnum * 3, + sec_loc_cellnum_map, + sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1), + gpuMemcpyHostToDevice)); + DPErrcheck(gpuGetLastError()); delete[] int_data_cpu; build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall, diff --git a/source/lib/src/cuda/cub b/source/lib/src/gpu/cub similarity index 100% rename from source/lib/src/cuda/cub rename to source/lib/src/gpu/cub diff --git a/source/lib/src/cuda/cudart/CMakeLists.txt b/source/lib/src/gpu/cudart/CMakeLists.txt similarity index 100% rename from source/lib/src/cuda/cudart/CMakeLists.txt rename to source/lib/src/gpu/cudart/CMakeLists.txt diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_0.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_10_0.inc rename to source/lib/src/gpu/cudart/cuda_runtime_10_0.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_1.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_1.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_10_1.inc rename to source/lib/src/gpu/cudart/cuda_runtime_10_1.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_2.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_2.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_10_2.inc rename to source/lib/src/gpu/cudart/cuda_runtime_10_2.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_0.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_11_0.inc rename to source/lib/src/gpu/cudart/cuda_runtime_11_0.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_2.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_2.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_11_2.inc rename to source/lib/src/gpu/cudart/cuda_runtime_11_2.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_8.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_8.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_11_8.inc rename to source/lib/src/gpu/cudart/cuda_runtime_11_8.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_12_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_12_0.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_12_0.inc rename to source/lib/src/gpu/cudart/cuda_runtime_12_0.inc diff --git a/source/lib/src/cuda/cudart/cuda_runtime_9_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_9_0.inc similarity index 100% rename from source/lib/src/cuda/cudart/cuda_runtime_9_0.inc rename to source/lib/src/gpu/cudart/cuda_runtime_9_0.inc diff --git a/source/lib/src/cuda/cudart/cudart_stub.cc b/source/lib/src/gpu/cudart/cudart_stub.cc similarity index 100% rename from source/lib/src/cuda/cudart/cudart_stub.cc rename to source/lib/src/gpu/cudart/cudart_stub.cc diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/gpu/gelu.cu similarity index 90% rename from source/lib/src/cuda/gelu.cu rename to source/lib/src/gpu/gelu.cu index 823a843b2a..ac6020ea7a 100644 --- a/source/lib/src/cuda/gelu.cu +++ b/source/lib/src/gpu/gelu.cu @@ -32,7 +32,7 @@ __global__ void gelu_grad(FPTYPE* out, (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx])); out[idx] = dy[idx] * ((FPTYPE)0.5 * SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) * - ((FPTYPE)0.134145 * xx[idx] * xx[idx] + 1) + + ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) + (FPTYPE)0.5 * var + (FPTYPE)0.5); } @@ -67,14 +67,14 @@ void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) { if (size <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int THREAD_ITEMS = 1024; const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(out, xx, size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -85,14 +85,14 @@ void gelu_grad_gpu(FPTYPE* out, if (size <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int THREAD_ITEMS = 1024; const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(out, xx, dy, size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -104,14 +104,14 @@ void gelu_grad_grad_gpu(FPTYPE* out, if (size <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int THREAD_ITEMS = 1024; const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(out, xx, dy, dy_2, size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void gelu_gpu(float* out, const float* x, const int_64 size); diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu similarity index 89% rename from source/lib/src/cuda/neighbor_list.cu rename to source/lib/src/gpu/neighbor_list.cu index 7cac07690b..fc4e784915 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/gpu/neighbor_list.cu @@ -1,4 +1,11 @@ +#if GOOGLE_CUDA #include +#elif TENSORFLOW_USE_ROCM +#include +namespace cub = hipcub; +#else +#error "should not touch here" +#endif #include "device.h" #include "neighbor_list.h" @@ -187,13 +194,13 @@ int build_nlist_gpu(InputNlist &nlist, if (mem_size < nall) { return 1; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int nblock = (nall + TPB - 1) / TPB; int *ilist = nlist.ilist; int *numneigh = nlist.numneigh; int **firstneigh = nlist.firstneigh; - DPErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); int *temp_nlist = nlist_data; // nloc*mem_size int *nei_order = temp_nlist + nloc * mem_size; nlist.inum = nloc; @@ -203,19 +210,19 @@ int build_nlist_gpu(InputNlist &nlist, dim3 thread_grid(1, TPB); build_nlist<<>>(ilist, temp_nlist, c_cpy, rcut2, nloc, nall, mem_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); parallel_prefix_scan <<>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); fill_nlist<<>>(firstneigh, temp_nlist, nei_order, mem_size, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); int *numneigh_host = new int[nloc]; - DPErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, - cudaMemcpyDeviceToHost)); + DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, + gpuMemcpyDeviceToHost)); int max_nei = 0; for (int ii = 0; ii < nloc; ii++) { if (numneigh_host[ii] > max_nei) { @@ -231,14 +238,14 @@ void use_nlist_map(int *nlist, const int *nlist_map, const int nloc, const int nnei) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); int nblock = (nnei + TPB - 1) / TPB; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); map_nlist<<>>(nlist, nlist_map, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } void use_nei_info_gpu(int *nlist, @@ -250,13 +257,13 @@ void use_nei_info_gpu(int *nlist, const int nnei, const int ntypes, const bool b_nlist_map) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); int nblock = (nnei + TPB - 1) / TPB; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); - DPErrcheck(cudaMemset(ntype, 0, sizeof(int) * nloc * nnei)); - DPErrcheck(cudaMemset(nmask, 0, sizeof(bool) * nloc * nnei)); + DPErrcheck(gpuMemset(ntype, 0, sizeof(int) * nloc * nnei)); + DPErrcheck(gpuMemset(nmask, 0, sizeof(bool) * nloc * nnei)); if (b_nlist_map) { map_nei_info<<>>(nlist, ntype, nmask, type, nlist_map, nloc, nnei, ntypes); @@ -264,8 +271,8 @@ void use_nei_info_gpu(int *nlist, map_nei_info_noconvert<<>>( nlist, ntype, nmask, type, nloc, nnei, ntypes); } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template int build_nlist_gpu(InputNlist &nlist, @@ -295,12 +302,12 @@ __global__ void map_filter_ftype(int *ftype_out, } void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); int nblock = (nloc + TPB - 1) / TPB; map_filter_ftype<<>>(ftype_out, ftype_in, nloc); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } } // namespace deepmd diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/gpu/prod_env_mat.cu similarity index 93% rename from source/lib/src/cuda/prod_env_mat.cu rename to source/lib/src/gpu/prod_env_mat.cu index e603b25db7..a69e014272 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/gpu/prod_env_mat.cu @@ -1,6 +1,13 @@ +#if GOOGLE_CUDA #include #include #include +#elif TENSORFLOW_USE_ROCM +#include +namespace cub = hipcub; +#else +#error "should not touch here" +#endif #include "device.h" #include "fmt_nlist.h" @@ -83,7 +90,13 @@ __device__ inline uint_64 encoding_nbor_info(const int type, // the index of nbor atom(including ghost region) must be smaller than // 16777216(1 << 24) if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) { +#if GOOGLE_CUDA asm("trap;"); +#elif TENSORFLOW_USE_ROCM + __builtin_trap(); +#else +#error "should not touch here" +#endif } return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + @@ -222,16 +235,16 @@ void format_nbor_list_256(uint_64* key, format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 4; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>>(key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -250,16 +263,16 @@ void format_nbor_list_512(uint_64* key, format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 4; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>>(key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -278,16 +291,16 @@ void format_nbor_list_1024(uint_64* key, format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>>(key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -306,16 +319,16 @@ void format_nbor_list_2048(uint_64* key, format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>>(key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -334,16 +347,16 @@ void format_nbor_list_4096(uint_64* key, format_nlist_fill_a<<>>( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ITEMS_PER_THREAD = 16; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>>(key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -376,9 +389,9 @@ __global__ void compute_env_mat_a(FPTYPE* em, const int idx_value = ii * 4; // 4 components const int idx_deriv = ii * 12; // 4 components time 3 directions if (row_nlist[ii] >= 0) { - FPTYPE rr[3] = {0}; - FPTYPE dd[4] = {0}; - FPTYPE vv[12] = {0}; + FPTYPE rr[3] = {(FPTYPE)0.}; + FPTYPE dd[4] = {(FPTYPE)0.}; + FPTYPE vv[12] = {(FPTYPE)0.}; const int j_idx = row_nlist[ii]; for (int kk = 0; kk < 3; kk++) { rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; @@ -569,8 +582,8 @@ void format_nbor_list_gpu(int* nlist, const int nall, const float rcut, const std::vector sec) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int LEN = 256; const int nnei = sec.back(); const int nblock = (nloc + LEN - 1) / LEN; @@ -581,15 +594,15 @@ void format_nbor_list_gpu(int* nlist, assert(max_nbor_size == 256 || max_nbor_size == 512 || max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei)); - DPErrcheck(cudaMemset(key, 0xffffffff, - sizeof(uint_64) * int_64(nloc) * max_nbor_size)); - DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), - cudaMemcpyHostToDevice)); + DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei)); + DPErrcheck(gpuMemset(key, 0xffffffff, + sizeof(uint_64) * int_64(nloc) * max_nbor_size)); + DPErrcheck(gpuMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), + gpuMemcpyHostToDevice)); get_i_idx<<>>(i_idx, nloc, gpu_inlist.ilist); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); if (max_nbor_size == 256) { format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx); @@ -608,8 +621,8 @@ void format_nbor_list_gpu(int* nlist, format_nlist_fill_b<<>>( nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -631,27 +644,27 @@ void prod_env_mat_a_gpu(FPTYPE* em, const float rcut_smth, const std::vector sec, const int* f_type) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); if (f_type == NULL) { f_type = type; } const int nnei = sec.back(); const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); + DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); DPErrcheck( - cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); - DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); + gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); + DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); - nborErrcheck(cudaGetLastError()); - nborErrcheck(cudaDeviceSynchronize()); + nborErrcheck(gpuGetLastError()); + nborErrcheck(gpuDeviceSynchronize()); compute_env_mat_a<<>>( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -672,24 +685,24 @@ void prod_env_mat_r_gpu(FPTYPE* em, const float rcut, const float rcut_smth, const std::vector sec) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int nnei = sec.back(); const int ndescrpt = nnei * 1; - DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); + DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); DPErrcheck( - cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); - DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); + gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); + DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); - nborErrcheck(cudaGetLastError()); - nborErrcheck(cudaDeviceSynchronize()); + nborErrcheck(gpuGetLastError()); + nborErrcheck(gpuDeviceSynchronize()); compute_env_mat_r<<>>( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -703,8 +716,8 @@ void test_encoding_decoding_nbor_info_gpu(uint_64* key, const int nblock = (size_of_array + TPB - 1) / TPB; encoding_decoding_nbor_info<<>>( key, out_type, out_index, in_type, in_dist, in_index, size_of_array); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void prod_env_mat_a_gpu(float* em, diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/gpu/prod_force.cu similarity index 91% rename from source/lib/src/cuda/prod_force.cu rename to source/lib/src/gpu/prod_force.cu index d85de26394..7b1359b3b0 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/gpu/prod_force.cu @@ -12,7 +12,7 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force, int_64 bid = blockIdx.x; unsigned int tid = threadIdx.x; for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { - data[ii] = 0.f; + data[ii] = (FPTYPE)0.; } for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { for (int jj = 0; jj < 3; jj++) { @@ -64,7 +64,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, if (j_idx < 0) { return; } - FPTYPE force_tmp = 0.f; + FPTYPE force_tmp = (FPTYPE)0.; for (int idw = 0; idw < 4; ++idw) { force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; @@ -110,15 +110,15 @@ void prod_force_a_gpu(FPTYPE* force, const int nall, const int nnei, const int nframes) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); + DPErrcheck(gpuMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); force_deriv_wrt_center_atom<<>>( force, net_deriv, in_deriv, ndescrpt, nloc, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -126,8 +126,8 @@ void prod_force_a_gpu(FPTYPE* force, dim3 thread_grid(LEN, 3); force_deriv_wrt_neighbors_a<<>>( force, net_deriv, in_deriv, nlist, nloc, nall, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -139,15 +139,15 @@ void prod_force_r_gpu(FPTYPE* force, const int nall, const int nnei, const int nframes) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ndescrpt = nnei * 1; - DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); + DPErrcheck(gpuMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); force_deriv_wrt_center_atom<<>>( force, net_deriv, in_deriv, ndescrpt, nloc, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -155,8 +155,8 @@ void prod_force_r_gpu(FPTYPE* force, dim3 thread_grid(LEN, 3); force_deriv_wrt_neighbors_r<<>>( force, net_deriv, in_deriv, nlist, nloc, nall, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void prod_force_a_gpu(float* force, diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/gpu/prod_force_grad.cu similarity index 91% rename from source/lib/src/cuda/prod_force_grad.cu rename to source/lib/src/gpu/prod_force_grad.cu index b54676586c..c784d6ba65 100644 --- a/source/lib/src/cuda/prod_force_grad.cu +++ b/source/lib/src/gpu/prod_force_grad.cu @@ -88,18 +88,18 @@ void prod_force_grad_a_gpu(FPTYPE* grad_net, const int nloc, const int nnei, const int nframes) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ndescrpt = nnei * 4; DPErrcheck( - cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); + gpuMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(TPB, 1); force_grad_wrt_center_atom<<>>(grad_net, grad, env_deriv, ndescrpt); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nframes * nloc + LEN - 1) / LEN; @@ -107,8 +107,8 @@ void prod_force_grad_a_gpu(FPTYPE* grad_net, dim3 thread_grid_(LEN, 4); force_grad_wrt_neighbors_a<<>>( grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -119,18 +119,18 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net, const int nloc, const int nnei, const int nframes) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ndescrpt = nnei * 1; DPErrcheck( - cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); + gpuMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(TPB, 1); force_grad_wrt_center_atom<<>>(grad_net, grad, env_deriv, ndescrpt); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nframes * nloc + LEN - 1) / LEN; @@ -138,8 +138,8 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net, dim3 thread_grid_(LEN, 1); force_grad_wrt_neighbors_r<<>>( grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void prod_force_grad_a_gpu(float* grad_net, diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/gpu/prod_virial.cu similarity index 91% rename from source/lib/src/cuda/prod_virial.cu rename to source/lib/src/gpu/prod_virial.cu index e96bacf1d3..ab9c5326e3 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/gpu/prod_virial.cu @@ -113,10 +113,10 @@ void prod_virial_a_gpu(FPTYPE* virial, const int nloc, const int nall, const int nnei) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(virial, 0, sizeof(FPTYPE) * 9)); + DPErrcheck(gpuMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; int nblock = (nnei + LEN - 1) / LEN; @@ -125,12 +125,12 @@ void prod_virial_a_gpu(FPTYPE* virial, // compute virial of a frame virial_deriv_wrt_neighbors_a<<>>( virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); // reduction atom_virial to virial atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -143,10 +143,10 @@ void prod_virial_r_gpu(FPTYPE* virial, const int nloc, const int nall, const int nnei) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(virial, 0, sizeof(FPTYPE) * 9)); + DPErrcheck(gpuMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; int nblock = (nnei + LEN - 1) / LEN; @@ -155,12 +155,12 @@ void prod_virial_r_gpu(FPTYPE* virial, // compute virial of a frame virial_deriv_wrt_neighbors_r<<>>( virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); // reduction atom_virial to virial atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void prod_virial_a_gpu(float* virial, diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/gpu/prod_virial_grad.cu similarity index 92% rename from source/lib/src/cuda/prod_virial_grad.cu rename to source/lib/src/gpu/prod_virial_grad.cu index 047d8ae17f..dac5b20ba8 100644 --- a/source/lib/src/cuda/prod_virial_grad.cu +++ b/source/lib/src/gpu/prod_virial_grad.cu @@ -92,18 +92,18 @@ void prod_virial_grad_a_gpu(FPTYPE* grad_net, const int* nlist, const int nloc, const int nnei) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(gpuMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; const int nblock = (nloc + LEN - 1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 4); virial_grad_wrt_neighbors_a<<>>( grad_net, grad, env_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -114,18 +114,18 @@ void prod_virial_grad_r_gpu(FPTYPE* grad_net, const int* nlist, const int nloc, const int nnei) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); const int ndescrpt = nnei; - DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(gpuMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; const int nblock = (nloc + LEN - 1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 1); virial_grad_wrt_neighbors_r<<>>( grad_net, grad, env_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void prod_virial_grad_a_gpu(float* grad_net, diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/gpu/region.cu similarity index 83% rename from source/lib/src/cuda/region.cu rename to source/lib/src/gpu/region.cu index eb8d191a8c..849eecfc3e 100644 --- a/source/lib/src/cuda/region.cu +++ b/source/lib/src/gpu/region.cu @@ -27,31 +27,31 @@ template void convert_to_inter_gpu(FPTYPE *ri, const Region ®ion, const FPTYPE *rp) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void convert_to_phys_gpu(FPTYPE *rp, const Region ®ion, const FPTYPE *ri) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); _inter2Phys<<<1, 1>>>(rp, ri, region.boxt); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void volume_gpu(FPTYPE *volume, const Region ®ion) { - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); _compute_volume<<<1, 1>>>(volume, region.boxt); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void convert_to_inter_gpu(float *ri, diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/gpu/tabulate.cu similarity index 86% rename from source/lib/src/cuda/tabulate.cu rename to source/lib/src/gpu/tabulate.cu index 30695a6e05..f424006940 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/gpu/tabulate.cu @@ -4,9 +4,49 @@ #define MM 4 #define KK 4 #define TPB 256 +#if GOOGLE_CUDA #define WARP_SIZE 32 +#elif TENSORFLOW_USE_ROCM +// See https://github.com/pytorch/pytorch/pull/64302 +#define WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) +#else +#error "should not touch here" +#endif #define FULL_MASK 0xffffffff +#if GOOGLE_CUDA +#define GPU_DYNAMIC_SHARED_MEM_DECL(TYPE, NAME) extern __shared__ TYPE NAME[] +#elif TENSORFLOW_USE_ROCM +#define GPU_DYNAMIC_SHARED_MEM_DECL(TYPE, NAME) HIP_DYNAMIC_SHARED(TYPE, NAME) +#else +#error "should not touch here" +#endif + +// Copyright 2017 The TensorFlow Authors. +// Licensed under the Apache License, Version 2.0 +template +__device__ T +GpuShuffleSync(unsigned mask, T value, int src_lane, int width = warpSize) { +#if GOOGLE_CUDA + return __shfl_sync(mask, value, src_lane, width); +#elif TENSORFLOW_USE_ROCM + return __shfl(value, src_lane, width); +#else +#error "should not touch here" +#endif +} + +__device__ void GpuSyncThreads() { +#if GOOGLE_CUDA + __syncwarp(); +#elif TENSORFLOW_USE_ROCM + //__syncwarp();->syncwrap + __syncthreads(); +#else +#error "should not touch here" +#endif +} + template __forceinline__ __device__ void locate_xx_se_a(FPTYPE& xx, int& table_idx, @@ -110,8 +150,14 @@ __forceinline__ __device__ FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) { template __forceinline__ __device__ void warp_reduce(FPTYPE& val) { - for (int offset = 16; offset > 0; offset >>= 1) { + for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) { +#if GOOGLE_CUDA val += __shfl_down_sync(FULL_MASK, val, offset); +#elif TENSORFLOW_USE_ROCM + val += __shfl_down(val, offset); // ########???? +#else +#error "should not touch here" +#endif } } @@ -131,13 +177,25 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( const int last_layer_size, const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; +#if TENSORFLOW_USE_ROCM + GPU_DYNAMIC_SHARED_MEM_DECL(int, _data) +#endif const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); + FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); bool unloop = false; int breakpoint = nnei - 1; - +#if GOOGLE_CUDA FPTYPE sum[MTILE] = {(FPTYPE)0.}; +#elif TENSORFLOW_USE_ROCM + FPTYPE* iteratorC = (FPTYPE*)&_data[0]; + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.; + } + __syncthreads(); +#else +#error "should not touch here" +#endif int mark_table_idx = -1; FPTYPE var[6]; for (int ii = 0; ii < nnei; ii++) { @@ -163,8 +221,15 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( } for (int kk = 0; kk < MTILE; kk++) { - sum[kk] += (nnei - breakpoint) * - em[block_idx * nnei * MTILE + ii * MTILE + kk] * res; +#if GOOGLE_CUDA + sum[kk] +#elif TENSORFLOW_USE_ROCM + iteratorC[kk * last_layer_size + thread_idx] +#else +#error "should not touch here" +#endif + += (nnei - breakpoint) * + em[block_idx * nnei * MTILE + ii * MTILE + kk] * res; } if (unloop) { break; @@ -173,7 +238,14 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial( } for (int ii = 0; ii < MTILE; ii++) { out[block_idx * MTILE * last_layer_size + ii * last_layer_size + - thread_idx] = sum[ii]; + thread_idx] = +#if GOOGLE_CUDA + sum[ii]; +#elif TENSORFLOW_USE_ROCM + iteratorC[ii * last_layer_size + thread_idx]; +#else +#error "should not touch here" +#endif } } @@ -195,10 +267,10 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( const int last_layer_size, const bool is_sorted) { bool enable_se_atten = two_embed != nullptr; - extern __shared__ int _data[]; + GPU_DYNAMIC_SHARED_MEM_DECL(int, _data); const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0); + int warp_idx = GpuShuffleSync(0xffffffff, threadIdx.x / WARP_SIZE, 0); int lane_idx = threadIdx.x % WARP_SIZE; int breakpoint = nnei - 1; bool unloop = false; @@ -210,7 +282,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( } } __syncthreads(); - FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); + FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); for (int ii = warp_idx; ii < nnei; ii += KTILE) { FPTYPE xx = em_x[block_idx * nnei + ii]; if (ago == xx && is_sorted) { @@ -252,12 +324,14 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( res += reg_em[3] * iteratorA[3 * last_layer_size + jj]; Csub += (nnei - breakpoint) * - (var[1] + (2 * var[2] + - (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * + (var[1] + ((FPTYPE)2. * var[2] + + ((FPTYPE)3. * var[3] + + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * + xx) * xx) * (enable_se_atten ? res * t + res : res); } - __syncwarp(); + GpuSyncThreads(); for (int kk = 0; kk < MTILE; kk++) { warp_reduce(sum[kk]); } @@ -290,10 +364,10 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( const int nnei, const int last_layer_size, const bool is_sorted) { - extern __shared__ int _data[]; + GPU_DYNAMIC_SHARED_MEM_DECL(int, _data); const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); + FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); bool unloop = false; int breakpoint = nnei - 1; FPTYPE* iteratorC = (FPTYPE*)&_data[0]; @@ -323,9 +397,11 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; FPTYPE res_grad = - var[1] + - (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * - xx; + var[1] + ((FPTYPE)2. * var[2] + + ((FPTYPE)3. * var[3] + + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * + xx) * + xx; for (int kk = 0; kk < MTILE; kk++) { int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; @@ -403,10 +479,10 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( const int nnei_i, const int nnei_j, const int last_layer_size) { - extern __shared__ int _data[]; + GPU_DYNAMIC_SHARED_MEM_DECL(int, _data); const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0); + int warp_idx = GpuShuffleSync(0xffffffff, threadIdx.x / WARP_SIZE, 0); int lane_idx = threadIdx.x % WARP_SIZE; FPTYPE* iteratorA = (FPTYPE*)&_data[0]; // dy for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) { @@ -440,7 +516,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( xx) * xx); } - __syncwarp(); + GpuSyncThreads(); warp_reduce(sum); warp_reduce(Csub); if (lane_idx == 0) { @@ -551,10 +627,9 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial( const FPTYPE stride1, const int nnei, const int last_layer_size) { - extern __shared__ int _data[]; const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ - int warp_idx = __shfl_sync(0xffffffff, thread_idx / WARP_SIZE, 0); + int warp_idx = GpuShuffleSync(0xffffffff, thread_idx / WARP_SIZE, 0); int lane_idx = thread_idx % WARP_SIZE; __syncthreads(); for (int ii = warp_idx; ii < nnei; ii += KTILE) { @@ -568,12 +643,14 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial( for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { load_polynomial_params(var, table, table_idx, jj, last_layer_size); Csub += - (var[1] + (2 * var[2] + - (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * + (var[1] + ((FPTYPE)2. * var[2] + + ((FPTYPE)3. * var[3] + + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * + xx) * xx) * dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj]; } - __syncwarp(); + GpuSyncThreads(); warp_reduce(Csub); if (lane_idx == 0) { @@ -595,10 +672,13 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial( const FPTYPE stride1, const int nnei, const int last_layer_size) { - extern __shared__ int _data[]; const int_64 block_idx = blockIdx.x; // nloc const int thread_idx = threadIdx.x; // last_layer_size +#if TENSORFLOW_USE_ROCM + __syncthreads(); +#endif + int mark_table_idx = -1; FPTYPE var[6]; for (int ii = 0; ii < nnei; ii++) { @@ -610,9 +690,11 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial( last_layer_size); } FPTYPE res_grad = - var[1] + - (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * - xx; + var[1] + ((FPTYPE)2. * var[2] + + ((FPTYPE)3. * var[3] + + ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * + xx) * + xx; mark_table_idx = table_idx; dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size + thread_idx] = dz_dy_dem[block_idx * nnei + ii] * res_grad; @@ -634,15 +716,21 @@ void tabulate_fusion_se_a_gpu(FPTYPE* out, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); tabulate_fusion_se_a_fifth_order_polynomial - <<>>(out, table, em_x, em, two_embed, - table_info[0], table_info[1], table_info[2], - table_info[3], table_info[4], nnei, - last_layer_size, is_sorted); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); +#if GOOGLE_CUDA + <<>> +#elif TENSORFLOW_USE_ROCM + <<>> +#else +#error "should not touch here" +#endif + (out, table, em_x, em, two_embed, table_info[0], table_info[1], + table_info[2], table_info[3], table_info[4], nnei, last_layer_size, + is_sorted); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -661,18 +749,18 @@ void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei)); - DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei)); + DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4)); tabulate_fusion_se_a_grad_fifth_order_polynomial <<>>( dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size, is_sorted); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -690,16 +778,16 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); tabulate_fusion_se_a_grad_grad_fifth_order_polynomial <<>>( dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size, is_sorted); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -715,14 +803,14 @@ void tabulate_fusion_se_t_gpu(FPTYPE* out, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); tabulate_fusion_se_t_fifth_order_polynomial <<>>( out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -740,18 +828,18 @@ void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); - DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); + DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); tabulate_fusion_se_t_grad_fifth_order_polynomial <<>>( dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -769,17 +857,17 @@ void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size)); tabulate_fusion_se_t_grad_grad_fifth_order_polynomial <<>>(dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -793,14 +881,14 @@ void tabulate_fusion_se_r_gpu(FPTYPE* out, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); tabulate_fusion_se_r_fifth_order_polynomial <<>>(out, table, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -815,16 +903,16 @@ void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei)); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); + DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei)); tabulate_fusion_se_r_grad_fifth_order_polynomial <<>>( dy_dem, table, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template @@ -839,16 +927,16 @@ void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, if (nloc <= 0) { return; } - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); DPErrcheck( - cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size)); + gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size)); tabulate_fusion_se_r_grad_grad_fifth_order_polynomial <<>>( dz_dy, table, em, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); + DPErrcheck(gpuGetLastError()); + DPErrcheck(gpuDeviceSynchronize()); } template void tabulate_fusion_se_a_gpu(float* out, diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt deleted file mode 100644 index 1b093977b6..0000000000 --- a/source/lib/src/rocm/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -# required cmake version -cmake_minimum_required(VERSION 3.21) -# project name -project(deepmd_op_rocm) -set(CMAKE_LINK_WHAT_YOU_USE TRUE) - -# set c++ version c++11 -set(CMAKE_CXX_STANDARD 14) -set(CMAKE_HIP_STANDARD 14) -add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") -add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT") - -message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR}) - -set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS} -)# --amdgpu-target=gfx906 -if(HIP_VERSION VERSION_LESS 3.5.1) - set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS}) -endif() - -file(GLOB SOURCE_FILES "*.hip.cu") - -hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES}) -target_include_directories( - deepmd_op_rocm - PUBLIC $ - $) -target_precompile_headers(deepmd_op_rocm PUBLIC [["device.h"]]) - -install(TARGETS deepmd_op_rocm DESTINATION lib/) -if(BUILD_CPP_IF) - install( - TARGETS deepmd_op_rocm - EXPORT ${CMAKE_PROJECT_NAME}Targets - DESTINATION lib/) -endif(BUILD_CPP_IF) -if(BUILD_PY_IF) - install(TARGETS deepmd_op_rocm DESTINATION deepmd/lib/) -endif(BUILD_PY_IF) diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu deleted file mode 100644 index 5416022575..0000000000 --- a/source/lib/src/rocm/coord.hip.cu +++ /dev/null @@ -1,444 +0,0 @@ -#include "coord.h" -#include "device.h" -#include "region.cuh" - -__device__ inline int collapse_index(const int *idx, const int *size) { - return (idx[0] * size[1] + idx[1]) * size[2] + idx[2]; -} -__device__ inline void index_recover(const int in_idx, - const int *size, - int *idx) { - idx[2] = in_idx % size[2]; - idx[1] = int(in_idx / size[2]) % size[1]; - idx[0] = int(int(in_idx / size[2]) / size[1]); -} -__device__ inline void idx_addshift(int *idx, const int *shift) { - for (int dd = 0; dd < 3; dd++) { - idx[dd] += shift[dd]; - } -} -__device__ inline void idx_unshift(int *idx, const int *shift) { - for (int dd = 0; dd < 3; dd++) { - idx[dd] -= shift[dd]; - } -} -__device__ inline int compute_pbc_shift(int idx, int ncell) { - int shift = 0; - if (idx < 0) { - shift = 1; - while (idx + shift * ncell < 0) { - shift++; - } - } else if (idx >= ncell) { - shift = -1; - while (idx + shift * ncell >= ncell) { - shift--; - } - } - return shift; -} - -__device__ inline double _fmod(double x, double y) { return fmod(x, y); } -__device__ inline float _fmod(float x, float y) { return fmodf(x, y); } - -template -__global__ void normalize_one(FPTYPE *out_c, - const FPTYPE *boxt, - const FPTYPE *rec_boxt, - const int nall) { - // <<>> - int idy = blockIdx.x * blockDim.x + threadIdx.x; - if (idy >= nall) { - return; - } - FPTYPE inter[3]; - phys2Inter(inter, out_c + idy * 3, rec_boxt); - for (int dd = 0; dd < 3; ++dd) { - inter[dd] = _fmod(inter[dd], (FPTYPE)1.); - if (inter[dd] < (FPTYPE)0.) { - inter[dd] += (FPTYPE)1.; - } - } - inter2Phys(out_c + idy * 3, inter, boxt); -} - -template -__global__ void _fill_idx_cellmap(int *idx_cellmap, - int *idx_cellmap_noshift, - const FPTYPE *in_c, - const FPTYPE *rec_boxt, - const int *nat_stt, - const int *nat_end, - const int *ext_stt, - const int *ext_end, - const int nloc) { - int idy = blockIdx.x * blockDim.x + threadIdx.x; - int ext_ncell[3]; - int global_grid[3]; - int idx_orig_shift[3]; - FPTYPE cell_size[3]; - FPTYPE nat_orig[3]; - for (int dd = 0; dd < 3; ++dd) { - ext_ncell[dd] = ext_end[dd] - ext_stt[dd]; - global_grid[dd] = nat_end[dd] - nat_stt[dd]; - idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; - cell_size[dd] = (FPTYPE)1. / global_grid[dd]; - nat_orig[dd] = nat_stt[dd] * cell_size[dd]; - } - if (idy < nloc) { - int idx_noshift[3]; - int idx[3]; - FPTYPE inter[3]; - phys2Inter(inter, in_c + idy * 3, rec_boxt); - for (int dd = 0; dd < 3; ++dd) { - idx_noshift[dd] = (inter[dd] - nat_orig[dd]) / cell_size[dd]; - if (inter[dd] - nat_orig[dd] < 0.) { - idx_noshift[dd]--; - } - if (idx_noshift[dd] < nat_stt[dd]) { - idx_noshift[dd] = nat_stt[dd]; - } else if (idx_noshift[dd] >= nat_end[dd]) { - idx_noshift[dd] = nat_end[dd] - 1; - } - idx[dd] = idx_noshift[dd] + idx_orig_shift[dd]; - } - idx_cellmap_noshift[idy] = collapse_index(idx_noshift, global_grid); - idx_cellmap[idy] = collapse_index(idx, ext_ncell); - } -} - -__global__ void _fill_loc_cellnum_map(int *temp_idx_order, - int *loc_cellnum_map, - const int *idx_cellmap_noshift, - const int nloc, - const int loc_cellnum) { - int idy = blockIdx.x * blockDim.x + threadIdx.x; - if (idy < loc_cellnum) { - int num = 0; - for (int ii = 0; ii < nloc; ii++) { - if (idx_cellmap_noshift[ii] == idy) { - temp_idx_order[ii] = num; - num++; - } - } - loc_cellnum_map[idy] = num; - } -} - -__global__ void _fill_total_cellnum_map(int *total_cellnum_map, - int *mask_cellnum_map, - int *cell_map, - int *cell_shift_map, - const int *nat_stt, - const int *nat_end, - const int *ext_stt, - const int *ext_end, - const int *loc_cellnum_map, - const int total_cellnum) { - int idy = blockIdx.x * blockDim.x + threadIdx.x; - int ext_ncell[3]; - int global_grid[3]; - int idx_orig_shift[3]; - for (int dd = 0; dd < 3; ++dd) { - ext_ncell[dd] = ext_end[dd] - ext_stt[dd]; - global_grid[dd] = nat_end[dd] - nat_stt[dd]; - idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; - } - if (idy < total_cellnum) { - int *shift = cell_shift_map + idy * 3; - int idx[3]; - index_recover(idy, ext_ncell, idx); - idx_unshift(idx, idx_orig_shift); - shift[0] = compute_pbc_shift(idx[0], global_grid[0]); - shift[1] = compute_pbc_shift(idx[1], global_grid[1]); - shift[2] = compute_pbc_shift(idx[2], global_grid[2]); - bool loc = false; - if (shift[0] == 0 && shift[1] == 0 && shift[2] == 0) { - loc = true; - } - for (int dd = 0; dd < 3; dd++) { - idx[dd] += shift[dd] * global_grid[dd]; - } - int orig_idy = collapse_index(idx, global_grid); - mask_cellnum_map[idy] = loc_cellnum_map[orig_idy]; - total_cellnum_map[idy] = mask_cellnum_map[idy]; - if (loc) { - mask_cellnum_map[idy] = 0; - } - cell_map[idy] = orig_idy; - } -} - -__global__ void _build_loc_clist(int *clist, - const int *idx_cellmap, - const int *idx_order, - const int *sec_num_map, - const int nloc) { - int idy = blockIdx.x * blockDim.x + threadIdx.x; - if (idy >= nloc) { - return; - } - int cell_idx = idx_cellmap[idy]; - int *clist_row = clist + sec_num_map[cell_idx]; - clist_row[idx_order[idy]] = idy; -} - -template -__global__ void _copy_coord(FPTYPE *out_c, - int *out_t, - int *mapping, - const FPTYPE *in_c, - const int *in_t, - const int *cell_map, - const int *cell_shift_map, - const int *sec_loc_cellnum_map, - const int *sec_total_cellnum_map, - const int *loc_clist, - const int nloc, - const int nall, - const int total_cellnum, - const FPTYPE *boxt, - const FPTYPE *rec_boxt) { - int idy = blockIdx.x * blockDim.x + threadIdx.x; - if (idy >= nall) { - return; - } - if (idy < nloc) { - mapping[idy] = idy; - out_t[idy] = in_t[idy]; - for (int dd = 0; dd < 3; dd++) { - out_c[idy * 3 + dd] = in_c[idy * 3 + dd]; - } - } else { - int cell_idx = 0; - int atom_idx = 0; - int orig_cell_idx = 0; - int orig_idy = 0; - int shift[3]; - FPTYPE d_shift[3]; - for (int ii = 0; ii < total_cellnum; ii++) { - if (idy >= sec_total_cellnum_map[ii + 1]) { - cell_idx++; - } else { - break; - } - } - for (int dd = 0; dd < 3; dd++) { - shift[dd] = cell_shift_map[cell_idx * 3 + dd]; - d_shift[dd] = shift[dd]; - } - atom_idx = idy - sec_total_cellnum_map[cell_idx]; - orig_cell_idx = cell_map[cell_idx]; - orig_idy = loc_clist[sec_loc_cellnum_map[orig_cell_idx] + atom_idx]; - mapping[idy] = orig_idy; - out_t[idy] = in_t[orig_idy]; - FPTYPE shift_v[3]; - inter2Phys(shift_v, d_shift, boxt); - for (int dd = 0; dd < 3; dd++) { - out_c[idy * 3 + dd] = in_c[orig_idy * 3 + dd] - shift_v[dd]; - } - } -} - -template -void compute_int_data(int *int_data, - const FPTYPE *in_c, - const int *cell_info, - const deepmd::Region ®ion, - const int nloc, - const int loc_cellnum, - const int total_cellnum) { - int *idx_cellmap = int_data; - int *idx_cellmap_noshift = idx_cellmap + nloc; - int *temp_idx_order = idx_cellmap_noshift + nloc; - int *loc_cellnum_map = temp_idx_order + nloc; - int *total_cellnum_map = loc_cellnum_map + loc_cellnum; - int *mask_cellnum_map = total_cellnum_map + total_cellnum; - int *cell_map = mask_cellnum_map + total_cellnum; - int *cell_shift_map = cell_map + total_cellnum; - const int *nat_stt = cell_info; - const int *nat_end = cell_info + 3; - const int *ext_stt = cell_info + 6; - const int *ext_end = cell_info + 9; - const FPTYPE *rec_boxt = region.rec_boxt; - - const int nblock_loc = (nloc + TPB - 1) / TPB; - hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap, - idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end, - ext_stt, ext_end, nloc); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - - const int nblock_loc_cellnum = (loc_cellnum + TPB - 1) / TPB; - hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0, - temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, - loc_cellnum); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - - const int nblock_total_cellnum = (total_cellnum + TPB - 1) / TPB; - hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0, - total_cellnum_map, mask_cellnum_map, cell_map, - cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, - loc_cellnum_map, total_cellnum); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -void build_loc_clist(int *int_data, - const int nloc, - const int loc_cellnum, - const int total_cellnum) { - const int nblock = (nloc + TPB - 1) / TPB; - const int *idx_cellmap_noshift = int_data + nloc; - const int *temp_idx_order = idx_cellmap_noshift + nloc; - const int *sec_loc_cellnum_map = temp_idx_order + nloc + loc_cellnum + - 2 * total_cellnum + total_cellnum + - 3 * total_cellnum; - int *loc_clist = int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 + - total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1; - hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist, - idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, - nloc); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void copy_coord(FPTYPE *out_c, - int *out_t, - int *mapping, - const int *int_data, - const FPTYPE *in_c, - const int *in_t, - const int nloc, - const int nall, - const int loc_cellnum, - const int total_cellnum, - const deepmd::Region ®ion) { - const int nblock = (nall + TPB - 1) / TPB; - const int *cell_map = int_data + 3 * nloc + loc_cellnum + 2 * total_cellnum; - const int *cell_shift_map = cell_map + total_cellnum; - const int *sec_loc_cellnum_map = cell_shift_map + 3 * total_cellnum; - const int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1; - const int *loc_clist = sec_total_cellnum_map + total_cellnum + 1; - - const FPTYPE *boxt = region.boxt; - const FPTYPE *rec_boxt = region.rec_boxt; - hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping, - in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map, - sec_total_cellnum_map, loc_clist, nloc, nall, - total_cellnum, boxt, rec_boxt); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -namespace deepmd { -template -void normalize_coord_gpu(FPTYPE *coord, - const int natom, - const Region ®ion) { - const FPTYPE *boxt = region.boxt; - const FPTYPE *rec_boxt = region.rec_boxt; - const int nblock = (natom + TPB - 1) / TPB; - hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt, - natom); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -int copy_coord_gpu(FPTYPE *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const FPTYPE *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion) { - compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, - total_cellnum); - int *int_data_cpu = new int - [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum + - 1]; // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map - DPErrcheck(hipMemcpy(int_data_cpu, int_data + 3 * nloc, - sizeof(int) * (loc_cellnum + 2 * total_cellnum), - hipMemcpyDeviceToHost)); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - int *loc_cellnum_map = int_data_cpu; - int *total_cellnum_map = loc_cellnum_map + loc_cellnum; - int *mask_cellnum_map = total_cellnum_map + total_cellnum; - int *sec_loc_cellnum_map = mask_cellnum_map + total_cellnum; - int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1; - sec_loc_cellnum_map[0] = 0; - sec_total_cellnum_map[0] = nloc; - int max_cell = 0; - for (int iii = 0; iii < total_cellnum; iii++) { - if (max_cell < total_cellnum_map[iii]) { - max_cell = total_cellnum_map[iii]; - } - if (iii < loc_cellnum) { - sec_loc_cellnum_map[iii + 1] = - sec_loc_cellnum_map[iii] + loc_cellnum_map[iii]; - } - sec_total_cellnum_map[iii + 1] = - sec_total_cellnum_map[iii] + mask_cellnum_map[iii]; - } - *nall = sec_total_cellnum_map[total_cellnum]; - if (*nall > mem_nall) { - delete[] int_data_cpu; - // size of the output arrays is not large enough - return 1; - } else { - DPErrcheck(hipMemcpy(int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 + - total_cellnum * 3, - sec_loc_cellnum_map, - sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1), - hipMemcpyHostToDevice)); - delete[] int_data_cpu; - build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); - copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall, - loc_cellnum, total_cellnum, region); - } - return 0; -} - -template void normalize_coord_gpu(float *coord, - const int natom, - const Region ®ion); -template void normalize_coord_gpu(double *coord, - const int natom, - const Region ®ion); -template int copy_coord_gpu(float *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const float *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion); -template int copy_coord_gpu(double *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const double *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion); -} // namespace deepmd diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu deleted file mode 100644 index 76657eea52..0000000000 --- a/source/lib/src/rocm/gelu.hip.cu +++ /dev/null @@ -1,134 +0,0 @@ -#include "device.h" -#include "gelu.h" - -__device__ inline double _tanh(double x) { return tanh(x); } -__device__ inline float _tanh(float x) { return tanhf(x); } - -template -__global__ void gelu(FPTYPE* out, const FPTYPE* xx, const int_64 size) { - const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x; - if (idx >= size) { - return; - } - out[idx] = xx[idx] * (FPTYPE)0.5 * - ((FPTYPE)1.0 + - _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] * - xx[idx] * xx[idx]))); -} - -template -__global__ void gelu_grad(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const int_64 size) { - const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x; - if (idx >= size) { - return; - } - // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * - // xx[idx] * xx[idx] *xx[idx]))); - const FPTYPE var = - _tanh((FPTYPE)SQRT_2_PI * - (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx])); - out[idx] = - dy[idx] * - ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) * - ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) + - (FPTYPE)0.5 * var + (FPTYPE)0.5); -} - -template -__global__ void gelu_grad_grad(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size) { - const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x; - if (idx >= size) { - return; - } - // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 * - // xx[idx] * xx[idx] *xx[idx]))); - const FPTYPE var1 = - _tanh((FPTYPE)SQRT_2_PI * - (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx])); - const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) * - ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.); - out[idx] = dy[idx] * dy_2[idx] * - ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] * - ((FPTYPE)1. - var1 * var1) - - (FPTYPE)SQRT_2_PI * xx[idx] * var2 * - ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 + - var2); -} - -namespace deepmd { -template -void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) { - if (size <= 0) { - return; - } - const int THREAD_ITEMS = 1024; - const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; - - hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void gelu_grad_gpu(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const int_64 size) { - if (size <= 0) { - return; - } - const int THREAD_ITEMS = 1024; - const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; - - hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, - size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void gelu_grad_grad_gpu(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size) { - if (size <= 0) { - return; - } - const int THREAD_ITEMS = 1024; - const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; - - hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, - dy, dy_2, size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void gelu_gpu(float* out, const float* x, const int_64 size); -template void gelu_gpu(double* out, const double* x, const int_64 size); -template void gelu_grad_gpu(float* out, - const float* x, - const float* dy, - const int_64 size); -template void gelu_grad_gpu(double* out, - const double* x, - const double* dy, - const int_64 size); -template void gelu_grad_grad_gpu(float* out, - const float* x, - const float* dy, - const float* dy_2, - const int_64 size); -template void gelu_grad_grad_gpu(double* out, - const double* x, - const double* dy, - const double* dy_2, - const int_64 size); -} // namespace deepmd diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu deleted file mode 100644 index 736f2f9e9a..0000000000 --- a/source/lib/src/rocm/neighbor_list.hip.cu +++ /dev/null @@ -1,296 +0,0 @@ -#include "device.h" -#include "hipcub/hipcub.hpp" -#include "neighbor_list.h" -// A stateful callback functor that maintains a running prefix to be applied -// during consecutive scan operations. -struct parallel_prefix_scan_op { - // Running prefix - int running_total; - // Constructor - __device__ parallel_prefix_scan_op(int running_total) - : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. - __device__ int operator()(int block_aggregate) { - int old_prefix = running_total; - running_total += block_aggregate; - return old_prefix; - } -}; - -template -__global__ void parallel_prefix_scan(int *numneigh, - int *nei_order, - const int *temp_nlist, - const int mem_size, - const int nloc, - const int nall) { - // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 - // threads, 4 ints per thread - typedef hipcub::BlockScan BlockScan; - // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - __shared__ typename BlockScan::TempStorage temp_storage; - - // Initialize running total - parallel_prefix_scan_op prefix_op(0); - - // Have the block iterate over segments of items - for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) { - int block_offset = blockIdx.x * mem_size; - // Load a segment of consecutive items that are blocked across threads - int i_data = temp_nlist[block_offset + ii]; - int o_data = i_data == -1 ? 0 : 1; - - // Collectively compute the block-wide exclusive prefix sum - BlockScan(temp_storage).ExclusiveSum(o_data, o_data, prefix_op); - - __syncthreads(); - // Store scanned items to output segment - if (i_data != -1) { - nei_order[block_offset + ii] = o_data; - } - // Store numneigh into the output array - if (ii == nall - 1) { - o_data += i_data == -1 ? 0 : 1; - numneigh[blockIdx.x] = o_data; - } - } -} - -template -__device__ inline FPTYPE dev_dot(FPTYPE *arr1, FPTYPE *arr2) { - return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; -} - -template -__global__ void build_nlist(int *ilist, - int *temp_nlist, - const FPTYPE *c_cpy, - const FPTYPE rcut2, - const int nloc, - const int nall, - const int mem_size) { - const unsigned int atom_idx = blockIdx.x; - const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; - if (neighbor_idx < nall) { - int *neighbor_row = temp_nlist + atom_idx * mem_size; - if (neighbor_idx == atom_idx) { - ilist[atom_idx] = atom_idx; - } else { - const FPTYPE *ccoord = c_cpy + atom_idx * 3; - const FPTYPE *ncoord = c_cpy + neighbor_idx * 3; - FPTYPE diff[3]; - for (int kk = 0; kk < 3; kk++) { - diff[kk] = ccoord[kk] - ncoord[kk]; - } - FPTYPE r2 = dev_dot(diff, diff); - if (r2 < rcut2) { - neighbor_row[neighbor_idx] = neighbor_idx; - } - } - } -} - -__global__ void fill_nlist(int **firstneigh, - const int *temp_nlist, - const int *nei_order, - const int mem_size, - const int nall) { - const unsigned int atom_idx = blockIdx.x; - const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; - if (neighbor_idx < nall) { - const int *in_row = temp_nlist + atom_idx * mem_size; - int *out_row = firstneigh[atom_idx]; - int nei = in_row[neighbor_idx]; - if (nei != -1) { - out_row[nei_order[atom_idx * mem_size + neighbor_idx]] = nei; - } - } -} - -__global__ void map_nlist(int *nlist, - const int *nlist_map, - const int nloc, - const int nnei) { - int atom_idx = blockIdx.x; - int nei_idx = blockIdx.y * blockDim.y + threadIdx.y; - if (nei_idx >= nnei) { - return; - } - int nlist_idx = atom_idx * nnei + nei_idx; - int nlist_item = nlist[nlist_idx]; - if (nlist_item != -1) { - nlist[nlist_idx] = nlist_map[nlist_item]; - } -} - -__global__ void map_nei_info(int *nlist, - int *ntype, - bool *nmask, - const int *type, - const int *nlist_map, - const int nloc, - const int nnei, - const int ntypes) { - int atom_idx = blockIdx.x; - int nei_idx = blockIdx.y * blockDim.y + threadIdx.y; - if (nei_idx >= nnei) { - return; - } - int nlist_idx = atom_idx * nnei + nei_idx; - int nlist_item = nlist[nlist_idx]; - int temp = 0; - if (nlist_item != -1) { - temp = nlist_map[nlist_item]; - nlist[nlist_idx] = temp; - ntype[nlist_idx] = type[temp]; - nmask[nlist_idx] = true; - } else { - ntype[nlist_idx] = ntypes; - } -} - -__global__ void map_nei_info_noconvert(int *nlist, - int *ntype, - bool *nmask, - const int *type, - const int nloc, - const int nnei, - const int ntypes) { - int atom_idx = blockIdx.x; - int nei_idx = blockIdx.y * blockDim.y + threadIdx.y; - if (nei_idx >= nnei) { - return; - } - int nlist_idx = atom_idx * nnei + nei_idx; - int nlist_item = nlist[nlist_idx]; - if (nlist_item != -1) { - ntype[nlist_idx] = type[nlist_item]; - nmask[nlist_idx] = true; - } else { - ntype[nlist_idx] = ntypes; - } -} - -namespace deepmd { -template -int build_nlist_gpu(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const FPTYPE *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut) { - if (mem_size < nall) { - return 1; - } - const int nblock = (nall + TPB - 1) / TPB; - int *ilist = nlist.ilist; - int *numneigh = nlist.numneigh; - int **firstneigh = nlist.firstneigh; - DPErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); - int *temp_nlist = nlist_data; // nloc*mem_size - int *nei_order = temp_nlist + nloc * mem_size; - nlist.inum = nloc; - FPTYPE rcut2 = rcut * rcut; - - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, TPB); - hipLaunchKernelGGL(build_nlist, block_grid, thread_grid, 0, 0, ilist, - temp_nlist, c_cpy, rcut2, nloc, nall, mem_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - hipLaunchKernelGGL(HIP_KERNEL_NAME(parallel_prefix_scan), nloc, TPB, 0, - 0, numneigh, nei_order, temp_nlist, mem_size, nloc, nall); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, firstneigh, - temp_nlist, nei_order, mem_size, nall); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - int *numneigh_host = new int[nloc]; - DPErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, - hipMemcpyDeviceToHost)); - int max_nei = 0; - for (int ii = 0; ii < nloc; ii++) { - if (numneigh_host[ii] > max_nei) { - max_nei = numneigh_host[ii]; - } - } - *max_list_size = max_nei; - delete[] numneigh_host; - return 0; -} - -void use_nlist_map(int *nlist, - const int *nlist_map, - const int nloc, - const int nnei) { - int nblock = (nnei + TPB - 1) / TPB; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, TPB); - hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map, - nloc, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -void use_nei_info_gpu(int *nlist, - int *ntype, - bool *nmask, - const int *type, - const int *nlist_map, - const int nloc, - const int nnei, - const int ntypes, - const bool b_nlist_map) { - int nblock = (nnei + TPB - 1) / TPB; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, TPB); - DPErrcheck(hipMemset(ntype, 0, sizeof(int) * nloc * nnei)); - DPErrcheck(hipMemset(nmask, 0, sizeof(bool) * nloc * nnei)); - if (b_nlist_map) { - hipLaunchKernelGGL(map_nei_info, block_grid, thread_grid, 0, 0, nlist, - ntype, nmask, type, nlist_map, nloc, nnei, ntypes); - } else { - hipLaunchKernelGGL(map_nei_info_noconvert, block_grid, thread_grid, 0, 0, - nlist, ntype, nmask, type, nloc, nnei, ntypes); - } - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template int build_nlist_gpu(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const float *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut); -template int build_nlist_gpu(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const double *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut); -__global__ void map_filter_ftype(int *ftype_out, - const int *ftype_in, - const int nloc) { - int ii = blockIdx.x * blockDim.x + threadIdx.x; - if (ii < nloc) { - ftype_out[ii] = ftype_in[ii] >= 0 ? 0 : -1; - } -} - -void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) { - int nblock = (nloc + TPB - 1) / TPB; - map_filter_ftype<<>>(ftype_out, ftype_in, nloc); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} -} // namespace deepmd diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu deleted file mode 100644 index 23e8ce1d0e..0000000000 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ /dev/null @@ -1,821 +0,0 @@ -#include "device.h" -#include "fmt_nlist.h" -#include "hipcub/hipcub.hpp" -#include "prod_env_mat.h" - -__device__ inline double _sqrt(double x) { return sqrt(x); } -__device__ inline float _sqrt(float x) { return sqrtf(x); } -__device__ inline double _rsqrt(double x) { return rsqrt(x); } -__device__ inline float _rsqrt(float x) { return rsqrtf(x); } - -// common part of prod_env_mat -template -__launch_bounds__(BLOCK_THREADS) __global__ - void BlockSortKernel(Key* d_in, - Key* d_out) // Tile of output -{ - enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; - // Specialize BlockLoad type for our thread block (uses warp-striped loads for - // coalescing, then transposes in shared memory to a blocked arrangement) - typedef hipcub::BlockLoad - BlockLoadT; - // Specialize BlockRadixSort type for our thread block - typedef hipcub::BlockRadixSort - BlockRadixSortT; - // Shared memory - __shared__ union TempStorage { - typename BlockLoadT::TempStorage load; - typename BlockRadixSortT::TempStorage sort; - } temp_storage; - // Per-thread tile items - Key items[ITEMS_PER_THREAD]; - // Our current block's offset - int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE; - // Load items into a blocked arrangement - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); - // Barrier for smem reuse - __syncthreads(); - // Sort keys - BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); - // Store output in striped fashion - hipcub::StoreDirectStriped(threadIdx.x, d_out + block_offset, - items); -} - -template -__device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) { - return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; -} - -template -__device__ inline void spline5_switch( - FPTYPE& vv, FPTYPE& dd, FPTYPE& xx, const float& rmin, const float& rmax) { - if (xx < rmin) { - dd = (FPTYPE)0.; - vv = (FPTYPE)1.; - } else if (xx < rmax) { - FPTYPE uu = (xx - rmin) / (rmax - rmin); - FPTYPE du = (FPTYPE)1. / (rmax - rmin); - vv = uu * uu * uu * - ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) + - (FPTYPE)1.; - dd = ((FPTYPE)3. * uu * uu * - ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) + - uu * uu * uu * ((FPTYPE)-12. * uu + (FPTYPE)15.)) * - du; - } else { - dd = (FPTYPE)0.; - vv = (FPTYPE)0.; - } -} - -template -__device__ inline uint_64 encoding_nbor_info(const int type, - const FPTYPE dist, - const int index) { - // nbor info checking: - // the type of nbor atom must be smaller than 128 - // the distance of center atom between nbor atom must be smaller than 128 - // the index of nbor atom(including ghost region) must be smaller than - // 16777216(1 << 24) - if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) { - __builtin_trap(); - } - return ((uint_64)type << 57) + - (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + - index; -} - -__device__ inline void decoding_nbor_info(int& type, - int& index, - const uint_64 key) { - type = key >> 57; - index = key & 0xFFFFFF; -} - -template -__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) { - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= nloc) { - return; - } - i_idx[ilist[idx]] = idx; -} - -template -__global__ void format_nlist_fill_a(uint_64* key, - const FPTYPE* coord, - const int* type, - const int* numneigh, - int** firstneigh, - const float rcut, - int* i_idx, - const int MAX_NBOR_SIZE) { - // <<>> - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - - const int nsize = numneigh[i_idx[idx]]; - if (idy >= nsize) { - return; - } - - const int* nei_idx = firstneigh[i_idx[idx]]; - // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); - uint_64* key_in = key + idx * MAX_NBOR_SIZE; - FPTYPE diff[3]; - const int& j_idx = nei_idx[idy]; - if (type[j_idx] < 0) { - return; - } - for (int dd = 0; dd < 3; dd++) { - diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; - } - FPTYPE rr = _sqrt(dev_dot(diff, diff)); - if (rr <= rcut) { - key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); - } -} - -template -__global__ void fill_nei_iter(int* nei_iter_dev, - const FPTYPE* key, - const int nloc, - const int max_nbor_size, - const int sec_size) { - int_64 row = blockIdx.x; - int col = blockIdx.y * blockDim.x + threadIdx.x; - const FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size; - int nei_type_cur = -1, nbor_idx_cur = 0; - int nei_type_pre = -1, nbor_idx_pre = 0; - if (col < max_nbor_size && key_out[col] != key_out[max_nbor_size - 1]) { - if (col >= 1) { - decoding_nbor_info(nei_type_pre, nbor_idx_pre, key_out[col - 1]); - } - decoding_nbor_info(nei_type_cur, nbor_idx_cur, key_out[col]); - } - if (nei_type_cur != nei_type_pre) { - nei_iter_dev[row * sec_size + nei_type_cur] = col; - } -} - -template -__global__ void format_nlist_fill_b(int* nlist, - const int nlist_size, - const int nloc, - FPTYPE* key, - const int* sec, - const int sec_size, - int* nei_iter_dev, - const int max_nbor_size) { - int_64 row = blockIdx.x; - int col = blockIdx.y * blockDim.x + threadIdx.x; - int* nei_iter = nei_iter_dev + row * sec_size; - FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size; - int* row_nlist = nlist + row * nlist_size; - if (col < max_nbor_size) { - if (key_out[col] != key_out[max_nbor_size - 1]) { - int nei_type = 0, nbor_idx = 0; - decoding_nbor_info(nei_type, nbor_idx, key_out[col]); - int out_indx = col - nei_iter[nei_type] + sec[nei_type]; - if (out_indx < sec[nei_type + 1]) { - row_nlist[out_indx] = nbor_idx; - } - } - } -} - -template -__global__ void encoding_decoding_nbor_info(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array) { - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= size_of_array) { - return; - } - - key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]); - decoding_nbor_info(out_type[idx], out_index[idx], key[idx]); -} - -template -void format_nbor_list_256(uint_64* key, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - const int& nloc, - const float& rcut, - int* i_idx) { - const int LEN = 256; - const int MAX_NBOR_SIZE = 256; - const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, - coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, - rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; - // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - BlockSortKernel), - nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void format_nbor_list_512(uint_64* key, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - const int& nloc, - const float& rcut, - int* i_idx) { - const int LEN = 256; - const int MAX_NBOR_SIZE = 512; - const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, - coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, - rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; - // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - BlockSortKernel), - nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void format_nbor_list_1024(uint_64* key, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - const int& nloc, - const float& rcut, - int* i_idx) { - const int LEN = 256; - const int MAX_NBOR_SIZE = 1024; - const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, - coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, - rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - const int ITEMS_PER_THREAD = 8; - const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; - // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - BlockSortKernel), - nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void format_nbor_list_2048(uint_64* key, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - const int& nloc, - const float& rcut, - int* i_idx) { - const int LEN = 256; - const int MAX_NBOR_SIZE = 2048; - const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, - coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, - rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - const int ITEMS_PER_THREAD = 8; - const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; - // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - BlockSortKernel), - nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void format_nbor_list_4096(uint_64* key, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - const int& nloc, - const float& rcut, - int* i_idx) { - const int LEN = 256; - const int MAX_NBOR_SIZE = 4096; - const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, - coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, - rcut, i_idx, MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - const int ITEMS_PER_THREAD = 16; - const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; - // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - BlockSortKernel), - nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -__global__ void compute_env_mat_a(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - const FPTYPE* coord, - const FPTYPE* avg, - const FPTYPE* std, - const int* type, - const int* nlist, - const int nnei, - const float rmin, - const float rmax) { - // <<>> - const int_64 bid = blockIdx.x; - const unsigned int tid = threadIdx.x; - if (type[bid] < 0) { - return; - } - if (tid >= nnei) { - return; - } - const int ndescrpt = nnei * 4; - const int* row_nlist = nlist + bid * nnei; - FPTYPE* row_rij = rij + bid * nnei * 3; - FPTYPE* row_descript = em + bid * nnei * 4; - FPTYPE* row_descript_deriv = em_deriv + bid * nnei * 12; - for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { - const int idx_value = ii * 4; // 4 components - const int idx_deriv = ii * 12; // 4 components time 3 directions - if (row_nlist[ii] >= 0) { - FPTYPE rr[3] = {0}; - FPTYPE dd[4] = {0}; - FPTYPE vv[12] = {0}; - const int j_idx = row_nlist[ii]; - for (int kk = 0; kk < 3; kk++) { - rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; - row_rij[ii * 3 + kk] = rr[kk]; - } - // const FPTYPE * rr = &row_rij[ii * 3]; - FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = _rsqrt(nr2); - FPTYPE nr = nr2 * inr; - FPTYPE inr2 = inr * inr; - FPTYPE inr4 = inr2 * inr2; - FPTYPE inr3 = inr4 * nr; - FPTYPE sw, dsw; - spline5_switch(sw, dsw, nr, rmin, rmax); - dd[0] = ((FPTYPE)1. / nr); //* sw; - dd[1] = (rr[0] / nr2); //* sw; - dd[2] = (rr[1] / nr2); //* sw; - dd[3] = (rr[2] / nr2); //* sw; - vv[0] = (rr[0] * inr3 * sw - - dd[0] * dsw * rr[0] * - inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; - vv[1] = (rr[1] * inr3 * sw - - dd[0] * dsw * rr[1] * - inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; - vv[2] = (rr[2] * inr3 * sw - - dd[0] * dsw * rr[2] * - inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // ****deriv of component x/r2 - vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw - - dd[1] * dsw * rr[0] * - inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; - vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4) * sw - - dd[1] * dsw * rr[1] * - inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; - vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4) * sw - - dd[1] * dsw * rr[2] * - inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; - // ***deriv of component y/r2 - vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4) * sw - - dd[2] * dsw * rr[0] * - inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; - vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw - - dd[2] * dsw * rr[1] * - inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; - vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4) * sw - - dd[2] * dsw * rr[2] * - inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; - // ***deriv of component z/r2 - vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4) * sw - - dd[3] * dsw * rr[0] * - inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; - vv[10] = - (((FPTYPE)2. * rr[2] * rr[1] * inr4) * sw - - dd[3] * dsw * rr[1] * - inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt - // + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; - vv[11] = - (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw - - dd[3] * dsw * rr[2] * - inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt - // + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; - // 4 value components - dd[0] *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] - // * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + - // idx_value + 0]; - dd[1] *= sw; // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] - // * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + - // idx_value + 1]; - dd[2] *= sw; // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] - // * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + - // idx_value + 2]; - dd[3] *= sw; // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] - // * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + - // idx_value + 3]; - for (int ii = 0; ii < 12; ii++) { - row_descript_deriv[idx_deriv + ii] = - vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; - } - for (int ii = 0; ii < 4; ii++) { - row_descript[idx_value + ii] = - (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / - std[type[bid] * ndescrpt + idx_value + ii]; - } - } else { - // TODO: move it to the memset. - row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / - std[type[bid] * ndescrpt + idx_value]; - } - } -} - -template -__global__ void compute_env_mat_r(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - const FPTYPE* coord, - const FPTYPE* avg, - const FPTYPE* std, - const int* type, - const int* nlist, - const int nnei, - const float rmin, - const float rmax) { - // <<>> - const int_64 bid = blockIdx.x; - const unsigned int tid = threadIdx.x; - if (tid >= nnei) { - return; - } - const int ndescrpt = nnei; - const int* row_nlist = nlist + bid * nnei; - FPTYPE* row_rij = rij + bid * nnei * 3; - FPTYPE* row_em = em + bid * nnei; - FPTYPE* row_em_deriv = em_deriv + bid * nnei * 3; - for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { - const int idx_value = ii; // 4 components - const int idx_deriv = ii * 3; // 4 components time 3 directions - if (row_nlist[ii] >= 0) { - FPTYPE rr[3] = {(FPTYPE)0.}; - FPTYPE vv[3] = {(FPTYPE)0.}; - FPTYPE dd = (FPTYPE)0.; - const int& j_idx = row_nlist[ii]; - for (int kk = 0; kk < 3; kk++) { - rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; - row_rij[ii * 3 + kk] = rr[kk]; - } - // const FPTYPE * rr = &row_rij[ii * 3]; - FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = _rsqrt(nr2); - FPTYPE nr = nr2 * inr; - FPTYPE inr2 = inr * inr; - FPTYPE inr4 = inr2 * inr2; - FPTYPE inr3 = inr4 * nr; - FPTYPE sw, dsw; - spline5_switch(sw, dsw, nr, rmin, rmax); - dd = ((FPTYPE)1. / nr); //* sw; - vv[0] = (rr[0] * inr3 * sw - - dd * dsw * rr[0] * - inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; - vv[1] = (rr[1] * inr3 * sw - - dd * dsw * rr[1] * - inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; - vv[2] = (rr[2] * inr3 * sw - - dd * dsw * rr[2] * - inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * - // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - - // 4 value components - dd *= sw; // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * - // ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + - // idx_value + 0]; - for (int ii = 0; ii < 3; ii++) { - row_em_deriv[idx_deriv + ii] = - vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; - } - row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / - std[type[bid] * ndescrpt + idx_value]; - } else { - // TODO: move it to the memset. - row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / - std[type[bid] * ndescrpt + idx_value]; - } - } -} - -namespace deepmd { -template -void format_nbor_list_gpu(int* nlist, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec) { - const int LEN = 256; - const int nnei = sec.back(); - const int nblock = (nloc + LEN - 1) / LEN; - int* sec_dev = array_int; - int* nei_iter = array_int + sec.size(); // = new int[sec_size]; - int* i_idx = array_int + sec.size() + nloc * sec.size(); - uint_64* key = array_longlong; - assert(max_nbor_size == 256 || max_nbor_size == 512 || 1024 || - max_nbor_size == 2048 || max_nbor_size == 4096); - DPErrcheck(hipMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei)); - DPErrcheck(hipMemset(key, 0xffffffff, - sizeof(uint_64) * int_64(nloc) * max_nbor_size)); - DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), - hipMemcpyHostToDevice)); - - hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, i_idx, nloc, - gpu_inlist.ilist); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - - if (max_nbor_size == 256) { - format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx); - } else if (max_nbor_size == 512) { - format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx); - } else if (max_nbor_size == 1024) { - format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx); - } else if (max_nbor_size == 2048) { - format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx); - } else if (max_nbor_size == 4096) { - format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx); - } - - hipLaunchKernelGGL(fill_nei_iter, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), - LEN, 0, 0, nei_iter, key, nloc, max_nbor_size, sec.size()); - - hipLaunchKernelGGL( - format_nlist_fill_b, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN, 0, - 0, nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void prod_env_mat_a_gpu(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - int* nlist, - const FPTYPE* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const FPTYPE* avg, - const FPTYPE* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type) { - if (f_type == NULL) { - f_type = type; - } - const int nnei = sec.back(); - const int ndescrpt = nnei * 4; - DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); - DPErrcheck( - hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); - DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); - - format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, sec); - nborErrcheck(hipGetLastError()); - nborErrcheck(hipDeviceSynchronize()); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a), nloc, TPB, - 0, 0, em, em_deriv, rij, coord, avg, std, type, nlist, - nnei, rcut_smth, rcut); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void prod_env_mat_r_gpu(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - int* nlist, - const FPTYPE* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const FPTYPE* avg, - const FPTYPE* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec) { - const int nnei = sec.back(); - const int ndescrpt = nnei * 1; - DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); - DPErrcheck( - hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); - DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); - - format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, sec); - nborErrcheck(hipGetLastError()); - nborErrcheck(hipDeviceSynchronize()); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r), nloc, TPB, - 0, 0, em, em_deriv, rij, coord, avg, std, type, nlist, - nnei, rcut_smth, rcut); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void test_encoding_decoding_nbor_info_gpu(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array) { - const int nblock = (size_of_array + TPB - 1) / TPB; - hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key, - out_type, out_index, in_type, in_dist, in_index, - size_of_array); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void prod_env_mat_a_gpu(float* em, - float* em_deriv, - float* rij, - int* nlist, - const float* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const float* avg, - const float* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type); -template void prod_env_mat_a_gpu(double* em, - double* em_deriv, - double* rij, - int* nlist, - const double* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const double* avg, - const double* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type); -template void prod_env_mat_r_gpu(float* em, - float* em_deriv, - float* rij, - int* nlist, - const float* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const float* avg, - const float* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); -template void prod_env_mat_r_gpu(double* em, - double* em_deriv, - double* rij, - int* nlist, - const double* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const double* avg, - const double* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); -template void format_nbor_list_gpu(int* nlist, - const float* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); -template void format_nbor_list_gpu(int* nlist, - const double* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); -template void test_encoding_decoding_nbor_info_gpu(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const float* in_dist, - const int* in_index, - const int size_of_array); -template void test_encoding_decoding_nbor_info_gpu(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const double* in_dist, - const int* in_index, - const int size_of_array); -} // namespace deepmd diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu deleted file mode 100644 index 5b1f91dd49..0000000000 --- a/source/lib/src/rocm/prod_force.hip.cu +++ /dev/null @@ -1,193 +0,0 @@ -#include "device.h" -#include "prod_force.h" - -template -__global__ void force_deriv_wrt_center_atom(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int ndescrpt, - const int nloc, - const int nall) { - __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; - int_64 bid = blockIdx.x; - unsigned int tid = threadIdx.x; - for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { - data[ii] = (FPTYPE)0.; - } - for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { - for (int jj = 0; jj < 3; jj++) { - data[jj * THREADS_PER_BLOCK + tid] += - net_deriv[bid * ndescrpt + ii] * - in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; - } - } - __syncthreads(); - // do reduction in shared memory - for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - for (int jj = 0; jj < 3; jj++) { - data[jj * THREADS_PER_BLOCK + tid] += - data[jj * THREADS_PER_BLOCK + tid + ii]; - } - } - __syncthreads(); - } - // write result for this block to global memory - const int_64 kk = bid / nloc; // frame index - const int_64 ll = bid % nloc; // atom index - const int_64 i_idx_nall = kk * nall + ll; - if (tid == 0) { - force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; - force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; - force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; - } -} - -template -__global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - // idy -> nnei - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 4; - if (idy >= nnei) { - return; - } - // deriv wrt neighbors - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - FPTYPE force_tmp = (FPTYPE)0.; - for (int idw = 0; idw < 4; ++idw) { - force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * - in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; - } - const int_64 kk = idx / nloc; // frame index - atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp); -} - -template -__global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - // idy -> nnei - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 1; - if (idy >= nnei) { - return; - } - // deriv wrt neighbors - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - const int_64 kk = idx / nloc; // frame index - atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, - net_deriv[idx * ndescrpt + idy] * - in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); -} - -namespace deepmd { -template -void prod_force_a_gpu(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes) { - const int ndescrpt = nnei * 4; - DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), - nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv, - ndescrpt, nloc, nall); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - - const int LEN = 64; - const int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nframes * nloc, nblock); - dim3 thread_grid(LEN, 3); - hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, - force, net_deriv, in_deriv, nlist, nloc, nall, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void prod_force_r_gpu(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes) { - const int ndescrpt = nnei * 1; - DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), - nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv, - ndescrpt, nloc, nall); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - - const int LEN = 64; - const int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nframes * nloc, nblock); - dim3 thread_grid(LEN, 3); - hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, - force, net_deriv, in_deriv, nlist, nloc, nall, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void prod_force_a_gpu(float* force, - const float* net_deriv, - const float* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_a_gpu(double* force, - const double* net_deriv, - const double* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_r_gpu(float* force, - const float* net_deriv, - const float* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_r_gpu(double* force, - const double* net_deriv, - const double* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); - -} // namespace deepmd diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu deleted file mode 100644 index 2cb7c4f1d6..0000000000 --- a/source/lib/src/rocm/prod_force_grad.hip.cu +++ /dev/null @@ -1,168 +0,0 @@ -#include "device.h" -#include "prod_force_grad.h" - -template -__device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) { - return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; -} - -template -__global__ void force_grad_wrt_center_atom(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int ndescrpt) { - __shared__ FPTYPE grad_one[3]; - int_64 center_idx = blockIdx.x; - unsigned int tid = threadIdx.x; - if (tid < 3) { - grad_one[tid] = grad[center_idx * 3 + tid]; - } - __syncthreads(); - unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid; - if (descrpt_idx < ndescrpt) { - grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot( - grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3); - } -} - -template -__global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { - // idy -> nnei - const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idy = blockIdx.y; - const unsigned int idw = threadIdx.y; - if (idx >= nframes * nloc) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - if (j_idx >= nloc) { - j_idx = j_idx % nloc; - } - const int kk = idx / nloc; // frame index - grad_net[idx * nnei * 4 + idy * 4 + idw] += - dev_dot(grad + kk * nloc * 3 + j_idx * 3, - env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3); -} - -template -__global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { - // idy -> nnei - const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idy = blockIdx.y; - if (idx >= nframes * nloc) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - if (j_idx >= nloc) { - j_idx = j_idx % nloc; - } - const int kk = idx / nloc; // frame index - grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3, - env_deriv + idx * nnei * 3 + idy * 3); -} - -namespace deepmd { -template -void prod_force_grad_a_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { - const int ndescrpt = nnei * 4; - DPErrcheck( - hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); - const int nblock = (ndescrpt + TPB - 1) / TPB; - dim3 block_grid(nframes * nloc, nblock); - dim3 thread_grid(TPB, 1); - hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, - grad_net, grad, env_deriv, ndescrpt); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - const int LEN = 128; - const int nblock_ = (nframes * nloc + LEN - 1) / LEN; - dim3 block_grid_(nblock_, nnei); - dim3 thread_grid_(LEN, 4); - hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, - 0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void prod_force_grad_r_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { - const int ndescrpt = nnei * 1; - DPErrcheck( - hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); - const int nblock = (ndescrpt + TPB - 1) / TPB; - dim3 block_grid(nframes * nloc, nblock); - dim3 thread_grid(TPB, 1); - hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, - grad_net, grad, env_deriv, ndescrpt); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - - const int LEN = 128; - const int nblock_ = (nframes * nloc + LEN - 1) / LEN; - dim3 block_grid_(nblock_, nnei); - dim3 thread_grid_(LEN, 1); - hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, - 0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void prod_force_grad_a_gpu(float* grad_net, - const float* grad, - const float* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_a_gpu(double* grad_net, - const double* grad, - const double* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_r_gpu(float* grad_net, - const float* grad, - const float* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_r_gpu(double* grad_net, - const double* grad, - const double* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -} // namespace deepmd diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu deleted file mode 100644 index ff29c07ffb..0000000000 --- a/source/lib/src/rocm/prod_virial.hip.cu +++ /dev/null @@ -1,197 +0,0 @@ -#include "device.h" -#include "prod_virial.h" - -template -__global__ void atom_virial_reduction(FPTYPE* virial, - const FPTYPE* atom_virial, - const int nall) { - unsigned int bid = blockIdx.x; - unsigned int tid = threadIdx.x; - __shared__ FPTYPE data[THREADS_PER_BLOCK]; - data[tid] = (FPTYPE)0.; - for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { - data[tid] += atom_virial[ii * 9 + bid]; - } - __syncthreads(); - // do reduction in shared memory - for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - // write result for this block to global memory - if (tid == 0) { - virial[bid] = data[0]; - } -} - -template -__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idx -> nloc - // idy -> nnei - // idz = dd0 * 3 + dd1 - // dd0 = idz / 3 - // dd1 = idz % 3 - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 4; - if (idy >= nnei) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - FPTYPE virial_tmp = (FPTYPE)0.; - for (int idw = 0; idw < 4; ++idw) { - virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * - rij[idx * nnei * 3 + idy * 3 + idz % 3] * - in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; - } - atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); -} - -template -__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idx -> nloc - // idy -> nnei - // idz = dd0 * 3 + dd1 - // dd0 = idz / 3 - // dd1 = idz % 3 - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 1; - - if (idy >= nnei) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - // atomicAdd( - // virial + idz, - // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 - // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % - // 3]); - atomicAdd(atom_virial + j_idx * 9 + idz, - net_deriv[idx * ndescrpt + idy] * - rij[idx * nnei * 3 + idy * 3 + idz % 3] * - in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); -} - -namespace deepmd { -template -void prod_virial_a_gpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - - const int LEN = 16; - int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(LEN, 9); - // compute virial of a frame - hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, - 0, virial, atom_virial, net_deriv, in_deriv, rij, nlist, - nloc, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - // reduction atom_virial to virial - hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, - TPB, 0, 0, virial, atom_virial, nall); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void prod_virial_r_gpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - - const int LEN = 16; - int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(LEN, 9); - // compute virial of a frame - hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, - 0, virial, atom_virial, net_deriv, in_deriv, rij, nlist, - nloc, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - // reduction atom_virial to virial - hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, - TPB, 0, 0, virial, atom_virial, nall); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void prod_virial_a_gpu(float* virial, - float* atom_virial, - const float* net_deriv, - const float* in_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_a_gpu(double* virial, - double* atom_virial, - const double* net_deriv, - const double* in_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_r_gpu(float* virial, - float* atom_virial, - const float* net_deriv, - const float* in_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_r_gpu(double* virial, - double* atom_virial, - const double* net_deriv, - const double* in_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -} // namespace deepmd diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu deleted file mode 100644 index d41a1689ce..0000000000 --- a/source/lib/src/rocm/prod_virial_grad.hip.cu +++ /dev/null @@ -1,154 +0,0 @@ -#include "device.h" -#include "prod_virial_grad.h" - -template -__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) { - FPTYPE result = (FPTYPE)0.0; - for (int ii = 0; ii < 9; ii++) { - result += arr1[ii] * arr2[ii]; - } - return result; -} - -template -__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idy -> nnei - const unsigned int tid = threadIdx.x; - const int_64 idx = blockIdx.x * blockDim.x + tid; - const unsigned int idy = blockIdx.y; - const unsigned int idw = threadIdx.y; - const int ndescrpt = nnei * 4; - __shared__ FPTYPE grad_one[9]; - if (tid < 9) { - grad_one[tid] = grad[tid]; - } - __syncthreads(); - if (idx >= nloc) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - FPTYPE tmp[9]; - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - tmp[dd0 * 3 + dd1] = - rij[idx * nnei * 3 + idy * 3 + dd1] * - env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; - } - } - grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp); -} - -template -__global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idy -> nnei - const unsigned int tid = threadIdx.x; - const int_64 idx = blockIdx.x * blockDim.x + tid; - const unsigned int idy = blockIdx.y; - const int ndescrpt = nnei; - __shared__ FPTYPE grad_one[9]; - if (tid < 9) { - grad_one[tid] = grad[tid]; - } - __syncthreads(); - if (idx >= nloc) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - FPTYPE tmp[9]; - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] * - env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0]; - } - } - grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); -} - -namespace deepmd { -template -void prod_virial_grad_a_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - const int ndescrpt = nnei * 4; - DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); - const int LEN = 128; - const int nblock = (nloc + LEN - 1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 4); - hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0, - grad_net, grad, env_deriv, rij, nlist, nloc, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void prod_virial_grad_r_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - const int ndescrpt = nnei; - DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); - const int LEN = 128; - const int nblock = (nloc + LEN - 1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 1); - hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0, - grad_net, grad, env_deriv, rij, nlist, nloc, nnei); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void prod_virial_grad_a_gpu(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_a_gpu(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_r_gpu(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_r_gpu(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); -} // namespace deepmd diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu deleted file mode 100644 index de67ef648c..0000000000 --- a/source/lib/src/rocm/region.hip.cu +++ /dev/null @@ -1,65 +0,0 @@ -#include "device.h" -#include "region.cuh" -#include "region.h" - -template -__global__ void _phys2Inter(FPTYPE *inter, - const FPTYPE *phys, - const FPTYPE *rec_boxt) { - phys2Inter(inter, phys, rec_boxt); -} - -template -__global__ void _inter2Phys(FPTYPE *phys, - const FPTYPE *inter, - const FPTYPE *boxt) { - inter2Phys(phys, inter, boxt); -} - -template -__global__ void _compute_volume(FPTYPE *volume, const FPTYPE *boxt) { - volume[0] = compute_volume(boxt); -} - -namespace deepmd { -// only for unittest -template -void convert_to_inter_gpu(FPTYPE *ri, - const Region ®ion, - const FPTYPE *rp) { - hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void convert_to_phys_gpu(FPTYPE *rp, - const Region ®ion, - const FPTYPE *ri) { - hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void volume_gpu(FPTYPE *volume, const Region ®ion) { - hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void convert_to_inter_gpu(float *ri, - const Region ®ion, - const float *rp); -template void convert_to_inter_gpu(double *ri, - const Region ®ion, - const double *rp); -template void convert_to_phys_gpu(float *rp, - const Region ®ion, - const float *ri); -template void convert_to_phys_gpu(double *rp, - const Region ®ion, - const double *ri); -template void volume_gpu(float *volume, const Region ®ion); -template void volume_gpu(double *volume, const Region ®ion); -} // namespace deepmd diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu deleted file mode 100644 index 88a1cbb574..0000000000 --- a/source/lib/src/rocm/tabulate.hip.cu +++ /dev/null @@ -1,1036 +0,0 @@ -#include "device.h" -#include "tabulate.h" - -#define MM 4 -#define KK 4 -#define TPB 256 -#define WARP_SIZE 64 -#define FULL_MASK 0xffffffff - -template -__forceinline__ __device__ void locate_xx(FPTYPE& xx, - int& table_idx, - const FPTYPE& lower, - const FPTYPE& upper, - const FPTYPE& max, - const FPTYPE& stride0, - const FPTYPE& stride1) { - if (xx < lower) { - table_idx = 0; - xx = (FPTYPE)0.; - } else if (xx < upper) { - table_idx = (int)((xx - lower) / stride0); - xx -= (table_idx * stride0 + lower); - } else if (xx < max) { - int first_stride = int((upper - lower) / stride0); - table_idx = first_stride + (int)((xx - upper) / stride1); - xx -= ((table_idx - first_stride) * stride1 + upper); - } else { - table_idx = - int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; - xx = (FPTYPE)0.; - } -} - -template -__forceinline__ __device__ void locate_xx_se_t(FPTYPE& xx, - int& table_idx, - const FPTYPE& lower, - const FPTYPE& upper, - const FPTYPE& min, - const FPTYPE& max, - const FPTYPE& stride0, - const FPTYPE& stride1) { - if (xx < min) { - table_idx = 0; - xx = (FPTYPE)0.; - } else if (xx < lower) { - table_idx = (int)((xx - min) / stride1); - xx -= (table_idx * stride1 + min); - } else if (xx < upper) { - int first_stride = int((lower - min) / stride1); - table_idx = first_stride + (int)((xx - lower) / stride0); - xx -= ((table_idx - first_stride) * stride0 + lower); - } else if (xx < max) { - int first_stride = - int((lower - min) / stride1) + int((upper - lower) / stride0); - table_idx = first_stride + (int)((xx - upper) / stride1); - xx -= ((table_idx - first_stride) * stride1 + upper); - } else { - table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + - (int)((max - upper) / stride1) - 1; - xx = (FPTYPE)0.; - } -} - -template -__forceinline__ __device__ FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) { - return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3]; -} - -template -__forceinline__ __device__ void warp_reduce(FPTYPE& val) { - for (int offset = 32; offset > 0; offset >>= 1) { - val += __shfl_down(val, offset); // ########???? - } -} - -template -__global__ void tabulate_fusion_se_a_fifth_order_polynomial( - FPTYPE* out, - const FPTYPE* table, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei, - const int last_layer_size, - const bool is_sorted) { - bool enable_se_atten = two_embed != nullptr; - HIP_DYNAMIC_SHARED(int, _data) - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0); - bool unloop = false; - int breakpoint = nnei - 1; - FPTYPE* iteratorC = (FPTYPE*)&_data[0]; - for (int kk = 0; kk < MTILE; kk++) { - iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.; - } - __syncthreads(); - - for (int ii = 0; ii < nnei; ii++) { - FPTYPE var[6]; - FPTYPE xx = em_x[block_idx * nnei + ii]; - if (xx == ago && is_sorted) { - unloop = true; - breakpoint = ii; - } - int table_idx = 0; - locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; - var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; - var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; - var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; - var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; - var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res = - var[0] + - (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - if (enable_se_atten) { - FPTYPE t = two_embed[block_idx * nnei * last_layer_size + - ii * last_layer_size + thread_idx]; - res = res * t + res; - } - - for (int kk = 0; kk < MTILE; kk++) { - iteratorC[kk * last_layer_size + thread_idx] += - (nnei - breakpoint) * em[block_idx * nnei * MTILE + ii * MTILE + kk] * - res; - } - if (unloop) { - break; - } - } - for (int ii = 0; ii < MTILE; ii++) { - out[block_idx * MTILE * last_layer_size + ii * last_layer_size + - thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; - } -} - -template -__global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( - FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei, - const int last_layer_size, - const bool is_sorted) { - bool enable_se_atten = two_embed != nullptr; - HIP_DYNAMIC_SHARED(int, _data) - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ - int warp_idx = __shfl(threadIdx.x / 64, 0); - int lane_idx = threadIdx.x % 64; - int breakpoint = nnei - 1; - bool unloop = false; - FPTYPE* iteratorA = (FPTYPE*)&_data[0]; // dy - for (int ii = 0; ii < MTILE; ii++) { - for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) { - iteratorA[ii * last_layer_size + jj] = - dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj]; - } - } - __syncthreads(); - FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0); - for (int ii = 0; ii < nnei - warp_idx; ii += KTILE) { - FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx]; - if (ago == xx && is_sorted) { - unloop = true; - breakpoint = ii + warp_idx; - } - - int table_idx = 0; - locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - FPTYPE sum[KTILE] = {(FPTYPE)0.}; - FPTYPE Csub = (FPTYPE)0.; - for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { - FPTYPE var[6]; - // load iteratorB through table - var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; - var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; - var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; - var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; - var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; - var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; - FPTYPE res = - var[0] + - (var[1] + - (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - FPTYPE t; - if (enable_se_atten) { - t = two_embed[block_idx * nnei * last_layer_size + - ii * last_layer_size + jj]; - res = res * t + res; - } - - for (int kk = 0; kk < KTILE; kk++) { - sum[kk] += - (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res; - } - res = em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] * - iteratorA[0 * last_layer_size + jj]; - res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * - iteratorA[1 * last_layer_size + jj]; - res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * - iteratorA[2 * last_layer_size + jj]; - res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * - iteratorA[3 * last_layer_size + jj]; - Csub += - (nnei - breakpoint) * - (var[1] + ((FPTYPE)2. * var[2] + - ((FPTYPE)3. * var[3] + - ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * - xx) * - xx) * - (enable_se_atten ? res * t + res : res); - } - //__syncwarp();->syncwrap - __syncthreads(); - for (int kk = 0; kk < KTILE; kk++) { - warp_reduce(sum[kk]); - } - warp_reduce(Csub); - if (lane_idx == 0) { - for (int kk = 0; kk < KTILE; kk++) { - dy_dem[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; - } - dy_dem_x[block_idx * nnei + ii + warp_idx] = Csub; - } - if (unloop) { - break; - } - } -} - -template -__global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( - FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei, - const int last_layer_size, - const bool is_sorted) { - extern __shared__ int _data[]; - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // last_layer_size - FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0); - bool unloop = false; - int breakpoint = nnei - 1; - FPTYPE* iteratorC = (FPTYPE*)&_data[0]; - for (int kk = 0; kk < MTILE; kk++) { - iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.; - } - __syncthreads(); - - for (int ii = 0; ii < nnei; ii++) { - FPTYPE var[6]; - FPTYPE xx = em_x[block_idx * nnei + ii]; - FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; - if (xx == ago && is_sorted) { - unloop = true; - breakpoint = ii; - } - int table_idx = 0; - locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; - var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; - var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; - var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; - var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; - var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res = - var[0] + - (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - FPTYPE res_grad = - var[1] + ((FPTYPE)2. * var[2] + - ((FPTYPE)3. * var[3] + - ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * - xx) * - xx; - - for (int kk = 0; kk < MTILE; kk++) { - int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; - iteratorC[kk * last_layer_size + thread_idx] += - (nnei - breakpoint) * - (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res); - } - if (unloop) { - break; - } - } - for (int ii = 0; ii < MTILE; ii++) { - dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + - thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; - } -} - -template -__global__ void tabulate_fusion_se_t_fifth_order_polynomial( - FPTYPE* out, - const FPTYPE* table, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { - HIP_DYNAMIC_SHARED(int, _data) - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // last_layer_size - - FPTYPE sum = (FPTYPE)0.; - for (int ii = 0; ii < nnei_i; ii++) { - for (int jj = 0; jj < nnei_j; jj++) { - FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; - FPTYPE tmp = xx; - int table_idx = 0; - locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); - FPTYPE var[6]; - var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; - var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; - var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; - var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; - var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; - var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res = - var[0] + - (var[1] + - (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - - sum += tmp * res; - } - } - out[block_idx * last_layer_size + thread_idx] = sum; -} - -template -__global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( - FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dy, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { - HIP_DYNAMIC_SHARED(int, _data) - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ - int warp_idx = __shfl(threadIdx.x / 64, 0); - int lane_idx = threadIdx.x % 64; - FPTYPE* iteratorA = (FPTYPE*)&_data[0]; // dy - for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) { - iteratorA[ii] = dy[block_idx * last_layer_size + ii]; - } - __syncthreads(); - - for (int ii = 0; ii < nnei_i; ii++) { - FPTYPE ago = - __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); - for (int jj = warp_idx; jj < nnei_j; jj += KTILE) { - FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; - FPTYPE tmp = xx; - int table_idx = 0; - locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); - FPTYPE sum = (FPTYPE)0.; - FPTYPE Csub = (FPTYPE)0.; - for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) { - FPTYPE var[6]; - // load iteratorB through table - var[0] = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; - var[1] = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; - var[2] = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; - var[3] = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; - var[4] = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; - var[5] = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - FPTYPE res = - var[0] + - (var[1] + - (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - - sum += iteratorA[kk] * res; - Csub += - iteratorA[kk] * tmp * - (var[1] + ((FPTYPE)2. * var[2] + - ((FPTYPE)3. * var[3] + - ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * - xx) * - xx); - } - __syncthreads(); - warp_reduce(sum); - warp_reduce(Csub); - if (lane_idx == 0) { - dy_dem[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = sum; - dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = Csub; - } - } - } -} - -template -__global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial( - FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // last_layer_size - - FPTYPE sum = (FPTYPE)0.; - for (int ii = 0; ii < nnei_i; ii++) { - FPTYPE ago = - __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); - for (int jj = 0; ii < nnei_j; jj++) { - FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; - FPTYPE tmp = xx; - FPTYPE dz_xx = - dz_dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; - FPTYPE dz_em = dz_dy_dem[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; - FPTYPE var[6]; - - int table_idx = 0; - locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); - var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; - var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; - var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; - var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; - var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; - var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res = - var[0] + - (var[1] + - (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - FPTYPE res_grad = - var[1] + ((FPTYPE)2. * var[2] + - ((FPTYPE)3. * var[3] + - ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * - xx) * - xx; - - sum += (tmp * res_grad * dz_xx + dz_em * res); - } - } - dz_dy[block_idx * last_layer_size + thread_idx] = sum; -} - -template -__global__ void tabulate_fusion_se_r_fifth_order_polynomial( - FPTYPE* out, - const FPTYPE* table, - const FPTYPE* em, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei, - const int last_layer_size) { - HIP_DYNAMIC_SHARED(int, _data) - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // last_layer_size - - for (int ii = 0; ii < nnei; ii++) { - FPTYPE var[6]; - FPTYPE xx = em[block_idx * nnei + ii]; - int table_idx = 0; - locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; - var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; - var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; - var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; - var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; - var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - out[block_idx * nnei * last_layer_size + ii * last_layer_size + - thread_idx] = - var[0] + - (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * - xx; - } -} - -template -__global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial( - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* em, - const FPTYPE* dy, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei, - const int last_layer_size) { - HIP_DYNAMIC_SHARED(int, _data) - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ - int warp_idx = __shfl(threadIdx.x / 64, 0); - int lane_idx = threadIdx.x % 64; - - for (int ii = 0; ii < nnei; ii += KTILE) { - FPTYPE xx = em[block_idx * nnei + ii + warp_idx]; - - int table_idx = 0; - locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - FPTYPE Csub = 0.f; - for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { - FPTYPE var[6]; - // load iteratorB through table - var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; - var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; - var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; - var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; - var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; - var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; - Csub += - (var[1] + ((FPTYPE)2. * var[2] + - ((FPTYPE)3. * var[3] + - ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * - xx) * - xx) * - dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj]; - } - //__syncwarp();->syncwrap - __syncthreads(); - warp_reduce(Csub); - if (lane_idx == 0) { - dy_dem[block_idx * nnei + ii + warp_idx] = Csub; - } - } -} - -template -__global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial( - FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const FPTYPE lower, - const FPTYPE upper, - const FPTYPE max, - const FPTYPE stride0, - const FPTYPE stride1, - const int nnei, - const int last_layer_size) { - extern __shared__ int _data[]; - const int_64 block_idx = blockIdx.x; // nloc - const int thread_idx = threadIdx.x; // last_layer_size - - __syncthreads(); - - for (int ii = 0; ii < nnei; ii++) { - FPTYPE var[6]; - FPTYPE xx = em[block_idx * nnei + ii]; - int table_idx = 0; - locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); - var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; - var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; - var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; - var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; - var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; - var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res_grad = - var[1] + ((FPTYPE)2. * var[2] + - ((FPTYPE)3. * var[3] + - ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) * - xx) * - xx; - dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size + - thread_idx] = dz_dy_dem[block_idx * nnei + ii] * res_grad; - } -} - -namespace deepmd { -template -void tabulate_fusion_se_a_gpu(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { - if (nloc <= 0) { - return; - } - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_a_fifth_order_polynomial), - nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out, - table, em_x, em, two_embed, table_info[0], table_info[1], table_info[2], - table_info[3], table_info[4], nnei, last_layer_size, is_sorted); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { - if (nloc <= 0) { - return; - } - DPErrcheck(hipMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei)); - DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4)); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_a_grad_fifth_order_polynomial), - nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x, - dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1], - table_info[2], table_info[3], table_info[4], nnei, last_layer_size, - is_sorted); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { - if (nloc <= 0) { - return; - } - DPErrcheck(hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_a_grad_grad_fifth_order_polynomial), - nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy, - table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], - table_info[2], table_info[3], table_info[4], nnei, last_layer_size, - is_sorted); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_t_gpu(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { - if (nloc <= 0) { - return; - } - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_t_fifth_order_polynomial), - nloc, last_layer_size, 0, 0, out, table, em_x, em, table_info[0], - table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, - nnei_j, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { - if (nloc <= 0) { - return; - } - DPErrcheck(hipMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); - DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_t_grad_fifth_order_polynomial), - nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size, 0, dy_dem_x, - dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], - table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { - if (nloc <= 0) { - return; - } - DPErrcheck(hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size)); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_t_grad_grad_fifth_order_polynomial), - nloc, last_layer_size, 0, 0, dz_dy, table, em_x, em, dz_dy_dem_x, - dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], - table_info[4], nnei_i, nnei_j, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_r_gpu(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const int nloc, - const int nnei, - const int last_layer_size) { - if (nloc <= 0) { - return; - } - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_r_fifth_order_polynomial), - nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out, - table, em, table_info[0], table_info[1], table_info[2], table_info[3], - table_info[4], nnei, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size) { - if (nloc <= 0) { - return; - } - DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei)); - - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_r_grad_fifth_order_polynomial), - nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem, - table, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], - table_info[4], nnei, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template -void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size) { - if (nloc <= 0) { - return; - } - DPErrcheck( - hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size)); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - tabulate_fusion_se_r_grad_grad_fifth_order_polynomial), - nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy, - table, em, dz_dy_dem, table_info[0], table_info[1], table_info[2], - table_info[3], table_info[4], nnei, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); -} - -template void tabulate_fusion_se_a_gpu(float* out, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* two_embed, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_gpu(double* out, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* two_embed, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_gpu(float* dy_dem_x, - float* dy_dem, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* two_embed, - const float* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_gpu(double* dy_dem_x, - double* dy_dem, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* two_embed, - const double* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_grad_gpu( - float* dz_dy, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* dz_dy_dem_x, - const float* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_grad_gpu( - double* dz_dy, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* dz_dy_dem_x, - const double* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); - -template void tabulate_fusion_se_t_gpu(float* out, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_gpu(double* out, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_gpu(float* dy_dem_x, - float* dy_dem, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_gpu(double* dy_dem_x, - double* dy_dem, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_grad_gpu( - float* dz_dy, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* dz_dy_dem_x, - const float* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_grad_gpu( - double* dz_dy, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* dz_dy_dem_x, - const double* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); - -template void tabulate_fusion_se_r_gpu(float* out, - const float* table, - const float* table_info, - const float* em, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_gpu(double* out, - const double* table, - const double* table_info, - const double* em, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_gpu(float* dy_dem, - const float* table, - const float* table_info, - const float* em, - const float* dy, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_gpu(double* dy_dem, - const double* table, - const double* table_info, - const double* em, - const double* dy, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_grad_gpu( - float* dz_dy, - const float* table, - const float* table_info, - const float* em, - const float* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_grad_gpu( - double* dz_dy, - const double* table, - const double* table_info, - const double* em, - const double* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size); - -} // namespace deepmd