diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index 404ee9e1de..a8890498e5 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -1,38 +1,11 @@ #include "DeepPot.h" #include "AtomMap.h" #include +#include "device.h" using namespace tensorflow; using namespace deepmd; -#if GOOGLE_CUDA -#include "cuda_runtime.h" - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} -#endif - -#if TENSORFLOW_USE_ROCM -#include - -#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } -inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) -{ - if (code != hipSuccess) - { - fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); - if (abort) exit(code); - } -} -#endif //TENSORFLOW_USE_ROCM - static std::vector cum_sum (const std::vector & n_sel) { std::vector sec; @@ -218,32 +191,18 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_ else graph_def.ParseFromString(file_content); int gpu_num = -1; - #if GOOGLE_CUDA - cudaGetDeviceCount(&gpu_num); // check current device environment + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + DPGetDeviceCount(gpu_num); // check current device environment if (gpu_num > 0) { options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); + DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); } - #endif // GOOGLE_CUDA - - #if TENSORFLOW_USE_ROCM - hipGetDeviceCount(&gpu_num); // check current device environment - if (gpu_num > 0) { - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); - std::string str = "/gpu:"; - str += std::to_string(gpu_rank % gpu_num); - graph::SetDefaultDevice(str, &graph_def); - } - #endif // TENSORFLOW_USE_ROCM - + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM check_status (NewSession(options, &session)); check_status (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -552,13 +511,9 @@ init (const std::vector & models, const int & gpu_rank, const std:: graph_defs.resize(numb_models); int gpu_num = -1; - #if GOOGLE_CUDA - cudaGetDeviceCount(&gpu_num); - #endif // GOOGLE_CUDA - - #if TENSORFLOW_USE_ROCM - hipGetDeviceCount(&gpu_num); - #endif //TENSORFLOW_USE_ROCM + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + DPGetDeviceCount(gpu_num); + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); @@ -569,24 +524,14 @@ init (const std::vector & models, const int & gpu_rank, const std:: else graph_defs[ii].ParseFromString(file_contents[ii]); } - #if GOOGLE_CUDA - if (gpu_num > 0) { - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); - } - #endif // GOOGLE_CUDA - - - #if TENSORFLOW_USE_ROCM + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM if (gpu_num > 0) { options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); + DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); } - #endif // TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 8464a8f46c..cd82ee4657 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -5,8 +5,9 @@ #include #define GPU_MAX_NBOR_SIZE 4096 -#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { +#define DPErrcheck(res) {DPAssert((res), __FILE__, __LINE__);} +inline void DPAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); if (code == 2) { @@ -27,7 +28,8 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } #define nborErrcheck(res) {nborAssert((res), __FILE__, __LINE__);} -inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) { +inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); if (code == 2) { @@ -65,12 +67,17 @@ static __inline__ __device__ double atomicAdd( #endif namespace deepmd { + +inline void DPGetDeviceCount(int &gpu_num) { cudaGetDeviceCount(&gpu_num) ;} + +inline cudaError_t DPSetDevice(int rank) { return cudaSetDevice(rank); } + template void memcpy_host_to_device( FPTYPE * device, const std::vector &host) { - cudaErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); + DPErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); } template @@ -79,7 +86,7 @@ void memcpy_host_to_device( const FPTYPE * host, const int size) { - cudaErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); + DPErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); } template @@ -87,7 +94,7 @@ void memcpy_device_to_host( const FPTYPE * device, std::vector &host) { - cudaErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); } template @@ -96,7 +103,7 @@ void memcpy_device_to_host( FPTYPE * host, const int size) { - cudaErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); } template @@ -104,7 +111,7 @@ void malloc_device_memory( FPTYPE * &device, const std::vector &host) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); } template @@ -112,7 +119,7 @@ void malloc_device_memory( FPTYPE * &device, const int size) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); } template @@ -120,7 +127,7 @@ void malloc_device_memory_sync( FPTYPE * &device, const std::vector &host) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); memcpy_host_to_device(device, host); } @@ -130,7 +137,7 @@ void malloc_device_memory_sync( const FPTYPE * host, const int size) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); memcpy_host_to_device(device, host, size); } @@ -139,7 +146,7 @@ void delete_device_memory( FPTYPE * &device) { if (device != NULL) { - cudaErrcheck(cudaFree(device)); + DPErrcheck(cudaFree(device)); } } @@ -149,6 +156,6 @@ void memset_device_memory( const FPTYPE var, const int size) { - cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); } } // end of namespace deepmd \ No newline at end of file diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index ee3e88ee9e..955ffe5bf7 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -8,8 +8,8 @@ #define GPU_MAX_NBOR_SIZE 4096 -#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } -inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) { +#define DPErrcheck(res) { DPAssert((res), __FILE__, __LINE__); } +inline void DPAssert(hipError_t code, const char *file, int line, bool abort=true) { if (code != hipSuccess) { fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); if (abort) exit(code); @@ -24,13 +24,18 @@ inline void nborAssert(hipError_t code, const char *file, int line, bool abort=t } } + namespace deepmd { +inline void DPGetDeviceCount(int &gpu_num) { hipGetDeviceCount(&gpu_num) ;} + +inline hipError_t DPSetDevice(int rank) { return hipSetDevice(rank); } + template void memcpy_host_to_device( FPTYPE * device, std::vector &host) { - hipErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice)); + DPErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice)); } template @@ -39,7 +44,7 @@ void memcpy_host_to_device( const FPTYPE * host, const int size) { - hipErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); + DPErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); } template @@ -47,7 +52,7 @@ void memcpy_device_to_host( FPTYPE * device, std::vector &host) { - hipErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); } template void memcpy_device_to_host( @@ -55,7 +60,7 @@ void memcpy_device_to_host( FPTYPE * host, const int size) { - hipErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); } template @@ -63,7 +68,7 @@ void malloc_device_memory( FPTYPE * &device, std::vector &host) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); } template @@ -71,7 +76,7 @@ void malloc_device_memory( FPTYPE * &device, const int size) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); } template @@ -79,7 +84,7 @@ void malloc_device_memory_sync( FPTYPE * &device, std::vector &host) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); memcpy_host_to_device(device, host); } template @@ -88,7 +93,7 @@ void malloc_device_memory_sync( const FPTYPE * host, const int size) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); memcpy_host_to_device(device, host, size); } @@ -97,7 +102,7 @@ void delete_device_memory( FPTYPE * &device) { if (device != NULL) { - hipErrcheck(hipFree(device)); + DPErrcheck(hipFree(device)); } } @@ -107,7 +112,7 @@ void memset_device_memory( const FPTYPE var, const int size) { - hipErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size)); + DPErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size)); } } diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index bc717255b9..e0901c26d0 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -63,16 +63,25 @@ build_nlist_cpu( const int & mem_size, const float & rcut); -#if GOOGLE_CUDA -void convert_nlist_gpu_cuda( +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +void convert_nlist_gpu_device( InputNlist & gpu_nlist, InputNlist & cpu_nlist, int* & gpu_memory, const int & max_nbor_size); -void free_nlist_gpu_cuda( +void free_nlist_gpu_device( InputNlist & gpu_nlist); +void use_nlist_map( + int * nlist, + const int * nlist_map, + const int nloc, + const int nnei); + +#endif //GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#if GOOGLE_CUDA // build neighbor list. // outputs // nlist, max_list_size @@ -96,25 +105,10 @@ build_nlist_gpu( const int & mem_size, const float & rcut); -void use_nlist_map( - int * nlist, - const int * nlist_map, - const int nloc, - const int nnei); - #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -void convert_nlist_gpu_rocm( - InputNlist & gpu_nlist, - InputNlist & cpu_nlist, - int* & gpu_memory, - const int & max_nbor_size); - -void free_nlist_gpu_rocm( - InputNlist & gpu_nlist); - // build neighbor list. // outputs // nlist, max_list_size @@ -137,12 +131,6 @@ build_nlist_gpu_rocm( const int & nall, const int & mem_size, const float & rcut); - -void use_nlist_map( - int * nlist, - const int * nlist_map, - const int nloc, - const int nnei); #endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/cuda/coord.cu index c11609c57f..982e603f42 100644 --- a/source/lib/src/cuda/coord.cu +++ b/source/lib/src/cuda/coord.cu @@ -291,14 +291,20 @@ void compute_int_data( const int nblock_loc=(nloc+TPB-1)/TPB; _fill_idx_cellmap<<>>(idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end, ext_stt, ext_end, nloc); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB; _fill_loc_cellnum_map<<>>(temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB; _fill_total_cellnum_map<<>>(total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } void build_loc_clist( @@ -313,6 +319,8 @@ void build_loc_clist( const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum; int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1; _build_loc_clist<<>>(loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -340,6 +348,8 @@ void copy_coord( const FPTYPE *rec_boxt = region.rec_boxt; _copy_coord<<>>(out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } namespace deepmd { @@ -354,6 +364,8 @@ normalize_coord_gpu( const FPTYPE * rec_boxt=region.rec_boxt; const int nblock=(natom+TPB-1)/TPB; normalize_one<<>>(coord, boxt, rec_boxt, natom); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } // int_data(temp cuda memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map, @@ -377,7 +389,7 @@ copy_coord_gpu( { compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map - cudaErrcheck(cudaMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), cudaMemcpyDeviceToHost)); int * loc_cellnum_map=int_data_cpu; int * total_cellnum_map=loc_cellnum_map+loc_cellnum; int * mask_cellnum_map=total_cellnum_map+total_cellnum; @@ -399,7 +411,7 @@ copy_coord_gpu( return 1; } else{ - cudaErrcheck(cudaMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, + DPErrcheck(cudaMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), cudaMemcpyHostToDevice)); delete[] int_data_cpu; build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu index ba9cdad4b7..ca96751895 100644 --- a/source/lib/src/cuda/gelu.cu +++ b/source/lib/src/cuda/gelu.cu @@ -62,6 +62,8 @@ void gelu_gpu_cuda( const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(out, xx, size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -78,6 +80,8 @@ void gelu_grad_gpu_cuda( const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(out, xx, dy, size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -95,6 +99,8 @@ void gelu_grad_grad_gpu_cuda( const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(out, xx, dy, dy_2, size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void gelu_gpu_cuda(float * out, const float * x, const int size); diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index 33bf33aa3c..66bd122079 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -124,7 +124,7 @@ int build_nlist_gpu( int * ilist = nlist.ilist; int * numneigh = nlist.numneigh; int ** firstneigh = nlist.firstneigh; - cudaErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + DPErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); int * temp_nlist = nlist_data; //nloc*mem_size int * nei_order = temp_nlist + nloc * mem_size; nlist.inum = nloc; @@ -141,6 +141,8 @@ int build_nlist_gpu( nloc, nall, mem_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock_ = (nloc+TPB-1)/TPB; scan_nlist<<>>( numneigh, @@ -149,15 +151,18 @@ int build_nlist_gpu( mem_size, nloc, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); fill_nlist<<>>( firstneigh, temp_nlist, nei_order, mem_size, - nall - ); + nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); int * numneigh_host = new int[nloc]; - cudaErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, cudaMemcpyDeviceToHost)); int max_nei = 0; for(int ii=0;iimax_nei)max_nei=numneigh_host[ii]; @@ -177,6 +182,8 @@ void use_nlist_map( dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); map_nlist<<>>(nlist, nlist_map, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template int build_nlist_gpu(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index ccea65d448..7787fa0355 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -213,12 +213,16 @@ void format_nbor_list_1024 ( format_nlist_fill_a<<>> ( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -239,12 +243,16 @@ void format_nbor_list_2048 ( format_nlist_fill_a<<>> ( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -265,12 +273,16 @@ void format_nbor_list_4096 ( format_nlist_fill_a<<>> ( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ITEMS_PER_THREAD = 16; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template< @@ -449,13 +461,15 @@ void format_nbor_list_gpu_cuda( int * i_idx = array_int + sec.size() + nloc * sec.size(); uint_64 * key = array_longlong; assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei)); - cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); - cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); + DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei)); + DPErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); get_i_idx<<>>( i_idx, nloc, gpu_inlist.ilist); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); if (max_nbor_size == 1024) { format_nbor_list_1024 ( @@ -476,6 +490,8 @@ void format_nbor_list_gpu_cuda( format_nlist_fill_b<<>> ( nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -500,9 +516,9 @@ void prod_env_mat_a_gpu_cuda( { const int nnei = sec.back(); const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - cudaErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_cuda( nlist, @@ -513,6 +529,8 @@ void prod_env_mat_a_gpu_cuda( compute_env_mat_a <<>> ( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -537,9 +555,9 @@ void prod_env_mat_r_gpu_cuda( { const int nnei = sec.back(); const int ndescrpt = nnei * 1; - cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - cudaErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_cuda( nlist, @@ -550,6 +568,8 @@ void prod_env_mat_r_gpu_cuda( compute_env_mat_r <<>> ( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -566,6 +586,8 @@ void test_encoding_decoding_nbor_info_gpu_cuda( encoding_decoding_nbor_info<<>> ( key, out_type, out_index, in_type, in_dist, in_index, size_of_array); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_env_mat_a_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu index 62c7ce8926..124f6c806d 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/cuda/prod_force.cu @@ -108,13 +108,15 @@ void prod_force_a_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); force_deriv_wrt_center_atom <<>>( force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -123,6 +125,8 @@ void prod_force_a_gpu_cuda( force_deriv_wrt_neighbors_a<<>>( force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -136,13 +140,15 @@ void prod_force_r_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 1; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); force_deriv_wrt_center_atom <<>>( force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -151,6 +157,8 @@ void prod_force_r_gpu_cuda( force_deriv_wrt_neighbors_r<<>>( force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_force_a_gpu_cuda(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu index 7fd9359cfe..11af70c245 100644 --- a/source/lib/src/cuda/prod_force_grad.cu +++ b/source/lib/src/cuda/prod_force_grad.cu @@ -88,7 +88,7 @@ void prod_force_grad_a_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -97,6 +97,8 @@ void prod_force_grad_a_gpu_cuda( force_grad_wrt_center_atom<<>>( grad_net, grad, env_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; @@ -105,6 +107,8 @@ void prod_force_grad_a_gpu_cuda( force_grad_wrt_neighbors_a<<>>( grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -117,7 +121,7 @@ void prod_force_grad_r_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 1; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -126,6 +130,8 @@ void prod_force_grad_r_gpu_cuda( force_grad_wrt_center_atom<<>>( grad_net, grad, env_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; @@ -134,6 +140,8 @@ void prod_force_grad_r_gpu_cuda( force_grad_wrt_neighbors_r<<>>( grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_force_grad_a_gpu_cuda(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu index 08a64d71fe..06d1cefa42 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/cuda/prod_virial.cu @@ -115,10 +115,10 @@ void prod_virial_a_gpu_cuda( const int nall, const int nnei) { - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( virial, 0.0, sizeof(FPTYPE) * 9)); - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -130,10 +130,14 @@ void prod_virial_a_gpu_cuda( virial_deriv_wrt_neighbors_a<<>>( virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); // reduction atom_virial to virial atom_virial_reduction <<<9, TPB>>>( virial, atom_virial, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -148,10 +152,10 @@ void prod_virial_r_gpu_cuda( const int nall, const int nnei) { - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( virial, 0.0, sizeof(FPTYPE) * 9)); - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -163,10 +167,14 @@ void prod_virial_r_gpu_cuda( virial_deriv_wrt_neighbors_r<<>>( virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); // reduction atom_virial to virial atom_virial_reduction <<<9, TPB>>>( virial, atom_virial, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_virial_a_gpu_cuda(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu index 2cdd25ec38..0b9affa948 100644 --- a/source/lib/src/cuda/prod_virial_grad.cu +++ b/source/lib/src/cuda/prod_virial_grad.cu @@ -99,7 +99,7 @@ void prod_virial_grad_a_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -109,6 +109,8 @@ void prod_virial_grad_a_gpu_cuda( virial_grad_wrt_neighbors_a<<>>( grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -122,7 +124,7 @@ void prod_virial_grad_r_gpu_cuda( const int nnei) { const int ndescrpt = nnei; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -132,6 +134,8 @@ void prod_virial_grad_r_gpu_cuda( virial_grad_wrt_neighbors_r<<>>( grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_virial_grad_a_gpu_cuda(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/cuda/region.cu index 4a95e5f9da..99e203cfcc 100644 --- a/source/lib/src/cuda/region.cu +++ b/source/lib/src/cuda/region.cu @@ -39,6 +39,8 @@ convert_to_inter_gpu( const FPTYPE * rp) { _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -49,6 +51,8 @@ convert_to_phys_gpu( const FPTYPE * ri) { _inter2Phys<<<1, 1>>>(rp, ri, region.boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -58,6 +62,8 @@ volume_gpu( const Region & region) { _compute_volume<<<1, 1>>>(volume, region.boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void convert_to_inter_gpu(float * ri, const Region & region, const float * rp); diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index b71a989819..2cab6d75a7 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -203,6 +203,8 @@ void tabulate_fusion_gpu_cuda( tabulate_fusion_fifth_order_polynomial <<>>( out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -219,16 +221,18 @@ void tabulate_fusion_grad_gpu_cuda( const int last_layer_size) { if (nloc <= 0) {return;} - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( dy_dem_x, 0.0, sizeof(FPTYPE) * nloc * nnei)); - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); tabulate_fusion_grad_fifth_order_polynomial <<>>( dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void tabulate_fusion_gpu_cuda(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc index a41784bc29..c3cd376fbe 100644 --- a/source/lib/src/neighbor_list.cc +++ b/source/lib/src/neighbor_list.cc @@ -843,8 +843,8 @@ build_nlist_cpu( const int & mem_size, const float & rcut); -#if GOOGLE_CUDA -void deepmd::convert_nlist_gpu_cuda( +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +void deepmd::convert_nlist_gpu_device( InputNlist & gpu_nlist, InputNlist & cpu_nlist, int* & gpu_memory, @@ -867,44 +867,11 @@ void deepmd::convert_nlist_gpu_cuda( free(_firstneigh); } -void deepmd::free_nlist_gpu_cuda( +void deepmd::free_nlist_gpu_device( InputNlist & gpu_nlist) { delete_device_memory(gpu_nlist.ilist); delete_device_memory(gpu_nlist.numneigh); delete_device_memory(gpu_nlist.firstneigh); } -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -void deepmd::convert_nlist_gpu_rocm( - InputNlist & gpu_nlist, - InputNlist & cpu_nlist, - int* & gpu_memory, - const int & max_nbor_size) -{ - const int inum = cpu_nlist.inum; - gpu_nlist.inum = inum; - malloc_device_memory(gpu_nlist.ilist, inum); - malloc_device_memory(gpu_nlist.numneigh, inum); - malloc_device_memory(gpu_nlist.firstneigh, inum); - memcpy_host_to_device(gpu_nlist.ilist, cpu_nlist.ilist, inum); - memcpy_host_to_device(gpu_nlist.numneigh, cpu_nlist.numneigh, inum); - int ** _firstneigh = NULL; - _firstneigh = (int**)malloc(sizeof(int*) * inum); - for (int ii = 0; ii < inum; ii++) { - memcpy_host_to_device(gpu_memory + ii * max_nbor_size, cpu_nlist.firstneigh[ii], cpu_nlist.numneigh[ii]); - _firstneigh[ii] = gpu_memory + ii * max_nbor_size; - } - memcpy_host_to_device(gpu_nlist.firstneigh, _firstneigh, inum); - free(_firstneigh); -} - -void deepmd::free_nlist_gpu_rocm( - InputNlist & gpu_nlist) -{ - delete_device_memory(gpu_nlist.ilist); - delete_device_memory(gpu_nlist.numneigh); - delete_device_memory(gpu_nlist.firstneigh); -} -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index 4ef5af49e3..1ac944786c 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -256,7 +256,7 @@ prod_env_mat_r_cpu( const float rcut_smth, const std::vector sec); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void deepmd::env_mat_nbor_update( InputNlist &inlist, InputNlist &gpu_inlist, @@ -266,7 +266,7 @@ void deepmd::env_mat_nbor_update( const int size) { int *mesh_host = new int[size]; - cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy_device_to_host(mesh, mesh_host, size); memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); @@ -319,69 +319,4 @@ void deepmd::env_mat_nbor_update( } delete [] mesh_host; } -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -void deepmd::env_mat_nbor_update( - InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int* &nbor_list_dev, - const int * mesh, - const int size) -{ - int *mesh_host = new int[size]; - hipErrcheck(hipMemcpy(mesh_host, mesh, sizeof(int) * size, hipMemcpyDeviceToHost)); - memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); - memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); - memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); - const int ago = mesh_host[0]; - if (ago == 0) { - const int inum = inlist.inum; - if (gpu_inlist.inum < inum) { - delete_device_memory(gpu_inlist.ilist); - delete_device_memory(gpu_inlist.numneigh); - delete_device_memory(gpu_inlist.firstneigh); - malloc_device_memory(gpu_inlist.ilist, inum); - malloc_device_memory(gpu_inlist.numneigh, inum); - malloc_device_memory(gpu_inlist.firstneigh, inum); - } - memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); - memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); - int _max_nbor_size = max_numneigh(inlist); - if (_max_nbor_size <= 1024) { - _max_nbor_size = 1024; - } - else if (_max_nbor_size <= 2048) { - _max_nbor_size = 2048; - } - else { - _max_nbor_size = 4096; - } - if ( nbor_list_dev == NULL - || _max_nbor_size > max_nbor_size - || inum > gpu_inlist.inum) - { - delete_device_memory(nbor_list_dev); - malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); - } - // update info - gpu_inlist.inum = inum; - max_nbor_size = _max_nbor_size; - - // copy nbor list from host to the device - std::vector nbor_list_host(inum * max_nbor_size, 0); - int ** _firstneigh = (int**)malloc(sizeof(int*) * inum); - for (int ii = 0; ii < inum; ii++) { - _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; - for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { - nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; - } - } - memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], inum * max_nbor_size); - memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); - free(_firstneigh); - } - delete [] mesh_host; -} -#endif // TENSORFLOW_USE_ROCM \ No newline at end of file +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu index 12c797ab3e..1d01e1da30 100644 --- a/source/lib/src/rocm/coord.hip.cu +++ b/source/lib/src/rocm/coord.hip.cu @@ -292,14 +292,20 @@ void compute_int_data( const int nblock_loc=(nloc+TPB-1)/TPB; hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end, ext_stt, ext_end, nloc); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB; hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0, temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB; hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0, total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } void build_loc_clist( @@ -314,6 +320,8 @@ void build_loc_clist( const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum; int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1; hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -341,6 +349,8 @@ void copy_coord( const FPTYPE *rec_boxt = region.rec_boxt; hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } namespace deepmd { @@ -355,6 +365,8 @@ normalize_coord_gpu_rocm( const FPTYPE * rec_boxt=region.rec_boxt; const int nblock=(natom+TPB-1)/TPB; hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt, natom); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -376,7 +388,9 @@ copy_coord_gpu_rocm( { compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map - hipErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost)); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); int * loc_cellnum_map=int_data_cpu; int * total_cellnum_map=loc_cellnum_map+loc_cellnum; int * mask_cellnum_map=total_cellnum_map+total_cellnum; @@ -398,7 +412,7 @@ copy_coord_gpu_rocm( return 1; } else{ - hipErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, + DPErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), hipMemcpyHostToDevice)); delete[] int_data_cpu; build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu index 83e7a3be6d..9cabb53c08 100644 --- a/source/lib/src/rocm/gelu.hip.cu +++ b/source/lib/src/rocm/gelu.hip.cu @@ -64,6 +64,8 @@ namespace deepmd { const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -81,6 +83,8 @@ namespace deepmd { const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -99,6 +103,8 @@ namespace deepmd { const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, dy_2, size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void gelu_gpu_rocm(float * out, const float * x, const int size); diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu index a0da866d12..243ea0507a 100644 --- a/source/lib/src/rocm/neighbor_list.hip.cu +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -124,7 +124,7 @@ int build_nlist_gpu_rocm( int * ilist = nlist.ilist; int * numneigh = nlist.numneigh; int ** firstneigh = nlist.firstneigh; - hipErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + DPErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); int * temp_nlist = nlist_data; //nloc*mem_size int * nei_order = temp_nlist + nloc * mem_size; nlist.inum = nloc; @@ -141,6 +141,8 @@ int build_nlist_gpu_rocm( nloc, nall, mem_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int nblock_ = (nloc+TPB-1)/TPB; hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0, numneigh, @@ -149,15 +151,18 @@ int build_nlist_gpu_rocm( mem_size, nloc, nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, firstneigh, temp_nlist, nei_order, mem_size, - nall - ); + nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); int * numneigh_host = new int[nloc]; - hipErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost)); int max_nei = 0; for(int ii=0;iimax_nei)max_nei=numneigh_host[ii]; @@ -177,6 +182,8 @@ void use_nlist_map( dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu index b1251c6a42..f3cd3bf31a 100644 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -234,12 +234,16 @@ void format_nbor_list_1024 ( hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -260,12 +264,16 @@ void format_nbor_list_2048 ( hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -286,12 +294,16 @@ void format_nbor_list_4096 ( hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int ITEMS_PER_THREAD = 16; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template< @@ -470,13 +482,15 @@ void format_nbor_list_gpu_rocm( int * i_idx = array_int + sec.size() + nloc * sec.size(); uint_64 * key = array_longlong; assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - hipErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei)); - hipErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); - hipErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice)); + DPErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei)); + DPErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice)); hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, i_idx, nloc, gpu_inlist.ilist); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); if (max_nbor_size == 1024) { format_nbor_list_1024 ( @@ -501,6 +515,8 @@ void format_nbor_list_gpu_rocm( hipLaunchKernelGGL(format_nlist_fill_b, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN, 0, 0, nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -525,9 +541,9 @@ void prod_env_mat_a_gpu_rocm( { const int nnei = sec.back(); const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - hipErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_rocm( nlist, @@ -538,6 +554,8 @@ void prod_env_mat_a_gpu_rocm( hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a), nloc, TPB, 0, 0, em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -562,9 +580,9 @@ void prod_env_mat_r_gpu_rocm( { const int nnei = sec.back(); const int ndescrpt = nnei * 1; - hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - hipErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_rocm( nlist, @@ -575,6 +593,8 @@ void prod_env_mat_r_gpu_rocm( hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r), nloc, TPB, 0, 0, em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -591,6 +611,8 @@ void test_encoding_decoding_nbor_info_gpu_rocm( hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key, out_type, out_index, in_type, in_dist, in_index, size_of_array); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_env_mat_a_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu index 48b12dfa50..815fed44c5 100644 --- a/source/lib/src/rocm/prod_force.hip.cu +++ b/source/lib/src/rocm/prod_force.hip.cu @@ -109,13 +109,15 @@ namespace deepmd { const int nnei) { const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -124,6 +126,8 @@ namespace deepmd { hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -137,13 +141,15 @@ namespace deepmd { const int nnei) { const int ndescrpt = nnei * 1; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN -1) / LEN; @@ -152,6 +158,8 @@ namespace deepmd { hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_force_a_gpu_rocm(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu index 3a3fd8ea58..f7af9a1746 100644 --- a/source/lib/src/rocm/prod_force_grad.hip.cu +++ b/source/lib/src/rocm/prod_force_grad.hip.cu @@ -88,7 +88,7 @@ void prod_force_grad_a_gpu_rocm( const int nnei) { const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -97,7 +97,8 @@ void prod_force_grad_a_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, ndescrpt); - + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; dim3 block_grid_(nblock_, nnei); @@ -105,6 +106,8 @@ void prod_force_grad_a_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, 0, grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -117,7 +120,7 @@ void prod_force_grad_r_gpu_rocm( const int nnei) { const int ndescrpt = nnei * 1; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -126,6 +129,8 @@ void prod_force_grad_r_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, ndescrpt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; @@ -134,6 +139,8 @@ void prod_force_grad_r_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, 0, grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_force_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu index ff8017a687..9c4ec5010b 100644 --- a/source/lib/src/rocm/prod_virial.hip.cu +++ b/source/lib/src/rocm/prod_virial.hip.cu @@ -113,10 +113,10 @@ void prod_virial_a_gpu_rocm( const int nall, const int nnei) { - hipErrcheck(hipMemset( - virial, - 0.0, sizeof(FPTYPE) * 9)); - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + DPErrcheck(hipMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -128,10 +128,14 @@ void prod_virial_a_gpu_rocm( hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); // reduction atom_virial to virial hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, virial, atom_virial, nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -146,10 +150,10 @@ void prod_virial_r_gpu_rocm( const int nall, const int nnei) { - hipErrcheck(hipMemset( - virial, - 0.0, sizeof(FPTYPE) * 9)); - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + DPErrcheck(hipMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -161,10 +165,14 @@ void prod_virial_r_gpu_rocm( hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); // reduction atom_virial to virial hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, virial, atom_virial, nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_virial_a_gpu_rocm(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu index c4d8a5c19a..4c729453f7 100644 --- a/source/lib/src/rocm/prod_virial_grad.hip.cu +++ b/source/lib/src/rocm/prod_virial_grad.hip.cu @@ -99,7 +99,7 @@ void prod_virial_grad_a_gpu_rocm( const int nnei) { const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -109,6 +109,8 @@ void prod_virial_grad_a_gpu_rocm( hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -122,7 +124,7 @@ void prod_virial_grad_r_gpu_rocm( const int nnei) { const int ndescrpt = nnei; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -132,6 +134,8 @@ void prod_virial_grad_r_gpu_rocm( hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_virial_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu index 7f883b14c3..ab40f6bf20 100644 --- a/source/lib/src/rocm/region.hip.cu +++ b/source/lib/src/rocm/region.hip.cu @@ -39,6 +39,8 @@ convert_to_inter_gpu_rocm( const FPTYPE * rp) { hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -49,6 +51,8 @@ convert_to_phys_gpu_rocm( const FPTYPE * ri) { hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -58,6 +62,8 @@ volume_gpu_rocm( const Region & region) { hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void convert_to_inter_gpu_rocm(float * ri, const Region & region, const float * rp); diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index b4097083b0..497f4931cd 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -213,6 +213,8 @@ template hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -229,16 +231,18 @@ template const int last_layer_size) { if( nloc<=0 ) { return;} - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( dy_dem_x, 0.0, sizeof(FPTYPE) * nloc * nnei)); - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index cfb70acfe9..df7b5f38a0 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -557,7 +557,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_cuda( em_dev, @@ -588,7 +588,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -648,7 +648,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_cuda( em_dev, @@ -682,7 +682,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; @@ -770,7 +770,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_rocm( em_dev, @@ -801,7 +801,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -861,7 +861,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_rocm( em_dev, @@ -895,7 +895,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index 1b232e28b5..39d36be42d 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -400,7 +400,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_cuda( em_dev, @@ -431,7 +431,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -490,7 +490,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_cuda( em_dev, @@ -524,7 +524,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; @@ -603,7 +603,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_rocm( em_dev, @@ -634,7 +634,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -693,7 +693,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_rocm( em_dev, @@ -727,7 +727,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index d2de6e8855..844d110de5 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -381,7 +381,7 @@ TEST_F(TestFormatNlist, gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_cuda( nlist_dev, @@ -393,7 +393,7 @@ TEST_F(TestFormatNlist, gpu_cuda) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); @@ -437,7 +437,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_cuda( nlist_dev, @@ -449,7 +449,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); @@ -565,7 +565,7 @@ TEST_F(TestFormatNlist, gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_rocm( nlist_dev, @@ -577,7 +577,7 @@ TEST_F(TestFormatNlist, gpu_rocm) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); @@ -621,7 +621,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_rocm( nlist_dev, @@ -633,7 +633,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 7c7130cda0..22fb223289 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -995,7 +995,7 @@ _norm_copy_coord_gpu( FPTYPE_shape.AddDim(nall*3); context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, tensor_list); FPTYPE * tmp_coord = (*tensor_list).flat().data(); - cudaErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice)); + DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice)); deepmd::Region region; init_region_cpu(region, box); @@ -1210,7 +1210,7 @@ _norm_copy_coord_gpu_rocm( FPTYPE_shape.AddDim(nall*3); context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, tensor_list); FPTYPE * tmp_coord = (*tensor_list).flat().data(); - hipErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); + DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); deepmd::Region region; init_region_cpu(region, box);