From d42968f193a8796847febcdb119ed9121c1f0dcf Mon Sep 17 00:00:00 2001 From: denghuilu Date: Tue, 20 Apr 2021 22:34:45 +0800 Subject: [PATCH 1/2] use bit operations to construct nbor sort keys --- source/lib/include/device.h | 2 +- source/lib/include/fmt_nlist.h | 16 +++ source/lib/include/gpu_cuda.h | 8 ++ source/lib/src/cuda/prod_env_mat.cu | 144 +++++++++++++------------ source/lib/tests/test_env_mat_a.cc | 4 +- source/lib/tests/test_env_mat_r.cc | 4 +- source/lib/tests/test_fmt_nlist.cc | 121 ++++++++++++++++++++- source/op/prod_env_mat_multi_device.cc | 2 +- 8 files changed, 226 insertions(+), 75 deletions(-) diff --git a/source/lib/include/device.h b/source/lib/include/device.h index dfd12a4f27..f0e36ae73d 100644 --- a/source/lib/include/device.h +++ b/source/lib/include/device.h @@ -6,7 +6,7 @@ #define TPB 256 #define SQRT_2_PI 0.7978845608028654 -typedef unsigned long long int_64; +typedef unsigned long long uint_64; #if GOOGLE_CUDA #include "gpu_cuda.h" diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index 13c9082240..cc3adf43c5 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -1,6 +1,7 @@ #pragma once #include +#include "device.h" #include "neighbor_list.h" namespace deepmd{ @@ -16,6 +17,21 @@ void format_nlist_cpu( const float rcut, const std::vector sec); +#if GOOGLE_CUDA +template +void format_nbor_list_gpu_cuda( + int * nlist, + const FPTYPE * coord, + const int * type, + const deepmd::InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); +#endif //GOOGLE_CUDA } diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index a9988b1caa..904ba5fafa 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -13,6 +13,14 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } } +#define nborErrcheck(res) {nborAssert((res), __FILE__, __LINE__);} +inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); + if (abort) exit(code); + } +} + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd( double* address, diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index 807b2c37de..bfb15dd785 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -1,4 +1,5 @@ #include "gpu_cuda.h" +#include "fmt_nlist.h" #include "prod_env_mat.h" #include #include @@ -86,7 +87,7 @@ __global__ void get_i_idx( template __global__ void format_nlist_fill_a( - int_64 * key, + uint_64 * key, const FPTYPE * coord, const int * type, const int * numneigh, @@ -106,15 +107,18 @@ __global__ void format_nlist_fill_a( const int * nei_idx = firstneigh[i_idx[idx]]; // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); - int_64 * key_in = key + idx * MAX_NBOR_SIZE; + uint_64 * key_in = key + idx * MAX_NBOR_SIZE; FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } FPTYPE rr = sqrt(dev_dot(diff, diff)); + if(type[j_idx] > 128 || rr > 128.0 || j_idx > (1 << 24)) { + asm("trap;"); + } if (rr <= rcut) { - key_in[idy] = type[j_idx] * 1E14+ (int_64)(rr * 1.0E12) / 10000000 * 10000000 + j_idx; + key_in[idy] = ((uint_64)type[j_idx] << 57) + (uint_64)((double)rr * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + j_idx; } } @@ -142,16 +146,16 @@ __global__ void format_nlist_fill_b( } for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) { - const int & nei_type = key_out[kk] / 1E14; + const int & nei_type = key_out[kk] >> 57; if (nei_iter[nei_type] < sec[nei_type + 1]) { - row_nlist[nei_iter[nei_type]++] = key_out[kk] % 10000000; + row_nlist[nei_iter[nei_type]++] = key_out[kk] & 0xFFFFFF; } } } template void format_nbor_list_1024 ( - int_64 * key, + uint_64 * key, const FPTYPE* coord, const int* type, const deepmd::InputNlist & gpu_inlist, @@ -170,14 +174,14 @@ void format_nbor_list_1024 ( const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> ( + BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); } template void format_nbor_list_2048 ( - int_64 * key, + uint_64 * key, const FPTYPE* coord, const int* type, const deepmd::InputNlist & gpu_inlist, @@ -196,14 +200,14 @@ void format_nbor_list_2048 ( const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> ( + BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); } template void format_nbor_list_4096 ( - int_64 * key, + uint_64 * key, const FPTYPE* coord, const int* type, const deepmd::InputNlist & gpu_inlist, @@ -222,62 +226,11 @@ void format_nbor_list_4096 ( const int ITEMS_PER_THREAD = 16; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> ( + BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); } -template -void format_nbor_list( - int * nlist, - const FPTYPE * coord, - const int * type, - const deepmd::InputNlist & gpu_inlist, - int * array_int, - int_64 * array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec) -{ - const int LEN = 256; - const int nnei = sec.back(); - const int nblock = (nloc + LEN -1) / LEN; - int * sec_dev = array_int; - int * nei_iter = array_int + sec.size(); // = new int[sec_size]; - int * i_idx = array_int + sec.size() + nloc * sec.size(); - int_64 * key = array_longlong; - assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei)); - cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size)); - cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); - - get_i_idx<<>>( - i_idx, - nloc, gpu_inlist.ilist); - - if (max_nbor_size == 1024) { - format_nbor_list_1024 ( - key, - coord, type, gpu_inlist, nloc, rcut, i_idx); - } - else if (max_nbor_size == 2048) { - format_nbor_list_2048 ( - key, - coord, type, gpu_inlist, nloc, rcut, i_idx); - } - else if (max_nbor_size == 4096) { - format_nbor_list_4096 ( - key, - coord, type, gpu_inlist, nloc, rcut, i_idx); - } - - format_nlist_fill_b<<>> ( - nlist, - nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); -} - template< typename FPTYPE, int THREADS_PER_BLOCK> @@ -432,6 +385,57 @@ __global__ void compute_env_mat_r( } namespace deepmd { +template +void format_nbor_list_gpu_cuda( + int * nlist, + const FPTYPE * coord, + const int * type, + const deepmd::InputNlist & gpu_inlist, + int * array_int, + uint_64 * array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) +{ + const int LEN = 256; + const int nnei = sec.back(); + const int nblock = (nloc + LEN -1) / LEN; + int * sec_dev = array_int; + int * nei_iter = array_int + sec.size(); // = new int[sec_size]; + int * i_idx = array_int + sec.size() + nloc * sec.size(); + uint_64 * key = array_longlong; + assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); + cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei)); + cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); + + get_i_idx<<>>( + i_idx, + nloc, gpu_inlist.ilist); + + if (max_nbor_size == 1024) { + format_nbor_list_1024 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + else if (max_nbor_size == 2048) { + format_nbor_list_2048 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + else if (max_nbor_size == 4096) { + format_nbor_list_4096 ( + key, + coord, type, gpu_inlist, nloc, rcut, i_idx); + } + + format_nlist_fill_b<<>> ( + nlist, + nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); +} + template void prod_env_mat_a_gpu_cuda( FPTYPE * em, @@ -442,7 +446,7 @@ void prod_env_mat_a_gpu_cuda( const int * type, const InputNlist & gpu_inlist, int * array_int, - int_64 * array_longlong, + uint_64 * array_longlong, const int max_nbor_size, const FPTYPE * avg, const FPTYPE * std, @@ -457,9 +461,11 @@ void prod_env_mat_a_gpu_cuda( cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - format_nbor_list( + format_nbor_list_gpu_cuda( nlist, coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + nborErrcheck(cudaGetLastError()); + nborErrcheck(cudaDeviceSynchronize()); compute_env_mat_a <<>> ( em, em_deriv, rij, @@ -476,7 +482,7 @@ void prod_env_mat_r_gpu_cuda( const int * type, const InputNlist & gpu_inlist, int * array_int, - int_64 * array_longlong, + uint_64 * array_longlong, const int max_nbor_size, const FPTYPE * avg, const FPTYPE * std, @@ -491,10 +497,12 @@ void prod_env_mat_r_gpu_cuda( cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - format_nbor_list( + format_nbor_list_gpu_cuda( nlist, coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); - + nborErrcheck(cudaGetLastError()); + nborErrcheck(cudaDeviceSynchronize()); + compute_env_mat_r <<>> ( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); @@ -504,4 +512,6 @@ template void prod_env_mat_a_gpu_cuda(float * em, float * em_deriv, float template void prod_env_mat_a_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_r_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_r_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void format_nbor_list_gpu_cuda(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); +template void format_nbor_list_gpu_cuda(int * nlist, const double * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); } diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index d32203c692..f9e262cb2c 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -545,7 +545,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; - int_64 * array_longlong_dev = NULL; + uint_64 * array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); @@ -635,7 +635,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; - int_64 * array_longlong_dev = NULL; + uint_64 * array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index aac71eff8c..c7b0db5e1d 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -387,7 +387,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; - int_64 * array_longlong_dev = NULL; + uint_64 * array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); @@ -477,7 +477,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; - int_64 * array_longlong_dev = NULL; + uint_64 * array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index 6dc539b7d6..d1cd3eef0b 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -32,6 +32,7 @@ class TestFormatNlist : public ::testing::Test 3 , 6 , -1, -1, -1, -1, -1, -1, -1, -1, 4 , 2 , 7 , -1, -1, -1, -1, -1, -1, -1 }; std::vector expect_nlist; + int max_nbor_size; void SetUp() override { double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; @@ -53,7 +54,8 @@ class TestFormatNlist : public ::testing::Test else{ expect_nlist.push_back(-1); } - } + } + max_nbor_size = 0; } void TearDown() override { } @@ -90,6 +92,7 @@ class TestFormatNlistShortSel : public ::testing::Test 3, 6, 4, 2, }; std::vector expect_nlist; + int max_nbor_size; void SetUp() override { double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; @@ -111,7 +114,8 @@ class TestFormatNlistShortSel : public ::testing::Test else{ expect_nlist.push_back(-1); } - } + } + max_nbor_size = 0; } void TearDown() override { } @@ -301,3 +305,116 @@ TEST_F(TestFormatNlistShortSel, cpu) } } +#if GOOGLE_CUDA +TEST_F(TestFormatNlist, gpu_cuda) +{ + std::vector> nlist_a_0, nlist_r_0; + build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + // make a input nlist + int inum = nlist_a_0.size(); + std::vector ilist(inum); + std::vector numneigh(inum); + std::vector firstneigh(inum); + deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(in_nlist, nlist_a_0); + // allocate the mem for the result + std::vector nlist(inum * sec_a.back()); + EXPECT_EQ(nlist.size(), expect_nlist_cpy.size()); + + double * posi_cpy_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + for (int ii = 0; ii < inum; ii++) { + max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii]; + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + // format nlist + format_nbor_list_gpu_cuda( + nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_cuda(gpu_inlist); + // validate + for(int ii = 0; ii < nlist.size(); ++ii){ + EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); + } +} + +TEST_F(TestFormatNlistShortSel, gpu_cuda) +{ + std::vector> nlist_a_0, nlist_r_0; + build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + // make a input nlist + int inum = nlist_a_0.size(); + std::vector ilist(inum); + std::vector numneigh(inum); + std::vector firstneigh(inum); + deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(in_nlist, nlist_a_0); + // mem + std::vector nlist(inum * sec_a.back()); + EXPECT_EQ(nlist.size(), expect_nlist_cpy.size()); + // format nlist + double * posi_cpy_dev = NULL; + int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + for (int ii = 0; ii < inum; ii++) { + max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii]; + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + // format nlist + format_nbor_list_gpu_cuda( + nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(atype_cpy_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_cuda(gpu_inlist); + // validate + for(int ii = 0; ii < nlist.size(); ++ii){ + EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); + } +} +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index f1843f9afe..be812dbc9e 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -954,8 +954,8 @@ _prepare_coord_nlist_gpu( const int & max_cpy_trial, const int & max_nnei_trial) { - inlist.inum = nloc; if(nei_mode != 3){ + inlist.inum = nloc; // build nlist by myself // normalize and copy coord if(nei_mode == 1){ From 58e83ae5b053757c22dc66472eb93dacc9e824fc Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 23 Apr 2021 12:53:12 +0800 Subject: [PATCH 2/2] add ut for encoding/decoding nbor info --- source/lib/include/fmt_nlist.h | 10 +++ source/lib/src/cuda/prod_env_mat.cu | 72 +++++++++++++++++-- source/lib/tests/test_fmt_nlist.cc | 108 ++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 6 deletions(-) diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index cc3adf43c5..1c6f66059c 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -31,6 +31,16 @@ void format_nbor_list_gpu_cuda( const int nall, const float rcut, const std::vector sec); + +template +void test_encoding_decoding_nbor_info_gpu_cuda( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array); #endif //GOOGLE_CUDA } diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index bfb15dd785..1ec317b685 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -72,6 +72,31 @@ __device__ inline void spline5_switch( } } +template +__device__ inline uint_64 encoding_nbor_info( + const int type, + const FPTYPE dist, + const int index) +{ + // nbor info checking: + // the type of nbor atom must be smaller than 128 + // the distance of center atom between nbor atom must be smaller than 128 + // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24) + if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) { + asm("trap;"); + } + return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index; +} + +__device__ inline void decoding_nbor_info( + int &type, + int &index, + const uint_64 key) +{ + type = key >> 57; + index = key & 0xFFFFFF; +} + template __global__ void get_i_idx( FPTYPE * i_idx, @@ -114,11 +139,8 @@ __global__ void format_nlist_fill_a( diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } FPTYPE rr = sqrt(dev_dot(diff, diff)); - if(type[j_idx] > 128 || rr > 128.0 || j_idx > (1 << 24)) { - asm("trap;"); - } if (rr <= rcut) { - key_in[idy] = ((uint_64)type[j_idx] << 57) + (uint_64)((double)rr * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + j_idx; + key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx); } } @@ -145,14 +167,34 @@ __global__ void format_nlist_fill_b( nei_iter[ii] = sec[ii]; } + int nei_type = 0, nbor_idx = 0; for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) { - const int & nei_type = key_out[kk] >> 57; + decoding_nbor_info(nei_type, nbor_idx, key_out[kk]); if (nei_iter[nei_type] < sec[nei_type + 1]) { - row_nlist[nei_iter[nei_type]++] = key_out[kk] & 0xFFFFFF; + row_nlist[nei_iter[nei_type]++] = nbor_idx; } } } +template +__global__ void encoding_decoding_nbor_info( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array) +{ + const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size_of_array) { + return; + } + + key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]); + decoding_nbor_info(out_type[idx], out_index[idx], key[idx]); +} + template void format_nbor_list_1024 ( uint_64 * key, @@ -508,10 +550,28 @@ void prod_env_mat_r_gpu_cuda( coord, avg, std, type, nlist, nnei, rcut_smth, rcut); } +template +void test_encoding_decoding_nbor_info_gpu_cuda( + uint_64 * key, + int * out_type, + int * out_index, + const int * in_type, + const FPTYPE * in_dist, + const int * in_index, + const int size_of_array) +{ + const int nblock = (size_of_array + TPB - 1) / TPB; + encoding_decoding_nbor_info<<>> ( + key, out_type, out_index, + in_type, in_dist, in_index, size_of_array); +} + template void prod_env_mat_a_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_a_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_r_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_r_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void format_nbor_list_gpu_cuda(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); template void format_nbor_list_gpu_cuda(int * nlist, const double * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); +template void test_encoding_decoding_nbor_info_gpu_cuda(uint_64 * key, int * out_type, int * out_index, const int * in_type, const float * in_dist, const int * in_index, const int size_of_array); +template void test_encoding_decoding_nbor_info_gpu_cuda(uint_64 * key, int * out_type, int * out_index, const int * in_type, const double * in_dist, const int * in_index, const int size_of_array); } diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index d1cd3eef0b..df2a68534f 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -121,6 +121,44 @@ class TestFormatNlistShortSel : public ::testing::Test } }; +class TestEncodingDecodingNborInfo : public ::testing::Test +{ +protected: + std::vector valid_type = { + 0, 1, 127, 77, 47, 9, 11 + }; + std::vector valid_dist = { + 23.3333, 0.001234, 1.456, 127.7, 2.021, 0.409, 11.2 + }; + std::vector valid_index = { + 0, 16777215, 1000000, 10000000, 202149, 478910, 5006 + }; + std::vector expect_key = { + 26270960290103296UL, 144116577447444479UL, 18304268195882549824UL, 11240646899941283456UL, 6775689283274741157UL, 1297497185738772158UL, 1597877147777635214UL + }; + + std::vector invalid_type = { + 0, 256, 128, 77, 47, 126, 1100 + }; + std::vector invalid_dist = { + 128.0, 0.001234, 1.456, 130.7, 2.021, 0.409, 11.2 + }; + std::vector invalid_index = { + 0, 16777215, 1 << 24, 10000000, 20210409, 478910, 5006 + }; + std::vector expect_cuda_error_check = { + false, false, false, false, false, true, false + }; + + std::vector expect_type = valid_type; + std::vector expect_index = valid_index; + int size_of_array = valid_type.size(); + + void SetUp() override { + } + void TearDown() override { + } +}; // orginal implementation. copy ghost TEST_F(TestFormatNlist, orig_cpy) @@ -417,4 +455,74 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); } } + +TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) +{ + int * valid_type_dev = NULL, * valid_index_dev = NULL, * out_type_dev = NULL, * out_index_dev = NULL; + double * valid_dist_dev = NULL; + uint_64 * key_dev = NULL; + std::vector out_type(size_of_array, 0); + std::vector out_index(size_of_array, 0); + std::vector key(size_of_array, 0); + deepmd::malloc_device_memory_sync(valid_type_dev, valid_type); + deepmd::malloc_device_memory_sync(valid_dist_dev, valid_dist); + deepmd::malloc_device_memory_sync(valid_index_dev, valid_index); + deepmd::malloc_device_memory_sync(out_type_dev, out_type); + deepmd::malloc_device_memory_sync(out_index_dev, out_index); + deepmd::malloc_device_memory_sync(key_dev, key); + + deepmd::test_encoding_decoding_nbor_info_gpu_cuda( + key_dev, out_type_dev, out_index_dev, + valid_type_dev, valid_dist_dev, valid_index_dev, size_of_array + ); + + deepmd::memcpy_device_to_host(key_dev, key); + deepmd::memcpy_device_to_host(out_type_dev, out_type); + deepmd::memcpy_device_to_host(out_index_dev, out_index); + deepmd::delete_device_memory(valid_type_dev); + deepmd::delete_device_memory(valid_dist_dev); + deepmd::delete_device_memory(valid_index_dev); + deepmd::delete_device_memory(out_type_dev); + deepmd::delete_device_memory(out_index_dev); + deepmd::delete_device_memory(key_dev); + // validate + for(int ii = 0; ii < size_of_array; ii++) { + EXPECT_EQ(key[ii], expect_key[ii]); + EXPECT_EQ(out_type[ii], expect_type[ii]); + EXPECT_EQ(out_index[ii], expect_index[ii]); + } +} + +// TEST_F(TestEncodingDecodingNborInfo, invalid_nbor_info_gpu_cuda) +// { +// int * invalid_type_dev = NULL, * invalid_index_dev = NULL, * out_type_dev = NULL, * out_index_dev = NULL; +// double * invalid_dist_dev = NULL; +// uint_64 * key_dev = NULL; +// std::vector out_type(size_of_array, 0); +// std::vector out_index(size_of_array, 0); +// std::vector key(size_of_array, 0); +// deepmd::malloc_device_memory_sync(invalid_type_dev, invalid_type); +// deepmd::malloc_device_memory_sync(invalid_dist_dev, invalid_dist); +// deepmd::malloc_device_memory_sync(invalid_index_dev, invalid_index); +// deepmd::malloc_device_memory_sync(out_type_dev, out_type); +// deepmd::malloc_device_memory_sync(out_index_dev, out_index); +// deepmd::malloc_device_memory_sync(key_dev, key); + +// EXPECT_EQ(cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess, true); +// deepmd::test_encoding_decoding_nbor_info_gpu_cuda( +// key_dev, out_type_dev, out_index_dev, +// invalid_type_dev, invalid_dist_dev, invalid_index_dev, size_of_array +// ); +// EXPECT_EQ(cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess, false); +// cudaErrcheck(cudaDeviceReset()); +// deepmd::memcpy_device_to_host(key_dev, key); +// deepmd::memcpy_device_to_host(out_type_dev, out_type); +// deepmd::memcpy_device_to_host(out_index_dev, out_index); +// deepmd::delete_device_memory(invalid_type_dev); +// deepmd::delete_device_memory(invalid_dist_dev); +// deepmd::delete_device_memory(invalid_index_dev); +// deepmd::delete_device_memory(out_type_dev); +// deepmd::delete_device_memory(out_index_dev); +// deepmd::delete_device_memory(key_dev); +// } #endif // GOOGLE_CUDA \ No newline at end of file