diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index 69a0ecf7d1..a32940a738 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -93,10 +93,8 @@ class NNPInter compute_t *array_double; InternalNeighborList nlist; NNPAtomMap nnpmap; - unsigned long long *array_longlong; - int *ilist, *jrange, *jlist, *array_int; + int *ilist, *jrange, *jlist; int ilist_size, jrange_size, jlist_size; - int arr_int_size, arr_ll_size, arr_dou_size; // function used for neighbor list copy vector get_sel_a() const; @@ -191,13 +189,10 @@ class NNPInterModelDevi vector > sec; InternalNeighborList nlist; NNPAtomMap nnpmap; - unsigned long long *array_longlong; - int max_sec_size = 0, max_sec_back = 0; - int *ilist, *jrange, *jlist, *array_int; - int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size; + int *ilist, *jrange, *jlist; + int ilist_size, jrange_size, jlist_size; // function used for nborlist copy - void get_max_sec(); vector > get_sel() const; void cum_sum(const std::vector > n_sel); #ifdef USE_CUDA_TOOLKIT diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 8db7d817ce..3912f21f7f 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -144,9 +144,6 @@ session_input_tensors (vector>& input_tensors, const int * ilist, const int * jrange, const int * jlist, - int * array_int, - unsigned long long * array_longlong, - double * array_double, const vector & fparam_, const vector & aparam_, const NNPAtomMap & nnpmap, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index f4f39945ff..f526cad911 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -3,7 +3,6 @@ #include "SimulationRegion.h" #include -#define MAGIC_NUMBER 1024 #ifdef USE_CUDA_TOOLKIT #include "cuda_runtime.h" @@ -14,7 +13,7 @@ #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) + if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); @@ -273,9 +272,6 @@ NNPInter::~NNPInter() { cudaErrcheck(cudaFree(ilist)); cudaErrcheck(cudaFree(jrange)); cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaFree(array_double)); } #endif } @@ -283,24 +279,12 @@ NNPInter::~NNPInter() { #ifdef USE_CUDA_TOOLKIT void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { if (!init_nbor) { - sec_a = cum_sum(get_sel_a()); cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #endif ilist_size = nlist.ilist.size(); jrange_size = nlist.jrange.size(); jlist_size = nlist.jlist.size(); - arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; - arr_ll_size = nloc * MAGIC_NUMBER * 2; - arr_dou_size = nloc * sec_a.back() * 3; - init_nbor = true; } if (ilist_size < nlist.ilist.size()) { cudaErrcheck(cudaFree(ilist)); @@ -317,25 +301,7 @@ void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); jlist_size = nlist.jlist.size(); } - if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) { - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); - arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; - } - if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - arr_ll_size = nloc * MAGIC_NUMBER * 2; - } - if (arr_dou_size < nloc * sec_a.back() * 3) { - cudaErrcheck(cudaFree(array_double)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #endif - arr_dou_size = nloc * sec_a.back() * 3; - } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); @@ -378,14 +344,10 @@ init (const string & model, const int & gpu_rank) if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -415,12 +377,8 @@ init (const string & model, const int & gpu_rank) inited = true; init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -602,7 +560,7 @@ compute_inner (ENERGYTYPE & dener, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -669,7 +627,7 @@ compute (ENERGYTYPE & dener, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -710,9 +668,6 @@ NNPInterModelDevi::~NNPInterModelDevi() { cudaErrcheck(cudaFree(ilist)); cudaErrcheck(cudaFree(jrange)); cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaFree(array_double)); } #endif } @@ -761,14 +716,10 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -798,14 +749,10 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -873,41 +820,18 @@ cum_sum (const std::vector > n_sel) } } -void -NNPInterModelDevi:: -get_max_sec() -{ - for (int ii = 0; ii < numb_models; ii++) { - this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size; - this->max_sec_back = max_sec_back < sec[ii].back() ? sec[ii].back() : max_sec_back; - } -} - #ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: update_nbor(const InternalNeighborList & nlist, const int nloc) { if (!init_nbor) { - cum_sum(get_sel()); - get_max_sec(); cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #endif ilist_size = nlist.ilist.size(); jrange_size = nlist.jrange.size(); jlist_size = nlist.jlist.size(); - arr_int_size = max_sec_size + nloc * max_sec_size + nloc; - arr_ll_size = nloc * MAGIC_NUMBER * 2; - arr_dou_size = nloc * max_sec_back * 3; - init_nbor = true; } if (ilist_size < nlist.ilist.size()) { cudaErrcheck(cudaFree(ilist)); @@ -924,25 +848,7 @@ update_nbor(const InternalNeighborList & nlist, const int nloc) cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); jlist_size = nlist.jlist.size(); } - if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) { - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); - arr_int_size = max_sec_size + nloc * max_sec_size + nloc; - } - if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - arr_ll_size = nloc * MAGIC_NUMBER * 2; - } - if (arr_dou_size < nloc * max_sec_back * 3) { - cudaErrcheck(cudaFree(array_double)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #endif - arr_dou_size = nloc * max_sec_back * 3; - } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); @@ -1044,7 +950,7 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -1094,7 +1000,7 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index 296800a396..dd4c3f672a 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -479,9 +479,6 @@ session_input_tensors ( const int * ilist, const int * jrange, const int * jlist, - int * array_int, - unsigned long long * array_longlong, - double * array_double, const vector & fparam_, const vector & aparam_, const NNPAtomMap & nnpmap, @@ -511,7 +508,7 @@ session_input_tensors ( box_shape.AddDim (nframes); box_shape.AddDim (9); TensorShape mesh_shape; - mesh_shape.AddDim (32); + mesh_shape.AddDim (16); TensorShape natoms_shape; natoms_shape.AddDim (2 + ntypes); TensorShape fparam_shape; @@ -565,7 +562,7 @@ session_input_tensors ( } } - for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; mesh (0) = sizeof(int *) / sizeof(int); assert (mesh(0) * sizeof(int) == sizeof(int *)); @@ -577,9 +574,6 @@ session_input_tensors ( memcpy (&mesh(4), &(ilist), sizeof(int *)); memcpy (&mesh(8), &(jrange), sizeof(int *)); memcpy (&mesh(12), &(jlist), sizeof(int *)); - memcpy (&mesh(16), &(array_int), sizeof(int *)); - memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); - memcpy (&mesh(24), &(array_double), sizeof(double *)); natoms (0) = nloc; natoms (1) = nall; diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 4b309522fa..1636df8ff5 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -18,8 +18,6 @@ limitations under the License. #include #include -#define MAGIC_NUMBER 1024 - #ifdef HIGH_PREC typedef double VALUETYPE; #else @@ -118,11 +116,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -154,7 +153,8 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, int_64 * key, const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -194,7 +194,6 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, const VALUETYPE* coord, const VALUETYPE rmin, const VALUETYPE rmax, - compute_t* sel_a_diff_dev, const int sec_a_size) { // <<>> @@ -208,16 +207,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, VALUETYPE * row_descript = descript + idx * ndescrpt; VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_a_diff = sel_a_diff_dev + idx * nlist_size * 3; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_a_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_a_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_a_diff[idy * 3 + 0]; + const compute_t * rr = &row_rij[idy * 3 + 0]; compute_t nr2 = dev_dot(rr, rr); compute_t inr = 1./sqrt(nr2); compute_t nr = nr2 * inr; @@ -263,6 +260,166 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +void format_nbor_list_256 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_512 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_1024 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_2048 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_4096 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + void DescrptSeALauncher(const VALUETYPE* coord, const int* type, const int* ilist, @@ -270,22 +427,20 @@ void DescrptSeALauncher(const VALUETYPE* coord, const int* jlist, int* array_int, unsigned long long* array_longlong, - compute_t* array_double, const VALUETYPE* avg, const VALUETYPE* std, VALUETYPE* descript, VALUETYPE* descript_deriv, VALUETYPE* rij, int* nlist, - const int& ntypes, const int& nloc, - const int& nall, const int& nnei, const float& rcut_r, const float& rcut_r_smth, const int& ndescrpt, const std::vector& sec_a, - const bool& fill_nei_a + const bool& fill_nei_a, + const int MAGIC_NUMBER ) { const int LEN = 256; @@ -294,42 +449,76 @@ void DescrptSeALauncher(const VALUETYPE* coord, int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_a_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei - // int_64 * key = NULL; - // VALUETYPE * sel_a_diff = NULL; // = new VALUETYPE *[nlist_size]; nnei - + cudaError_t res = cudaSuccess; - // res = cudaMalloc((void**)&sec_a_dev, sizeof(int) * sec_a.size()); cudaErrcheck(res); - // res = cudaMalloc((void**)&nei_iter, sizeof(int) * nloc * sec_a.size()); cudaErrcheck(res); - // res = cudaMalloc((void**)&i_idx, sizeof(int) * nloc); cudaErrcheck(res); - // res = cudaMalloc((void**)&key, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2); cudaErrcheck(res); - // res = cudaMalloc((void**)&sel_a_diff, sizeof(VALUETYPE) * nnei * 3 * nloc); cudaErrcheck(res); res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); - // res = cudaMemset(rij, 0.0, sizeof(VALUETYPE) * nloc * nnei * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ // cudaProfilerStart(); get_i_idx_se_a<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_a<<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); - + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_a<<>> ( nlist, nnei, @@ -339,7 +528,8 @@ void DescrptSeALauncher(const VALUETYPE* coord, key, sec_a_dev, sec_a.size(), - nei_iter + nei_iter, + MAGIC_NUMBER ); } @@ -360,14 +550,6 @@ void DescrptSeALauncher(const VALUETYPE* coord, coord, rcut_r_smth, rcut_r, - sel_a_diff, sec_a.back() ); -//// - // res = cudaFree(sec_a_dev); cudaErrcheck(res); - // res = cudaFree(key); cudaErrcheck(res); - // res = cudaFree(i_idx); cudaErrcheck(res); - // res = cudaFree(nei_iter); cudaErrcheck(res); - // res = cudaFree(sel_a_diff); cudaErrcheck(res); - //output some interesting things... } \ No newline at end of file diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc index 70dd9c7751..fd5ae632cb 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_gpu.cc @@ -67,6 +67,24 @@ REGISTER_OP("DescrptSeA") .Output("nlist: int32"); #endif +int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } +} + void DescrptSeALauncher(const VALUETYPE* coord, const int* type, const int* ilist, @@ -74,22 +92,20 @@ void DescrptSeALauncher(const VALUETYPE* coord, const int* jlist, int* array_int, unsigned long long* array_longlong, - compute_t* array_double, const VALUETYPE* avg, const VALUETYPE* std, VALUETYPE* descript, VALUETYPE* descript_deriv, VALUETYPE* rij, int* nlist, - const int& ntypes, const int& nloc, - const int& nall, const int& nnei, const float& rcut_r, const float& rcut_r_smth, const int& ndescrpt, const std::vector& sec_a, - const bool& fill_nei_a + const bool& fill_nei_a, + const int MAGIC_NUMBER ); class DescrptSeAOp : public OpKernel { @@ -112,6 +128,7 @@ class DescrptSeAOp : public OpKernel { nnei_r = sec_r.back(); nnei = nnei_a + nnei_r; fill_nei_a = (rcut_a < 0); + magic_number = get_magic_number(nnei); } void Compute(OpKernelContext* context) override { @@ -158,7 +175,7 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (nnei <= 1024), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 1024!, which currently is not supported by deepmd-kit.")); + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); // Create output tensors TensorShape descrpt_shape ; @@ -192,14 +209,23 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int * array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + // allocate temp memory, temp memory must not be used after this operation! + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int * array_int = int_temp.flat().data(); + unsigned long long * array_longlong = uint64_temp.flat().data(); cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); // Launch computation for (int II = 0; II < nsamples; II++) { @@ -210,22 +236,20 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, - array_double, avg_tensor.matrix().data(), std_tensor.matrix().data(), descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), rij_tensor->matrix().data() + II * (nloc * nnei * 3), nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, nloc, - nall, nnei, rcut_r, rcut_r_smth, ndescrpt, sec_a, - fill_nei_a + fill_nei_a, + magic_number ); } // std::cout << "done" << std::endl; @@ -242,7 +266,7 @@ class DescrptSeAOp : public OpKernel { std::vector sec_a; std::vector sec_r; int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r, nloc, nall; + int nnei, nnei_a, nnei_r, nloc, nall, magic_number; bool fill_nei_a; //private func