diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index c446db8130..f7fd7c2496 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -169,7 +169,7 @@ void compute_descriptor_se_a_cpu ( } template -void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { // set & normalize coord std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { @@ -235,8 +235,8 @@ void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template -void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA // ****************************************************************************** @@ -432,7 +432,7 @@ void compute_descriptor_se_r_cpu ( } template -void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { // set & normalize coord std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { @@ -498,8 +498,8 @@ void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template -void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA // ****************************************************************************** diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 5965254111..a528c4c477 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -84,9 +84,9 @@ __global__ void format_nlist_fill_a_se_a(const FPTYPE * coord, const float rcut, int_64 * key, int * i_idx, - const int MAGIC_NUMBER) + const int MAX_NBOR_SIZE) { - // <<>> + // <<>> const unsigned int idx = blockIdx.x; const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; @@ -98,7 +98,7 @@ __global__ void format_nlist_fill_a_se_a(const FPTYPE * coord, const int * nei_idx = jlist + jrange[i_idx[idx]]; // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); - int_64 * key_in = key + idx * MAGIC_NUMBER; + int_64 * key_in = key + idx * MAX_NBOR_SIZE; FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; @@ -121,7 +121,7 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, const int * sec_a, const int sec_a_size, int * nei_iter_dev, - const int MAGIC_NUMBER) + const int MAX_NBOR_SIZE) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -132,13 +132,13 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, int * row_nlist = nlist + idy * nlist_size; int * nei_iter = nei_iter_dev + idy * sec_a_size; - int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; + int_64 * key_out = key + nloc * MAX_NBOR_SIZE + idy * MAX_NBOR_SIZE; for (int ii = 0; ii < sec_a_size; ii++) { nei_iter[ii] = sec_a[ii]; } - for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { + for (unsigned int kk = 0; key_out[kk] != key_out[MAX_NBOR_SIZE - 1]; kk++) { const int & nei_type = key_out[kk] / 1E15; if (nei_iter[nei_type] < sec_a[nei_type + 1]) { row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; @@ -228,73 +228,6 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript, } } -template -void format_nbor_list_256 ( - const FPTYPE* coord, - const int* type, - const int* jrange, - const int* jlist, - const int& nloc, - const float& rcut_r, - int * i_idx, - int_64 * key -) -{ - const int LEN = 256; - const int MAGIC_NUMBER = 256; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - format_nlist_fill_a_se_a - <<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx, - MAGIC_NUMBER - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); -} - -template -void format_nbor_list_512 ( - const FPTYPE* coord, - const int* type, - const int* jrange, - const int* jlist, - const int& nloc, - const float& rcut_r, - int * i_idx, - int_64 * key -) -{ - const int LEN = 256; - const int MAGIC_NUMBER = 512; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - format_nlist_fill_a_se_a - <<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx, - MAGIC_NUMBER - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); -} template void format_nbor_list_1024 ( @@ -309,8 +242,8 @@ void format_nbor_list_1024 ( ) { const int LEN = 256; - const int MAGIC_NUMBER = 1024; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + const int MAX_NBOR_SIZE = 1024; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a @@ -322,12 +255,12 @@ void format_nbor_list_1024 ( rcut_r, key, i_idx, - MAGIC_NUMBER + MAX_NBOR_SIZE ); const int ITEMS_PER_THREAD = 8; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + BlockSortKernel <<>> (key, key + nloc * MAX_NBOR_SIZE); } template @@ -343,8 +276,8 @@ void format_nbor_list_2048 ( ) { const int LEN = 256; - const int MAGIC_NUMBER = 2048; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + const int MAX_NBOR_SIZE = 2048; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a @@ -356,12 +289,12 @@ void format_nbor_list_2048 ( rcut_r, key, i_idx, - MAGIC_NUMBER + MAX_NBOR_SIZE ); const int ITEMS_PER_THREAD = 8; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + BlockSortKernel <<>> (key, key + nloc * MAX_NBOR_SIZE); } template @@ -377,8 +310,8 @@ void format_nbor_list_4096 ( ) { const int LEN = 256; - const int MAGIC_NUMBER = 4096; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + const int MAX_NBOR_SIZE = 4096; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a @@ -390,16 +323,16 @@ void format_nbor_list_4096 ( rcut_r, key, i_idx, - MAGIC_NUMBER + MAX_NBOR_SIZE ); const int ITEMS_PER_THREAD = 16; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + BlockSortKernel <<>> (key, key + nloc * MAX_NBOR_SIZE); } template -void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { +void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -409,7 +342,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const cudaError_t res = cudaSuccess; res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); - res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); @@ -419,29 +352,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const // cudaProfilerStart(); get_i_idx_se_a<<>> (nloc, ilist, i_idx); - if (nnei <= 256) { - format_nbor_list_256 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 512) { - format_nbor_list_512 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 1024) { + if (max_nbor_size <= 1024) { format_nbor_list_1024 ( coord, type, @@ -452,7 +363,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const i_idx, key ); - } else if (nnei <= 2048) { + } else if (max_nbor_size <= 2048) { format_nbor_list_2048 ( coord, type, @@ -463,7 +374,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const i_idx, key ); - } else if (nnei <= 4096) { + } else if (max_nbor_size <= 4096) { format_nbor_list_4096 ( coord, type, @@ -486,7 +397,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const sec_a_dev, sec_a.size(), nei_iter, - MAGIC_NUMBER + max_nbor_size ); } diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index a65ba5887a..0715f19c5e 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -84,9 +84,9 @@ __global__ void format_nlist_fill_a_se_r(const FPTYPE * coord, const float rcut, int_64 * key, int * i_idx, - const int MAGIC_NUMBER) + const int MAX_NBOR_SIZE) { - // <<>> + // <<>> const unsigned int idx = blockIdx.x; const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; @@ -98,7 +98,7 @@ __global__ void format_nlist_fill_a_se_r(const FPTYPE * coord, const int * nei_idx = jlist + jrange[i_idx[idx]]; // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize); - int_64 * key_in = key + idx * MAGIC_NUMBER; + int_64 * key_in = key + idx * MAX_NBOR_SIZE; FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; @@ -121,7 +121,7 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, const int * sec_a, const int sec_a_size, int * nei_iter_dev, - const int MAGIC_NUMBER) + const int MAX_NBOR_SIZE) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -132,13 +132,13 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, int * row_nlist = nlist + idy * nlist_size; int * nei_iter = nei_iter_dev + idy * sec_a_size; - int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; + int_64 * key_out = key + nloc * MAX_NBOR_SIZE + idy * MAX_NBOR_SIZE; for (int ii = 0; ii < sec_a_size; ii++) { nei_iter[ii] = sec_a[ii]; } - for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { + for (unsigned int kk = 0; key_out[kk] != key_out[MAX_NBOR_SIZE - 1]; kk++) { const int & nei_type = key_out[kk] / 1E15; if (nei_iter[nei_type] < sec_a[nei_type + 1]) { row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; @@ -210,73 +210,6 @@ __global__ void compute_descriptor_se_r (FPTYPE* descript, } } -template -void format_nbor_list_256 ( - const FPTYPE* coord, - const int* type, - const int* jrange, - const int* jlist, - const int& nloc, - const float& rcut_r, - int * i_idx, - int_64 * key -) -{ - const int LEN = 256; - const int MAGIC_NUMBER = 256; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - format_nlist_fill_a_se_r - <<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx, - MAGIC_NUMBER - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); -} - -template -void format_nbor_list_512 ( - const FPTYPE* coord, - const int* type, - const int* jrange, - const int* jlist, - const int& nloc, - const float& rcut_r, - int * i_idx, - int_64 * key -) -{ - const int LEN = 256; - const int MAGIC_NUMBER = 512; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(1, LEN); - format_nlist_fill_a_se_r - <<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx, - MAGIC_NUMBER - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); -} template void format_nbor_list_1024 ( @@ -291,8 +224,8 @@ void format_nbor_list_1024 ( ) { const int LEN = 256; - const int MAGIC_NUMBER = 1024; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + const int MAX_NBOR_SIZE = 1024; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a_se_r @@ -304,12 +237,12 @@ void format_nbor_list_1024 ( rcut_r, key, i_idx, - MAGIC_NUMBER + MAX_NBOR_SIZE ); const int ITEMS_PER_THREAD = 8; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + BlockSortKernel <<>> (key, key + nloc * MAX_NBOR_SIZE); } template @@ -325,8 +258,8 @@ void format_nbor_list_2048 ( ) { const int LEN = 256; - const int MAGIC_NUMBER = 2048; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + const int MAX_NBOR_SIZE = 2048; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a_se_r @@ -338,12 +271,12 @@ void format_nbor_list_2048 ( rcut_r, key, i_idx, - MAGIC_NUMBER + MAX_NBOR_SIZE ); const int ITEMS_PER_THREAD = 8; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + BlockSortKernel <<>> (key, key + nloc * MAX_NBOR_SIZE); } template @@ -359,8 +292,8 @@ void format_nbor_list_4096 ( ) { const int LEN = 256; - const int MAGIC_NUMBER = 4096; - const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + const int MAX_NBOR_SIZE = 4096; + const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, LEN); format_nlist_fill_a_se_r @@ -372,16 +305,16 @@ void format_nbor_list_4096 ( rcut_r, key, i_idx, - MAGIC_NUMBER + MAX_NBOR_SIZE ); const int ITEMS_PER_THREAD = 16; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + BlockSortKernel <<>> (key, key + nloc * MAX_NBOR_SIZE); } template -void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { +void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -391,7 +324,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const cudaError_t res = cudaSuccess; res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); - res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); @@ -401,29 +334,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const // cudaProfilerStart(); get_i_idx_se_r<<>> (nloc, ilist, i_idx); - if (nnei <= 256) { - format_nbor_list_256 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 512) { - format_nbor_list_512 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 1024) { + if (max_nbor_size <= 1024) { format_nbor_list_1024 ( coord, type, @@ -434,7 +345,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const i_idx, key ); - } else if (nnei <= 2048) { + } else if (max_nbor_size <= 2048) { format_nbor_list_2048 ( coord, type, @@ -445,7 +356,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const i_idx, key ); - } else if (nnei <= 4096) { + } else if (max_nbor_size <= 4096) { format_nbor_list_4096 ( coord, type, @@ -468,7 +379,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const sec_a_dev, sec_a.size(), nei_iter, - MAGIC_NUMBER + max_nbor_size ); } diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc index ae5e623171..93e2cdccac 100644 --- a/source/op/descrpt_se_a_multi_device.cc +++ b/source/op/descrpt_se_a_multi_device.cc @@ -34,13 +34,13 @@ struct DeviceFunctor { template struct DescrptSeAFunctor { - void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA }; @@ -66,7 +66,7 @@ class DescrptSeAOp : public OpKernel { nnei_r = sec_r.back(); nnei = nnei_a + nnei_r; fill_nei_a = (rcut_a < 0); - magic_number = get_magic_number(nnei); + max_nbor_size = 1024; } void Compute(OpKernelContext* context) override { @@ -117,7 +117,6 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); // Create output tensors TensorShape descrpt_shape ; @@ -159,13 +158,14 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); Tensor uint64_temp; TensorShape uint64_shape; - uint64_shape.AddDim(nloc * magic_number * 2); + uint64_shape.AddDim(nloc * max_nbor_size * 2); OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); array_int = int_temp.flat().data(); array_longlong = uint64_temp.flat().data(); nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + OP_REQUIRES (context, (max_nbor_size <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit.")); } else if (device == "CPU") { memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); @@ -198,7 +198,7 @@ class DescrptSeAOp : public OpKernel { rcut_r_smth, sec_a, fill_nei_a, - magic_number + max_nbor_size ); } @@ -212,7 +212,7 @@ class DescrptSeAOp : public OpKernel { std::vector sec_a; std::vector sec_r; int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r, nloc, nall, magic_number; + int nnei, nnei_a, nnei_r, nloc, nall, max_nbor_size; bool fill_nei_a; //private func @@ -266,27 +266,15 @@ class DescrptSeAOp : public OpKernel { cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + + max_nbor_size = 1024; + for(int ii = 0; ii < mesh_host[2]; ii++) { + max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size; + } } delete [] mesh_host; } - int get_magic_number(int const nnei) { - if (nnei <= 256) { - return 256; - } - else if (nnei <= 512) { - return 512; - } - else if (nnei <= 1024) { - return 1024; - } - else if (nnei <= 2048) { - return 2048; - } - else if (nnei <= 4096) { - return 4096; - } - } }; // Register the CPU kernels. diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc index c5eaff616c..b94f97d6e1 100644 --- a/source/op/descrpt_se_r_multi_device.cc +++ b/source/op/descrpt_se_r_multi_device.cc @@ -31,13 +31,13 @@ struct DeviceFunctor { template struct DescrptSeRFunctor { - void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA }; @@ -55,9 +55,7 @@ class DescrptSeROp : public OpKernel { ndescrpt = sec.back() * 1; nnei = sec.back(); fill_nei_a = true; - magic_number = get_magic_number(nnei); - // count_nei_idx_overflow = 0; - // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + max_nbor_size = 1024; } void Compute(OpKernelContext* context) override { @@ -149,13 +147,14 @@ class DescrptSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); Tensor uint64_temp; TensorShape uint64_shape; - uint64_shape.AddDim(nloc * magic_number * 2); + uint64_shape.AddDim(nloc * max_nbor_size * 2); OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); array_int = int_temp.flat().data(); array_longlong = uint64_temp.flat().data(); nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + OP_REQUIRES (context, (max_nbor_size <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit.")); } else if (device == "CPU") { memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); @@ -188,7 +187,7 @@ class DescrptSeROp : public OpKernel { rcut_smth, sec, fill_nei_a, - magic_number + max_nbor_size ); } @@ -213,7 +212,7 @@ class DescrptSeROp : public OpKernel { } } - int magic_number; + int max_nbor_size; std::string device; int *array_int; unsigned long long*array_longlong; @@ -256,27 +255,15 @@ class DescrptSeROp : public OpKernel { cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + + max_nbor_size = 1024; + for(int ii = 0; ii < mesh_host[2]; ii++) { + max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size; + } } delete [] mesh_host; } - int get_magic_number(int const nnei) { - if (nnei <= 256) { - return 256; - } - else if (nnei <= 512) { - return 512; - } - else if (nnei <= 1024) { - return 1024; - } - else if (nnei <= 2048) { - return 2048; - } - else if (nnei <= 4096) { - return 4096; - } - } }; // Register the CPU kernels.