diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu index 97321e74e8..62c7ce8926 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/cuda/prod_force.cu @@ -50,12 +50,11 @@ __global__ void force_deriv_wrt_neighbors_a( const int nnei) { // idy -> nnei - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idy = blockIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; const unsigned int idz = threadIdx.y; - const unsigned int idw = threadIdx.z; const int ndescrpt = nnei * 4; - if (idx >= nloc) { + if (idy >= nnei) { return; } // deriv wrt neighbors @@ -63,9 +62,11 @@ __global__ void force_deriv_wrt_neighbors_a( if (j_idx < 0) { return; } - atomicAdd( - force + j_idx * 3 + idz, - net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); + FPTYPE force_tmp = 0.f; + for (int idw = 0; idw < 4; ++idw) { + force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; + } + atomicAdd(force + j_idx * 3 + idz, force_tmp); } template @@ -78,11 +79,11 @@ __global__ void force_deriv_wrt_neighbors_r( const int nnei) { // idy -> nnei - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idy = blockIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; const unsigned int idz = threadIdx.y; const int ndescrpt = nnei * 1; - if (idx >= nloc) { + if (idy >= nnei) { return; } // deriv wrt neighbors @@ -116,9 +117,9 @@ void prod_force_a_gpu_cuda( net_deriv, in_deriv, ndescrpt); const int LEN = 64; - const int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 3, 4); + const int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 3); force_deriv_wrt_neighbors_a<<>>( force, net_deriv, in_deriv, nlist, nloc, nnei); @@ -144,8 +145,8 @@ void prod_force_r_gpu_cuda( net_deriv, in_deriv, ndescrpt); const int LEN = 64; - const int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); + const int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); dim3 thread_grid(LEN, 3); force_deriv_wrt_neighbors_r<<>>( force, diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu index addb4df92a..08a64d71fe 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/cuda/prod_virial.cu @@ -45,12 +45,11 @@ __global__ void virial_deriv_wrt_neighbors_a( // idz = dd0 * 3 + dd1 // dd0 = idz / 3 // dd1 = idz % 3 - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idy = blockIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; const unsigned int idz = threadIdx.y; - const unsigned int idw = threadIdx.z; const int ndescrpt = nnei * 4; - if (idx >= nloc) { + if (idy >= nnei) { return; } int j_idx = nlist[idx * nnei + idy]; @@ -60,9 +59,11 @@ __global__ void virial_deriv_wrt_neighbors_a( // atomicAdd( // virial + idz, // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); - atomicAdd( - atom_virial + j_idx * 9 + idz, - net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); + FPTYPE virial_tmp = 0.f; + for (int idw = 0; idw < 4; ++idw) { + virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; + } + atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); } template @@ -81,12 +82,12 @@ __global__ void virial_deriv_wrt_neighbors_r( // idz = dd0 * 3 + dd1 // dd0 = idz / 3 // dd1 = idz % 3 - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idy = blockIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; const unsigned int idz = threadIdx.y; const int ndescrpt = nnei * 1; - if (idx >= nloc) { + if (idy >= nnei) { return; } int j_idx = nlist[idx * nnei + idy]; @@ -122,9 +123,9 @@ void prod_virial_a_gpu_cuda( 0.0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; - int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 9, 4); + int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 9); // compute virial of a frame virial_deriv_wrt_neighbors_a<<>>( virial, atom_virial, @@ -155,8 +156,8 @@ void prod_virial_r_gpu_cuda( 0.0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; - int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); + int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); dim3 thread_grid(LEN, 9); // compute virial of a frame virial_deriv_wrt_neighbors_r<<>>(