Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 95 additions & 20 deletions source/source_pw/module_pwdft/forces.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
{
ModuleBase::TITLE("Forces", "cal_force_loc");
ModuleBase::timer::tick("Forces", "cal_force_loc");

this->device = base_device::get_device_type<Device>(this->ctx);
std::complex<double>* aux = new std::complex<double>[rho_basis->nmaxgr];
// now, in all pools , the charge are the same,
// so, the force calculated by each pool is equal.
Expand Down Expand Up @@ -368,30 +368,105 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
// to G space. maybe need fftw with OpenMP
rho_basis->real2recip(aux, aux);

#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int iat = 0; iat < this->nat; ++iat)
std::vector<double> tau_h;
std::vector<double> gcar_h;
if(this->device == base_device::GpuDevice)
{
// read `it` `ia` from the table
int it = ucell.iat2it[iat];
int ia = ucell.iat2ia[iat];
for (int ig = 0; ig < rho_basis->npw; ig++)
tau_h.resize(this->nat * 3);
for(int iat = 0; iat < this->nat; ++iat)
{
const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
double sinp, cosp;
ModuleBase::libm::sincos(phase, &sinp, &cosp);
const double factor
= vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
int it = ucell.iat2it[iat];
int ia = ucell.iat2ia[iat];
tau_h[iat * 3] = ucell.atoms[it].tau[ia].x;
tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y;
tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z;
}
forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);

gcar_h.resize(rho_basis->npw * 3);
for(int ig = 0; ig < rho_basis->npw; ++ig)
{
gcar_h[ig * 3] = rho_basis->gcar[ig].x;
gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y;
gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z;
}
}
int* iat2it_d = nullptr;
int* ig2gg_d = nullptr;
double* gcar_d = nullptr;
double* tau_d = nullptr;
std::complex<double>* aux_d = nullptr;
double* forcelc_d = nullptr;
double* vloc_d = nullptr;
if(this->device == base_device::GpuDevice)
{
resmem_int_op()(iat2it_d, this->nat);
resmem_int_op()(ig2gg_d, rho_basis->npw);
resmem_var_op()(gcar_d, rho_basis->npw * 3);
resmem_var_op()(tau_d, this->nat * 3);
resmem_complex_op()(aux_d, rho_basis->npw);
resmem_var_op()(forcelc_d, this->nat * 3);
resmem_var_op()(vloc_d, vloc.nr * vloc.nc);

syncmem_int_h2d_op()(iat2it_d, ucell.iat2it, this->nat);
syncmem_int_h2d_op()(ig2gg_d, rho_basis->ig2igg, rho_basis->npw);
syncmem_var_h2d_op()(gcar_d, gcar_h.data(), rho_basis->npw * 3);
syncmem_var_h2d_op()(tau_d, tau_h.data(), this->nat * 3);
syncmem_complex_h2d_op()(aux_d, aux, rho_basis->npw);
syncmem_var_h2d_op()(forcelc_d, forcelc.c, this->nat * 3);
syncmem_var_h2d_op()(vloc_d, vloc.c, vloc.nr * vloc.nc);
}

if(this->device == base_device::GpuDevice)
{
hamilt::cal_force_loc_op<FPTYPE, Device>()(
this->nat,
rho_basis->npw,
ucell.tpiba * ucell.omega,
iat2it_d,
ig2gg_d,
gcar_d,
tau_d,
aux_d,
vloc_d,
vloc.nc,
forcelc_d);
syncmem_var_d2h_op()(forcelc.c, forcelc_d, this->nat * 3);
}
else{
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int iat = 0; iat < this->nat; ++iat)
{
// read `it` `ia` from the table
int it = ucell.iat2it[iat];
int ia = ucell.iat2ia[iat];
for (int ig = 0; ig < rho_basis->npw; ig++)
{
const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
double sinp, cosp;
ModuleBase::libm::sincos(phase, &sinp, &cosp);
const double factor
= vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
}
forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
}
}
if(this->device == base_device::GpuDevice)
{
delmem_int_op()(iat2it_d);
delmem_int_op()(ig2gg_d);
delmem_var_op()(gcar_d);
delmem_var_op()(tau_d);
delmem_complex_op()(aux_d);
delmem_var_op()(forcelc_d);
delmem_var_op()(vloc_d);
}
// this->print(GlobalV::ofs_running, "local forces", forcelc);
Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
delete[] aux;
Expand Down
137 changes: 136 additions & 1 deletion source/source_pw/module_pwdft/kernels/cuda/force_op.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "source_pw/module_pwdft/kernels/force_op.h"
// #include "source_psi/kernels/device.h"
#include "source_base/module_device/types.h"
#include "source_base/constants.h"

#include <complex>

Expand All @@ -10,9 +11,19 @@
#include <source_base/module_device/device.h>

#define THREADS_PER_BLOCK 256
#define FULL_MASK 0xffffffff
#define WARP_SIZE 32

namespace hamilt {

template <typename FPTYPE>
__forceinline__
__device__
void warp_reduce(FPTYPE & val) {
for (int offset = 16; offset > 0; offset >>= 1) {
val += __shfl_down_sync(FULL_MASK, val, offset);
}
}

template <typename FPTYPE>
__global__ void cal_vkb1_nl(
Expand Down Expand Up @@ -606,15 +617,139 @@ void revertVkbValues(
cudaCheckOnDebug();
}

template <typename FPTYPE>
__global__ void force_loc_kernel(
const int nat,
const int npw,
const FPTYPE tpiba_omega,
const int* iat2it,
const int* ig2gg_d,
const FPTYPE* gcar_d,
const FPTYPE* tau_d,
const thrust::complex<FPTYPE>* aux_d,
const FPTYPE* vloc_d,
const int vloc_nc,
FPTYPE* forcelc_d)
{
const int iat = blockIdx.x;
const int tid = threadIdx.x;
const int warp_id = tid / WARP_SIZE;
const int lane_id = tid % WARP_SIZE;

if (iat >= nat) return;
const int it = iat2it[iat]; // get the type of atom

// Initialize force components
FPTYPE force_x = 0.0;
FPTYPE force_y = 0.0;
FPTYPE force_z = 0.0;

const auto tau_x = tau_d[iat * 3 + 0];
const auto tau_y = tau_d[iat * 3 + 1];
const auto tau_z = tau_d[iat * 3 + 2];

// Process all plane waves in chunks of blockDim.x
for (int ig = tid; ig < npw; ig += blockDim.x) {
const auto gcar_x = gcar_d[ig * 3 + 0];
const auto gcar_y = gcar_d[ig * 3 + 1];
const auto gcar_z = gcar_d[ig * 3 + 2];

// Calculate phase factor
const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x +
gcar_y * tau_y +
gcar_z * tau_z);
FPTYPE sinp, cosp;
sincos(phase, &sinp, &cosp);

// Get vloc value
const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]];

// Calculate factor
const auto aux_val = aux_d[ig];
const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real());

// Multiply by gcar components
force_x += gcar_x * factor;
force_y += gcar_y * factor;
force_z += gcar_z * factor;
}

// Warp-level reduction
warp_reduce<FPTYPE>(force_x);
warp_reduce<FPTYPE>(force_y);
warp_reduce<FPTYPE>(force_z);

// First thread in each warp writes to shared memory
__shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps
__shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE];
__shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE];

if (lane_id == 0) {
warp_sums_x[warp_id] = force_x;
warp_sums_y[warp_id] = force_y;
warp_sums_z[warp_id] = force_z;
}

__syncthreads();

// Final reduction by first warp
if (warp_id == 0) {
FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0;
FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0;
FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0;

warp_reduce<FPTYPE>(final_x);
warp_reduce<FPTYPE>(final_y);
warp_reduce<FPTYPE>(final_z);

if (lane_id == 0) {
forcelc_d[iat * 3 + 0] = final_x * tpiba_omega;
forcelc_d[iat * 3 + 1] = final_y * tpiba_omega;
forcelc_d[iat * 3 + 2] = final_z * tpiba_omega;
}
}
}

template <typename FPTYPE>
void cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const int nat,
const int npw,
const FPTYPE tpiba_omega,
const int* iat2it,
const int* ig2igg,
const FPTYPE* gcar,
const FPTYPE* tau,
const std::complex<FPTYPE>* aux,
const FPTYPE* vloc,
const int vloc_nc,
FPTYPE* forcelc)
{
force_loc_kernel<FPTYPE>
<<<nat, THREADS_PER_BLOCK>>>(nat,
npw,
tpiba_omega,
iat2it,
ig2igg,
gcar,
tau,
reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
vloc,
vloc_nc,
forcelc); // array of data

}


// for revertVkbValues functions instantiation
template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<double> *vkb_ptr, const std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex<double> coeff);
// for saveVkbValues functions instantiation
template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);

template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
template struct cal_force_loc_op<float, base_device::DEVICE_GPU>;

template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;

template struct cal_force_loc_op<double, base_device::DEVICE_GPU>;
} // namespace hamilt
30 changes: 30 additions & 0 deletions source/source_pw/module_pwdft/kernels/force_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,21 @@ struct cal_force_nl_op
FPTYPE* force);
};

template <typename FPTYPE, typename Device>
struct cal_force_loc_op{
void operator()(
const int nat,
const int npw,
const FPTYPE tpiba_omega,
const int* iat2it,
const int* ig2igg,
const FPTYPE* gcar,
const FPTYPE* tau,
const std::complex<FPTYPE>* aux,
const FPTYPE* vloc,
const int vloc_nr,
FPTYPE* forcelc) {};
};
#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
template <typename FPTYPE>
struct cal_vkb1_nl_op<FPTYPE, base_device::DEVICE_GPU>
Expand Down Expand Up @@ -275,6 +290,21 @@ void saveVkbValues(const int* gcar_zero_ptrs,
int ipol,
int npwx);

template <typename FPTYPE>
struct cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>{
void operator()(
const int nat,
const int npw,
const FPTYPE tpiba_omega,
const int* iat2it,
const int* ig2igg,
const FPTYPE* gcar,
const FPTYPE* tau,
const std::complex<FPTYPE>* aux,
const FPTYPE* vloc,
const int vloc_nr,
FPTYPE* forcelc);
};
#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
} // namespace hamilt
#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
Loading