diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp index 532bf7e561..3454a65963 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp @@ -22,7 +22,7 @@ #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h" template -Force_Stress_LCAO::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), f_pw(nat_in), nat(nat_in) +Force_Stress_LCAO::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), nat(nat_in) { } template @@ -861,24 +861,39 @@ void Force_Stress_LCAO::calForcePwPart(UnitCell& ucell, const Structure_Factor& sf) { ModuleBase::TITLE("Force_Stress_LCAO", "calForcePwPart"); - //-------------------------------------------------------- - // local pseudopotential force: - // use charge density; plane wave; local pseudopotential; - //-------------------------------------------------------- - f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr); - //-------------------------------------------------------- - // ewald force: use plane wave only. - //-------------------------------------------------------- - f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Forces f_pw(nat); + + //-------------------------------------------------------- + // local pseudopotential force: + // use charge density; plane wave; local pseudopotential; + //-------------------------------------------------------- + f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr); + //-------------------------------------------------------- + // ewald force: use plane wave only. + //-------------------------------------------------------- + f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem + + //-------------------------------------------------------- + // force due to core correlation. + //-------------------------------------------------------- + f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell); + //-------------------------------------------------------- + // force due to self-consistent charge. + //-------------------------------------------------------- + f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell); + } else +#endif + { + Forces f_pw(nat); + f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr); + f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem + f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell); + f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell); + } - //-------------------------------------------------------- - // force due to core correlation. - //-------------------------------------------------------- - f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell); - //-------------------------------------------------------- - // force due to self-consistent charge. - //-------------------------------------------------------- - f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell); return; } diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h index a11add920e..b55caa7935 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h @@ -60,7 +60,6 @@ class Force_Stress_LCAO Record_adj* RA; Force_LCAO flk; Stress_Func sc_pw; - Forces f_pw; void forceSymmetry(const UnitCell& ucell, ModuleBase::matrix& fcs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index a4d779366b..ecdf746019 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -20,15 +20,13 @@ #ifdef _OPENMP #include #endif -#ifdef USE_PAW -#include "module_cell/module_paw/paw_cell.h" -#endif + template void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix& force, const elecstate::ElecState& elec, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* const rho_basis, ModuleSymmetry::Symmetry* p_symm, Structure_Factor* p_sf, surchem& solvent, @@ -55,15 +53,7 @@ void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix forceonsite(nat, 3); // Force due to local ionic potential - // For PAW, calculated together in paw_cell.calculate_force - if (!PARAM.inp.use_paw) - { - this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr); - } - else - { - forcelc.zero_out(); - } + this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr); // Ewald this->cal_force_ew(ucell,forceion, rho_basis, p_sf); @@ -71,94 +61,15 @@ void Forces::cal_force(UnitCell& ucell, // Force due to nonlocal part of pseudopotential if (wfc_basis != nullptr) { - if (!PARAM.inp.use_paw) - { - this->npwx = wfc_basis->npwk_max; - Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in); - if (PARAM.globalv.use_uspp) - { - this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell); - } - } - else - { -#ifdef USE_PAW - for (int ik = 0; ik < wfc_basis->nks; ik++) - { - const int npw = wfc_basis->npwk[ik]; - ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; - for (int ig = 0; ig < npw; ig++) - { - _gk[ig] = wfc_basis->getgpluskcar(ik, ig); - } + this->npwx = wfc_basis->npwk_max; + Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in); - double* kpt; - kpt = new double[3]; - kpt[0] = wfc_basis->kvec_c[ik].x; - kpt[1] = wfc_basis->kvec_c[ik].y; - kpt[2] = wfc_basis->kvec_c[ik].z; - - double** kpg; - double** gcar; - kpg = new double*[npw]; - gcar = new double*[npw]; - for (int ipw = 0; ipw < npw; ipw++) - { - kpg[ipw] = new double[3]; - kpg[ipw][0] = _gk[ipw].x; - kpg[ipw][1] = _gk[ipw].y; - kpg[ipw][2] = _gk[ipw].z; - - gcar[ipw] = new double[3]; - gcar[ipw][0] = wfc_basis->getgcar(ik, ipw).x; - gcar[ipw][1] = wfc_basis->getgcar(ik, ipw).y; - gcar[ipw][2] = wfc_basis->getgcar(ik, ipw).z; - } - - GlobalC::paw_cell.set_paw_k(npw, - wfc_basis->npwk_max, - kpt, - wfc_basis->get_ig2ix(ik).data(), - wfc_basis->get_ig2iy(ik).data(), - wfc_basis->get_ig2iz(ik).data(), - (const double**)kpg, - ucell.tpiba, - (const double**)gcar); - - delete[] kpt; - for (int ipw = 0; ipw < npw; ipw++) - { - delete[] kpg[ipw]; - delete[] gcar[ipw]; - } - delete[] kpg; - delete[] gcar; - - GlobalC::paw_cell.get_vkb(); - - GlobalC::paw_cell.set_currentk(ik); - - psi_in[0].fix_k(ik); - double *weight, *epsilon; - weight = new double[PARAM.inp.nbands]; - epsilon = new double[PARAM.inp.nbands]; - for (int ib = 0; ib < PARAM.inp.nbands; ib++) - { - weight[ib] = wg(ik, ib); - epsilon[ib] = ekb(ik, ib); - } - GlobalC::paw_cell.paw_nl_force(reinterpret_cast*>(psi_in[0].get_pointer()), - epsilon, - weight, - PARAM.inp.nbands, - forcenl.c); - - delete[] weight; - delete[] epsilon; - } -#endif + if (PARAM.globalv.use_uspp) + { + this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell); } + // DFT+U and DeltaSpin if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) { @@ -167,26 +78,10 @@ void Forces::cal_force(UnitCell& ucell, } // non-linear core correction - // not relevant for PAW - if (!PARAM.inp.use_paw) - { - Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell); - } - else - { - forcecc.zero_out(); - } + Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell); // force due to core charge - // For PAW, calculated together in paw_cell.calculate_force - if (!PARAM.inp.use_paw) - { - this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell); - } - else - { - forcescc.zero_out(); - } + this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell); ModuleBase::matrix stress_vdw_pw; //.create(3,3); ModuleBase::matrix force_vdw; @@ -240,52 +135,6 @@ void Forces::cal_force(UnitCell& ucell, } } -#ifdef USE_PAW - if (PARAM.inp.use_paw) - { - double* force_paw; - double* rhor; - rhor = new double[rho_basis->nrxx]; - for (int ir = 0; ir < rho_basis->nrxx; ir++) - { - rhor[ir] = 0.0; - } - for (int is = 0; is < PARAM.inp.nspin; is++) - { - for (int ir = 0; ir < rho_basis->nrxx; ir++) - { - rhor[ir] += chr->rho[is][ir] + chr->nhat[is][ir]; - } - } - - force_paw = new double[3 * this->nat]; - ModuleBase::matrix v_xc, v_effective; - v_effective.create(PARAM.inp.nspin, rho_basis->nrxx); - v_effective.zero_out(); - elec.pot->update_from_charge(elec.charge, &ucell); - v_effective = elec.pot->get_effective_v(); - - v_xc.create(PARAM.inp.nspin, rho_basis->nrxx); - v_xc.zero_out(); - const std::tuple etxc_vtxc_v - = XC_Functional::v_xc(rho_basis->nrxx, elec.charge, &ucell); - v_xc = std::get<2>(etxc_vtxc_v); - - GlobalC::paw_cell.calculate_force(v_effective.c, v_xc.c, rhor, force_paw); - - for (int iat = 0; iat < this->nat; iat++) - { - // Ha to Ry - forcepaw(iat, 0) = force_paw[3 * iat] * 2.0; - forcepaw(iat, 1) = force_paw[3 * iat + 1] * 2.0; - forcepaw(iat, 2) = force_paw[3 * iat + 2] * 2.0; - } - - delete[] force_paw; - delete[] rhor; - } -#endif - // impose total force = 0 int iat = 0; for (int ipol = 0; ipol < 3; ipol++) @@ -300,11 +149,6 @@ void Forces::cal_force(UnitCell& ucell, force(iat, ipol) = forcelc(iat, ipol) + forceion(iat, ipol) + forcenl(iat, ipol) + forcecc(iat, ipol) + forcescc(iat, ipol); - if (PARAM.inp.use_paw) - { - force(iat, ipol) += forcepaw(iat, ipol); - } - if (vdw_solver != nullptr) // linpz and jiyy added vdw force, modified by zhengdy { force(iat, ipol) += force_vdw(iat, ipol); @@ -442,14 +286,7 @@ void Forces::cal_force(UnitCell& ucell, ModuleIO::print_force(GlobalV::ofs_running, ucell, "NLCC FORCE (eV/Angstrom)", forcecc, false); ModuleIO::print_force(GlobalV::ofs_running, ucell, "ION FORCE (eV/Angstrom)", forceion, false); ModuleIO::print_force(GlobalV::ofs_running, ucell, "SCC FORCE (eV/Angstrom)", forcescc, false); - if (PARAM.inp.use_paw) - { - ModuleIO::print_force(GlobalV::ofs_running, - ucell, - "PAW FORCE (eV/Angstrom)", - forcepaw, - false); - } + if (PARAM.inp.efield_flag) { ModuleIO::print_force(GlobalV::ofs_running, ucell, "EFIELD FORCE (eV/Angstrom)", force_e, false); @@ -487,13 +324,13 @@ void Forces::cal_force(UnitCell& ucell, template void Forces::cal_force_loc(const UnitCell& ucell, ModuleBase::matrix& forcelc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* const rho_basis, const ModuleBase::matrix& vloc, const Charge* const chr) { ModuleBase::TITLE("Forces", "cal_force_loc"); ModuleBase::timer::tick("Forces", "cal_force_loc"); - + this->device = base_device::get_device_type(this->ctx); std::complex* aux = new std::complex[rho_basis->nmaxgr]; // now, in all pools , the charge are the same, // so, the force calculated by each pool is equal. @@ -532,110 +369,100 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); - // sincos op for G space - - - // data preparation - std::vector tau_flat(this->nat * 3); - std::vector gcar_flat(rho_basis->npw * 3); - - - for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - } - - for (int ig = 0; ig < rho_basis->npw; ig++) { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - // calculate vloc_factors for all atom types - std::vector vloc_per_type_host(this->nat * rho_basis->npw); - for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; - for (int ig = 0; ig < rho_basis->npw; ig++) { - vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); - } - } - - std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) { - aux_fptype[ig] = static_cast>(aux[ig]); - } - - FPTYPE* d_gcar = gcar_flat.data(); - FPTYPE* d_tau = tau_flat.data(); - FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); - std::complex* d_aux = aux_fptype.data(); - FPTYPE* d_force = nullptr; - std::vector force_host(this->nat * 3); - - if (this->device == base_device::GpuDevice) - { - d_gcar = nullptr; - d_tau = nullptr; - d_vloc_per_type = nullptr; - d_aux = nullptr; - - resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); - resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw); - resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); - resmem_var_op()(this->ctx, d_force, this->nat * 3); - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - - base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); - } - else + if(this->device == base_device::GpuDevice) { - d_force = force_host.data(); - std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); - } - - const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); - - // call op for sincos calculation - hamilt::cal_force_loc_sincos_op()( - this->ctx, - this->nat, - rho_basis->npw, - this->nat, - d_gcar, - d_tau, - d_vloc_per_type, - d_aux, - scale_factor, - d_force - ); - - if (this->device == base_device::GpuDevice) - { - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); - - delmem_var_op()(this->ctx, d_gcar); - delmem_var_op()(this->ctx, d_tau); - delmem_var_op()(this->ctx, d_vloc_per_type); - delmem_complex_op()(this->ctx, d_aux); - delmem_var_op()(this->ctx, d_force); - } - - for (int iat = 0; iat < this->nat; iat++) { - forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); - forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); - forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); - } + std::vector tau_h; + std::vector gcar_h; + tau_h.resize(this->nat * 3); + for(int iat = 0; iat < this->nat; ++iat) + { + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + tau_h[iat * 3] = ucell.atoms[it].tau[ia].x; + tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y; + tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z; + } - // this->print(GlobalV: :ofs_running, "local forces", forcelc); + gcar_h.resize(rho_basis->npw * 3); + for(int ig = 0; ig < rho_basis->npw; ++ig) + { + gcar_h[ig * 3] = rho_basis->gcar[ig].x; + gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y; + gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z; + } + + int* iat2it_d = nullptr; + int* ig2gg_d = nullptr; + double* gcar_d = nullptr; + double* tau_d = nullptr; + std::complex* aux_d = nullptr; + double* forcelc_d = nullptr; + double* vloc_d = nullptr; + + resmem_int_op()(this->ctx,iat2it_d, this->nat); + resmem_int_op()(this->ctx,ig2gg_d, rho_basis->npw); + resmem_var_op()(this->ctx,gcar_d, rho_basis->npw * 3); + resmem_var_op()(this->ctx,tau_d, this->nat * 3); + resmem_complex_op()(this->ctx,aux_d, rho_basis->npw); + resmem_var_op()(this->ctx,forcelc_d, this->nat * 3); + resmem_var_op()(this->ctx,vloc_d, vloc.nr * vloc.nc); + + syncmem_int_h2d_op()( this->ctx, this->cpu_ctx,iat2it_d, ucell.iat2it, this->nat); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, ig2gg_d, rho_basis->ig2igg, rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux, rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forcelc_d, forcelc.c, this->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, vloc_d, vloc.c, vloc.nr * vloc.nc); + + hamilt::cal_force_loc_op()( + this->nat, + rho_basis->npw, + ucell.tpiba * ucell.omega, + iat2it_d, + ig2gg_d, + gcar_d, + tau_d, + aux_d, + vloc_d, + vloc.nc, + forcelc_d); + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcelc.c, forcelc_d, this->nat * 3); + + delmem_int_op()(this->ctx,iat2it_d); + delmem_int_op()(this->ctx,ig2gg_d); + delmem_var_op()(this->ctx,gcar_d); + delmem_var_op()(this->ctx,tau_d); + delmem_complex_op()(this->ctx,aux_d); + delmem_var_op()(this->ctx,forcelc_d); + delmem_var_op()(this->ctx,vloc_d); + } + else{ // calculate forces on CPU + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int iat = 0; iat < this->nat; ++iat) + { + // read `it` `ia` from the table + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + for (int ig = 0; ig < rho_basis->npw; ig++) + { + const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); + double sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + const double factor + = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; + forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; + forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; + } + forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); + forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); + forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + } + } + // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; ModuleBase::timer::tick("Forces", "cal_force_loc"); @@ -645,14 +472,14 @@ void Forces::cal_force_loc(const UnitCell& ucell, template void Forces::cal_force_ew(const UnitCell& ucell, ModuleBase::matrix& forceion, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* const rho_basis, const Structure_Factor* p_sf) { ModuleBase::TITLE("Forces", "cal_force_ew"); ModuleBase::timer::tick("Forces", "cal_force_ew"); - + this->device = base_device::get_device_type(this->ctx); double fact = 2.0; - std::complex* aux = new std::complex[rho_basis->npw]; + std::vector> aux(rho_basis->npw); /* blocking rho_basis->nrxnpwx for data locality. @@ -662,9 +489,7 @@ void Forces::cal_force_ew(const UnitCell& ucell, performance will be better when number of atom is quite huge */ const int block_ig = 1024; -#ifdef _OPENMP #pragma omp parallel for -#endif for (int igb = 0; igb < rho_basis->npw; igb += block_ig) { // calculate the actual task length of this block @@ -679,13 +504,6 @@ void Forces::cal_force_ew(const UnitCell& ucell, if (ucell.atoms[it].na != 0) { double dzv; - if (PARAM.inp.use_paw) - { - #ifdef USE_PAW - dzv = GlobalC::paw_cell.get_val(it); - #endif - } - else { dzv = ucell.atoms[it].ncpp.zv; } @@ -702,13 +520,6 @@ void Forces::cal_force_ew(const UnitCell& ucell, double charge = 0.0; for (int it = 0; it < ucell.ntype; it++) { - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - charge += ucell.atoms[it].na * GlobalC::paw_cell.get_val(it); -#endif - } - else { charge += ucell.atoms[it].na * ucell.atoms[it].ncpp.zv; // mohan modify 2007-11-7 } @@ -729,14 +540,16 @@ void Forces::cal_force_ew(const UnitCell& ucell, upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI) * erfc(sqrt(ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha)); } while (upperbound > 1.0e-6); - -#ifdef _OPENMP + const int ig0 = rho_basis->ig_gge0; #pragma omp parallel for -#endif for (int ig = 0; ig < rho_basis->npw; ig++) { + if (ig== ig0) + { + continue; // skip G=0 + } aux[ig] *= ModuleBase::libm::exp(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0) - / (rho_basis->gg[ig] * ucell.tpiba2); + / (rho_basis->gg[ig] * ucell.tpiba2); } // set pos rho_basis->ig_gge0 to zero @@ -744,144 +557,99 @@ void Forces::cal_force_ew(const UnitCell& ucell, { aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } - - // sincos op for cal_force_ew - - std::vector it_facts_host(this->nat); - std::vector tau_flat(this->nat * 3); - - // iterate over by lookup table - for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - - double zv; - if (PARAM.inp.use_paw) + if(this->device == base_device::GpuDevice) + { + std::vector tau_h(this->nat * 3); + std::vector gcar_h(rho_basis->npw * 3); + for(int iat = 0; iat < this->nat; ++iat) { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#endif + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + tau_h[iat * 3] = ucell.atoms[it].tau[ia].x; + tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y; + tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z; } - else + for(int ig = 0; ig < rho_basis->npw; ++ig) { - zv = ucell.atoms[it].ncpp.zv; + gcar_h[ig * 3] = rho_basis->gcar[ig].x; + gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y; + gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z; } + std::vector it_fact_h(ucell.ntype); + for(int it = 0; it < ucell.ntype; ++it) + { + it_fact_h[it] = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; + } + + int* iat2it_d = nullptr; + double* gcar_d = nullptr; + double* tau_d = nullptr; + double* it_fact_d = nullptr; + std::complex* aux_d = nullptr; + double* forceion_d = nullptr; + resmem_int_op()(this->ctx, iat2it_d, this->nat); + resmem_var_op()(this->ctx, gcar_d, rho_basis->npw * 3); + resmem_var_op()(this->ctx, tau_d, this->nat * 3); + resmem_var_op()(this->ctx, it_fact_d, ucell.ntype); + resmem_complex_op()(this->ctx, aux_d, rho_basis->npw); + resmem_var_op()(this->ctx, forceion_d, this->nat * 3); + + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, iat2it_d, ucell.iat2it, this->nat); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, it_fact_d, it_fact_h.data(), ucell.ntype); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux.data(), rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forceion_d, forceion.c, this->nat * 3); + + hamilt::cal_force_ew_op()( + this->nat, + rho_basis->npw, + rho_basis->ig_gge0, + iat2it_d, + gcar_d, + tau_d, + it_fact_d, + aux_d, + forceion_d); - it_facts_host[iat] = static_cast(zv * ModuleBase::e2 * ucell.tpiba * - ModuleBase::TWO_PI / ucell.omega * fact); - - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - } - - std::vector gcar_flat(rho_basis->npw * 3); - for (int ig = 0; ig < rho_basis->npw; ig++) { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) { - aux_fptype[ig] = static_cast>(aux[ig]); - } - - FPTYPE* d_gcar = gcar_flat.data(); - FPTYPE* d_tau = tau_flat.data(); - FPTYPE* d_it_facts = it_facts_host.data(); - std::complex* d_aux = aux_fptype.data(); - FPTYPE* d_force_g = nullptr; - std::vector force_g_host(this->nat * 3); - - if (this->device == base_device::GpuDevice) - { - d_gcar = nullptr; - d_tau = nullptr; - d_it_facts = nullptr; - d_aux = nullptr; - - resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); - resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_var_op()(this->ctx, d_it_facts, this->nat); - resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); - resmem_var_op()(this->ctx, d_force_g, this->nat * 3); - - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - - - base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); - } - else - { - d_force_g = force_g_host.data(); - std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); - } - - // call op for sincos calculation - hamilt::cal_force_ew_sincos_op()( - this->ctx, - this->nat, - rho_basis->npw, - rho_basis->ig_gge0, - d_gcar, - d_tau, - d_it_facts, - d_aux, - d_force_g - ); - - - if (this->device == base_device::GpuDevice) + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forceion.c, forceion_d, this->nat * 3); + delmem_int_op()(this->ctx,iat2it_d); + delmem_var_op()(this->ctx,gcar_d); + delmem_var_op()(this->ctx,tau_d); + delmem_var_op()(this->ctx,it_fact_d); + delmem_complex_op()(this->ctx,aux_d); + delmem_var_op()(this->ctx,forceion_d); + } else // calculate forces on CPU { - - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); - - - delmem_var_op()(this->ctx, d_gcar); - delmem_var_op()(this->ctx, d_tau); - delmem_var_op()(this->ctx, d_it_facts); - delmem_complex_op()(this->ctx, d_aux); - delmem_var_op()(this->ctx, d_force_g); - } - + #pragma omp parallel for + for(int iat = 0; iat < this->nat; ++iat) + { + const int it = ucell.iat2it[iat]; + const int ia = ucell.iat2ia[iat]; + double it_fact = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; - for (int iat = 0; iat < this->nat; iat++) { - forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); - forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); - forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); + for(int ig = 0; ig < rho_basis->npw; ++ig) + { + if(ig != rho_basis->ig_gge0) // skip G=0 + { + const ModuleBase::Vector3 gcar = rho_basis->gcar[ig]; + const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]); + double sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + forceion(iat, 0) += gcar[0] * sumnb; + forceion(iat, 1) += gcar[1] * sumnb; + forceion(iat, 2) += gcar[2] * sumnb; + } + } + forceion(iat, 0) *= it_fact; + forceion(iat, 1) *= it_fact; + forceion(iat, 2) *= it_fact; + } } - - -// calculate real space force -#ifdef _OPENMP -#pragma omp parallel + // means that the processor contains G=0 term. + #pragma omp parallel { - int num_threads = omp_get_num_threads(); - int thread_id = omp_get_thread_num(); -#else - int num_threads = 1; - int thread_id = 0; -#endif - - /* Here is task distribution for multi-thread, - 0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`. - To avoid syncing, we must calculate work range of each thread by our self - 1. Calculate the iat range [iat_beg, iat_end) by each thread - a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat) - 2. each thread iterate atoms form `iat_beg` to `iat_end-1` - */ - int iat_beg, iat_end; - int it_beg, ia_beg; - ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end); - iat_end = iat_beg + iat_end; - ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - - if (rho_basis->ig_gge0 >= 0) { double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); @@ -890,84 +658,55 @@ void Forces::cal_force_ew(const UnitCell& ucell, // output of rgen: the number of vectors in the sphere const int mxr = 200; // the maximum number of R vectors included in r - ModuleBase::Vector3* r = new ModuleBase::Vector3[mxr]; - double* r2 = new double[mxr]; - ModuleBase::GlobalFunc::ZEROS(r2, mxr); - int* irr = new int[mxr]; - ModuleBase::GlobalFunc::ZEROS(irr, mxr); + std::vector> r(mxr); + std::vector r2(mxr); + std::vector irr(mxr); // the square modulus of R_j-tau_s-tau_s' - int iat1 = iat_beg; - int T1 = it_beg; - int I1 = ia_beg; const double sqa = sqrt(alpha); const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI); // iterating atoms. - // do not need to sync threads because task range of each thread is isolated - while (iat1 < iat_end) + #pragma omp for + for(int iat1 = 0; iat1 < this->nat; iat1++) { - int iat2 = 0; // mohan fix bug 2011-06-07 - int I2 = 0; - int T2 = 0; - while (iat2 < this->nat) + int T1 = ucell.iat2it[iat1]; + int I1 = ucell.iat2ia[iat1]; + for(int iat2 = 0; iat2 < this->nat; iat2++) { - if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0) + int T2 = ucell.iat2it[iat2]; + int I2 = ucell.iat2ia[iat2]; + if (iat1 != iat2) { ModuleBase::Vector3 d_tau = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2]; - H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm); + H_Ewald_pw::rgen(d_tau, rmax, irr.data(), ucell.latvec, ucell.G, r.data(), r2.data(), nrm); for (int n = 0; n < nrm; n++) { const double rr = sqrt(r2[n]) * ucell.lat0; double factor; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2 - / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; -#endif - } - else { factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv - * ModuleBase::e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; + * ModuleBase::e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) + * ucell.lat0; } - forceion(iat1, 0) -= factor * r[n].x; forceion(iat1, 1) -= factor * r[n].y; forceion(iat1, 2) -= factor * r[n].z; } } - ++iat2; - ucell.step_iait(&I2, &T2); } // atom b - ++iat1; - ucell.step_iait(&I1, &T1); } // atom a - - delete[] r; - delete[] r2; - delete[] irr; } -#ifdef _OPENMP } -#endif - Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc); - // this->print(GlobalV::ofs_running, "ewald forces", forceion); ModuleBase::timer::tick("Forces", "cal_force_ew"); - delete[] aux; - return; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h index 695520dceb..370c6a2eb8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.h +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h @@ -37,7 +37,7 @@ class Forces void cal_force(UnitCell& ucell, ModuleBase::matrix& force, const elecstate::ElecState& elec, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, ModuleSymmetry::Symmetry* p_symm, Structure_Factor* p_sf, surchem& solvent, @@ -53,15 +53,15 @@ class Forces void cal_force_loc(const UnitCell& ucell, ModuleBase::matrix& forcelc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& vloc, const Charge* const chr); void cal_force_ew(const UnitCell& ucell, ModuleBase::matrix& forceion, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const Structure_Factor* p_sf); void cal_force_cc(ModuleBase::matrix& forcecc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const Charge* const chr, const bool* numeric, UnitCell& ucell_in); @@ -96,13 +96,13 @@ class Forces const UnitCell& ucell_in, const psi::Psi, Device>* psi_in = nullptr); void cal_force_scc(ModuleBase::matrix& forcescc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& v_current, const bool vnew_exist, const bool* numeric, const UnitCell& ucell_in); void cal_force_us(ModuleBase::matrix& forcenl, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const pseudopot_cell_vnl& ppcell_in, const elecstate::ElecState& elec, const UnitCell& ucell); @@ -113,7 +113,7 @@ class Forces const FPTYPE* rab, const FPTYPE* rhoc, FPTYPE* drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, int type, const UnitCell& ucell_in); // used in nonlinear core correction stress void deriv_drhoc_scc(const bool& numeric, @@ -122,7 +122,7 @@ class Forces const FPTYPE* rab, const FPTYPE* rhoc, FPTYPE* drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const UnitCell& ucell_in); // used in nonlinear core correction stress protected: Device* ctx = {}; diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp index 3346724deb..f3bc7469a3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp @@ -31,7 +31,7 @@ template void Forces::cal_force_cc(ModuleBase::matrix& forcecc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const Charge* const chr, const bool* numeric, UnitCell& ucell_in) @@ -239,7 +239,7 @@ void Forces::deriv_drhoc const FPTYPE *rab, const FPTYPE *rhoc, FPTYPE *drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, int type, const UnitCell& ucell_in ) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp index f670ad9b27..4b089d1d26 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp @@ -24,7 +24,7 @@ template void Forces::cal_force_scc(ModuleBase::matrix& forcescc, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& vnew, const bool vnew_exist, const bool* numeric, @@ -147,7 +147,7 @@ void Forces::deriv_drhoc_scc(const bool& numeric, const FPTYPE* rab, const FPTYPE* rhoc, FPTYPE* drhocg, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const UnitCell& ucell_in) { int igl0 = 0; double gx = 0; diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp index 4323ee6c4f..8879c89c35 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp @@ -15,7 +15,7 @@ // On output: the contribution is added to \(\text{forcenl}\). template void Forces::cal_force_us(ModuleBase::matrix& forcenl, - ModulePW::PW_Basis* rho_basis, + const ModulePW::PW_Basis* rho_basis, const pseudopot_cell_vnl& nlpp, const elecstate::ElecState& elec, const UnitCell& ucell) diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 5ff3ddf2b1..b8bca46302 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -1,6 +1,7 @@ #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" -// #include "module_psi/kernels/device.h" +// #include "source_psi/kernels/device.h" #include "module_base/module_device/types.h" +#include "module_base/constants.h" #include @@ -10,124 +11,18 @@ #include #define THREADS_PER_BLOCK 256 +#define FULL_MASK 0xffffffff +#define WARP_SIZE 32 namespace hamilt { -// CUDA kernels for sincos loops template -__global__ void cal_force_loc_sincos_kernel( - const int nat, - const int npw, - const int ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const thrust::complex* aux, - const FPTYPE scale_factor, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - sincos(phase, &sinp, &cosp); - - // Calculate force factor - const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; - const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); - atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); - atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); -} - -template -__global__ void cal_force_ew_sincos_kernel( - const int nat, - const int npw, - const int ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const thrust::complex* aux, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Skip G=0 term - if (ig == ig_gge0) continue; - - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - sincos(phase, &sinp, &cosp); - - // Calculate Ewald sum contribution (fixed sign error) - const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; +__forceinline__ +__device__ +void warp_reduce(FPTYPE & val) { + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_down_sync(FULL_MASK, val, offset); } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x); - atomicAdd(&force[iat * 3 + 1], local_force_y); - atomicAdd(&force[iat * 3 + 2], local_force_z); } template @@ -304,65 +199,6 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } -// GPU operators -template -void cal_force_loc_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) -{ - // Calculate optimal grid configuration for GPU load balancing - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_per_sm = 32; // Configurable per GPU architecture - const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_loc_sincos_kernel<<>>( - nat, npw, ntype, gcar, tau, vloc_per_type, - reinterpret_cast*>(aux), - scale_factor, force - ); - - cudaCheckOnDebug(); -} - -template -void cal_force_ew_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) -{ - // Calculate optimal grid configuration for GPU load balancing - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_per_sm = 32; // Configurable per GPU architecture - const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_ew_sincos_kernel<<>>( - nat, npw, ig_gge0, gcar, tau, it_facts, - reinterpret_cast*>(aux), force - ); - - cudaCheckOnDebug(); -} - template __global__ void cal_force_nl( const int ntype, @@ -781,6 +617,242 @@ void revertVkbValues( cudaCheckOnDebug(); } +template +__global__ void force_loc_kernel( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2gg_d, + const FPTYPE* gcar_d, + const FPTYPE* tau_d, + const thrust::complex* aux_d, + const FPTYPE* vloc_d, + const int vloc_nc, + FPTYPE* forcelc_d) +{ + const int iat = blockIdx.x; + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + if (iat >= nat) return; + const int it = iat2it[iat]; // get the type of atom + + // Initialize force components + FPTYPE force_x = 0.0; + FPTYPE force_y = 0.0; + FPTYPE force_z = 0.0; + + const auto tau_x = tau_d[iat * 3 + 0]; + const auto tau_y = tau_d[iat * 3 + 1]; + const auto tau_z = tau_d[iat * 3 + 2]; + + // Process all plane waves in chunks of blockDim.x + for (int ig = tid; ig < npw; ig += blockDim.x) { + const auto gcar_x = gcar_d[ig * 3 + 0]; + const auto gcar_y = gcar_d[ig * 3 + 1]; + const auto gcar_z = gcar_d[ig * 3 + 2]; + + // Calculate phase factor + const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x + + gcar_y * tau_y + + gcar_z * tau_z); + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Get vloc value + const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]]; + + // Calculate factor + const auto aux_val = aux_d[ig]; + const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real()); + + // Multiply by gcar components + force_x += gcar_x * factor; + force_y += gcar_y * factor; + force_z += gcar_z * factor; + } + + // Warp-level reduction + warp_reduce(force_x); + warp_reduce(force_y); + warp_reduce(force_z); + + // First thread in each warp writes to shared memory + __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps + __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE]; + __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE]; + + if (lane_id == 0) { + warp_sums_x[warp_id] = force_x; + warp_sums_y[warp_id] = force_y; + warp_sums_z[warp_id] = force_z; + } + + __syncthreads(); + + // Final reduction by first warp + if (warp_id == 0) { + FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0; + FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0; + FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0; + + warp_reduce(final_x); + warp_reduce(final_y); + warp_reduce(final_z); + + if (lane_id == 0) { + forcelc_d[iat * 3 + 0] = final_x * tpiba_omega; + forcelc_d[iat * 3 + 1] = final_y * tpiba_omega; + forcelc_d[iat * 3 + 2] = final_z * tpiba_omega; + } + } +} + +template +__global__ void force_ew_kernel( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar_d, + const FPTYPE* tau_d, + const FPTYPE* it_fact_d, + const thrust::complex* aux_d, + FPTYPE* forceion_d) +{ + const int iat = blockIdx.x; + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + if( iat >= nat) return; + const int it = iat2it[iat]; // get the type of atom + const FPTYPE it_fact_val = it_fact_d[it]; // Get it_fact value + + // Initialize force components + FPTYPE force_x = 0.0; + FPTYPE force_y = 0.0; + FPTYPE force_z = 0.0; + + const auto tau_x = tau_d[iat * 3 + 0]; + const auto tau_y = tau_d[iat * 3 + 1]; + const auto tau_z = tau_d[iat * 3 + 2]; + + for (int ig = tid; ig < npw; ig += blockDim.x) { + if(ig == ig_gge0) + { continue; } + const auto gcar_x = gcar_d[ig * 3 + 0]; + const auto gcar_y = gcar_d[ig * 3 + 1]; + const auto gcar_z = gcar_d[ig * 3 + 2]; + + // Calculate phase factor + const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x + + gcar_y * tau_y + + gcar_z * tau_z); + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force contribution + const FPTYPE sumnb = -cosp * aux_d[ig].imag() + sinp * aux_d[ig].real(); + + // Multiply by gcar components + force_x += gcar_x * sumnb; + force_y += gcar_y * sumnb; + force_z += gcar_z * sumnb; + } + + // Warp-level reduction + warp_reduce(force_x); + warp_reduce(force_y); + warp_reduce(force_z); + + // First thread in each warp writes to shared memory + __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps + __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE]; + __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE]; + + if (lane_id == 0) { + warp_sums_x[warp_id] = force_x; + warp_sums_y[warp_id] = force_y; + warp_sums_z[warp_id] = force_z; + } + + __syncthreads(); + + // Final reduction by first warp + if (warp_id == 0) { + FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0; + FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0; + FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0; + + warp_reduce(final_x); + warp_reduce(final_y); + warp_reduce(final_z); + + if (lane_id == 0) { + forceion_d[iat * 3 + 0] = final_x * it_fact_val; + forceion_d[iat * 3 + 1] = final_y * it_fact_val; + forceion_d[iat * 3 + 2] = final_z * it_fact_val; + } + } +} + +template +void cal_force_loc_op::operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nc, + FPTYPE* forcelc) +{ + force_loc_kernel + <<>>(nat, + npw, + tpiba_omega, + iat2it, + ig2igg, + gcar, + tau, + reinterpret_cast*>(aux), + vloc, + vloc_nc, + forcelc); // array of data + +} + +template +void cal_force_ew_op::operator()( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_fact, + const std::complex* aux, + FPTYPE* forceion) +{ + force_ew_kernel + <<>>(nat, + npw, + ig_gge0, + iat2it, + gcar, + tau, + it_fact, + reinterpret_cast*>(aux), + forceion); // array of data +} + + // for revertVkbValues functions instantiation template void revertVkbValues(const int *gcar_zero_ptrs, std::complex *vkb_ptr, const std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex coeff); // for saveVkbValues functions instantiation @@ -788,12 +860,11 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; +template struct cal_force_loc_op; +template struct cal_force_ew_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; - +template struct cal_force_loc_op; +template struct cal_force_ew_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 109321d6c3..367415d76b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -428,116 +428,10 @@ struct cal_force_nl_op } }; -// CPU implementation of local force sincos operator -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - - const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; - const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; - - local_force[0] += gcar[ig * 3 + 0] * factor; - local_force[1] += gcar[ig * 3 + 1] * factor; - local_force[2] += gcar[ig * 3 + 2] * factor; - } - - force[iat * 3 + 0] = local_force[0]; - force[iat * 3 + 1] = local_force[1]; - force[iat * 3 + 2] = local_force[2]; - } - } -}; - -// CPU implementation of Ewald force sincos operator -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - // Skip G=0 term - if (ig == ig_gge0) continue; - - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - - const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); - - local_force[0] += gcar[ig * 3 + 0] * factor; - local_force[1] += gcar[ig * 3 + 1] * factor; - local_force[2] += gcar[ig * 3 + 2] * factor; - } - - force[iat * 3 + 0] = local_force[0]; - force[iat * 3 + 1] = local_force[1]; - force[iat * 3 + 2] = local_force[2]; - } - } -}; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; -template struct cal_force_loc_sincos_op; -template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index acf490b278..ca2c6694cf 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -146,67 +146,39 @@ struct cal_force_nl_op const FPTYPE* lambda, const std::complex* becp, const std::complex* dbecp, - FPTYPE* force); + FPTYPE* force); }; template -struct cal_force_loc_sincos_op -{ - /// @brief Calculate local pseudopotential forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param ntype - number of atom types - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw] - /// @param aux - charge density in G-space [npw] - /// @param scale_factor - tpiba * omega - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc) {}; }; template -struct cal_force_ew_sincos_op -{ - /// @brief Calculate Ewald forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param ig_gge0 - index of G=0 vector (-1 if not present) - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param it_facts - precomputed it_fact for each atom [nat] - /// @param aux - structure factor related array [npw] - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); +struct cal_force_ew_op{ + void operator()( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_fact, + const std::complex* aux, + FPTYPE* forceion + ) {}; }; - #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template struct cal_vkb1_nl_op @@ -306,35 +278,6 @@ struct cal_force_nl_op FPTYPE* force); }; -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ntype, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* vloc_per_type, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); -}; - -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); -}; - /** * @brief revert the vkb values for force_nl calculation */ @@ -362,6 +305,36 @@ void saveVkbValues(const int* gcar_zero_ptrs, int ipol, int npwx); +template +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc); +}; + +template +struct cal_force_ew_op{ + void operator()( + const int nat, + const int npw, + const int ig_gge0, + const int* iat2it, + const FPTYPE* gcar, + const FPTYPE* tau, + const FPTYPE* it_fact, + const std::complex* aux, + FPTYPE* forceion + ); +}; #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hamilt -#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_PW_HAMILT_PWDFT_KERNELS_FORCE_OP_H \ No newline at end of file +#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H \ No newline at end of file